%%% -*-BibTeX-*-
%%% ====================================================================
%%%  BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "3.200",
%%%     date            = "31 August 2024",
%%%     time            = "15:25:45 MDT",
%%%     filename        = "multithreading.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "https://www.math.utah.edu/~beebe",
%%%     checksum        = "08899 60199 287616 2944248",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "multithreading; OpenMP; POSIX; pthreads;
%%%                        threads; UNIX; Win32; Windows NT",
%%%     license         = "public domain",
%%%     supported       = "no",
%%%     docstring       = "This bibliography covers publications about
%%%                        multithreaded programming.
%%%
%%%                        At version 3.200, the year coverage looked
%%%                        like this:
%%%
%%%                             1973 (   1)    1990 (  16)    2007 (  48)
%%%                             1974 (   0)    1991 (  42)    2008 (  59)
%%%                             1975 (   0)    1992 (  49)    2009 (  62)
%%%                             1976 (   0)    1993 (  47)    2010 (  58)
%%%                             1977 (   0)    1994 (  71)    2011 (  37)
%%%                             1978 (   0)    1995 (  90)    2012 (  70)
%%%                             1979 (   0)    1996 (  82)    2013 (  40)
%%%                             1980 (   1)    1997 (  86)    2014 (  56)
%%%                             1981 (   0)    1998 (  86)    2015 (  56)
%%%                             1982 (   0)    1999 (  70)    2016 (  56)
%%%                             1983 (   0)    2000 (  82)    2017 (  48)
%%%                             1984 (   0)    2001 (  59)    2018 (  40)
%%%                             1985 (   0)    2002 (  66)    2019 (  42)
%%%                             1986 (   1)    2003 (  60)    2020 (  19)
%%%                             1987 (   2)    2004 (  37)    2021 (  18)
%%%                             1988 (   2)    2005 (  31)    2022 (  17)
%%%                             1989 (  15)    2006 (  50)
%%%
%%%                             Article:       1442
%%%                             Book:            53
%%%                             InBook:           1
%%%                             InCollection:     1
%%%                             InProceedings:  103
%%%                             Manual:           4
%%%                             MastersThesis:   37
%%%                             Misc:             1
%%%                             PhdThesis:       22
%%%                             Proceedings:     69
%%%                             TechReport:      39
%%%
%%%                             Total entries: 1772
%%%
%%%                        OpenMP is an ``Application Program Interface
%%%                        (API) [that] supports multi-platform shared-memory
%%%                        parallel programming in C/C++ and Fortran on
%%%                        all architectures, including Unix platforms
%%%                        and Windows NT platforms. Jointly defined by
%%%                        a group of major computer hardware and
%%%                        software vendors, OpenMP is a portable,
%%%                        scalable model that gives shared-memory
%%%                        parallel programmers a simple and flexible
%%%                        interface for developing parallel
%%%                        applications for platforms ranging from the
%%%                        desktop to the supercomputer.''  [from the
%%%                        OpenMP Web site].  For details, visit
%%%
%%%                            http://www.openmp.org/
%%%
%%%                        At least two vendors, Kuck & Associates (KAI),
%%%
%%%                            http://www.kai.com/parallel/openmp.html
%%%
%%%                        and the Portland Group, Inc. (PGI)
%%%
%%%                            http://www.pgroup.com/ppro_docs/pgiws_ug/pgi31u11.htm
%%%                            http://www.pgroup.com/ppro_docs/pgiws_ug/pgi31u12.htm
%%%
%%%                        provide extensive support of OpenMP.
%%%
%%%                        BibTeX citation tags are uniformly chosen as
%%%                        name:year:abbrev, where name is the family
%%%                        name of the first author or editor, year is a
%%%                        4-digit number, and abbrev is a 3-letter
%%%                        condensation of important title words.
%%%                        Citation tags were automatically generated by
%%%                        software developed for the BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted
%%%                        first by ascending year, and within each
%%%                        year, alphabetically by author or editor,
%%%                        and then, if necessary, by the 3-letter
%%%                        abbreviation at the end of the BibTeX
%%%                        citation tag, using the bibsort -byyear
%%%                        utility.  Year order has been chosen to
%%%                        make it easier to identify the most recent
%%%                        work.
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility.",
%%%  }
%%% ====================================================================
%%% The preamble defines the \pkg markup macro (typewriter font for
%%% package names) only if the citing document has not already defined
%%% it, so entries that use \pkg render correctly under any style or
%%% document preamble.
@Preamble{
    "\ifx \undefined \pkg       \def \pkg       #1{{{\tt #1}}} \fi"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:
%%% ack-nhfb is referenced by the acknowledgement field of the entries
%%% in this bibliography.
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|https://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Institution abbreviations:
%%% Each inst-NAME string is paired with a companion inst-NAME:adr
%%% string giving the institution's postal address, for use in the
%%% institution/address (or school/address) fields of entries.
@String{inst-CSC                = "Center for Scientific Computing,
                                  Department of Mathematics, University of
                                  Utah"}
@String{inst-CSC:adr            = "Salt Lake City, UT 84112, USA"}

@String{inst-CSU                = "Colorado State University"}
@String{inst-CSU:adr            = "Fort Collins, CO, USA"}

@String{inst-NLRC               = "NASA Langley Research Center"}
@String{inst-NLRC:adr           = "Hampton, VA, USA"}

@String{inst-SRC-IDA            = "Supercomputing Research Center: IDA"}
@String{inst-SRC-IDA:adr        = "Lanham, MD, USA"}

@String{inst-U-MARYLAND         = "University of Maryland"}
@String{inst-U-MARYLAND:adr     = "College Park, MD, USA"}

@String{inst-UCB-EECS           = "Department of Electrical Engineering and
                                  Computer Science, University of California,
                                  Berkeley"}
@String{inst-UCB-EECS:adr       = "Berkeley, CA, USA"}

@String{inst-UIUC-CSRD          = "University of Illinois at Urbana-Champaign,
                                  Center for Supercomputing Research and
                                  Development"}
@String{inst-UIUC-CSRD:adr      = "Urbana, IL 61801, USA"}

@String{inst-UT-CS              = "Department of Computer Science, University of
                                  Tennessee, Knoxville"}
@String{inst-UT-CS:adr          = "Knoxville, TN 37996, USA"}

%%% ====================================================================
%%% Journal abbreviations:
@String{j-ACM-COMM-COMP-ALGEBRA = "ACM Communications in Computer Algebra"}

@String{j-ACM-J-EXP-ALGORITHMICS = "ACM Journal of Experimental Algorithmics"}

@String{j-ACTA-INFO             = "Acta Informatica"}

@String{j-ADA-USER              = "Ada User"}

@String{j-ALGORITHMICA          = "Algorithmica"}

@String{j-ALGORITHMS-BASEL      = "Algorithms ({Basel})"}

@String{j-APPL-MATH-COMP        = "Applied Mathematics and Computation"}

@String{j-APPL-NUM-MATH         = "Applied Numerical Mathematics: Transactions
                                  of IMACS"}

@String{j-BYTE                  = "Byte Magazine"}

@String{j-C-PLUS-PLUS-REPORT    = "C++ Report"}

@String{j-CACM                  = "Communications of the ACM"}

@String{j-CCCUJ                 = "C/C++ Users Journal"}

@String{j-CCPE                  = "Concurrency and Computation: Prac\-tice and
                                  Experience"}

@String{j-CG-WORLD              = "Computer Graphics World"}

@String{j-COMP-ARCH-NEWS        = "ACM SIGARCH Computer Architecture News"}

@String{j-COMP-GRAPHICS         = "Computer Graphics"}

@String{j-COMP-J                = "The Computer Journal"}

@String{j-COMP-NET-AMSTERDAM    = "Computer Networks (Amsterdam, Netherlands:
                                  1999)"}

@String{j-COMP-PHYS-COMM        = "Computer Physics Communications"}

@String{j-COMP-SURV             = "ACM Computing Surveys"}

@String{j-COMP-SYS              = "Computing Systems"}

@String{j-COMPUT-MATH-APPL      = "Computers and Mathematics with Applications"}

@String{j-COMPUT-PHYS           = "Computers in Physics"}

@String{j-COMPUT-SCI-ENG        = "Computing in Science and Engineering"}

@String{j-COMPUTER              = "Computer"}

@String{j-COMPUTERS-AND-GRAPHICS = "Computers and Graphics"}

@String{j-COMPUTING             = "Computing"}

@String{j-CPE                   = "Concurrency: Prac\-tice and Experience"}

@String{j-CUJ                   = "C Users Journal"}

@String{j-DATAMATION            = "Datamation"}

@String{j-DDJ                   = "Dr. Dobb's Journal of Software Tools"}

@String{j-DEC-TECH-J            = "Digital Technical Journal"}

@String{j-DISTRIB-COMPUT        = "Distributed Computing"}

@String{j-ELECTRONIK            = "Elektronik"}

@String{j-FORM-ASP-COMPUT       = "Formal Aspects of Computing"}

@String{j-FUND-INFO             = "Fundamenta Informaticae"}

@String{j-FUT-GEN-COMP-SYS      = "Future Generation Computer Systems"}

@String{j-HIGHER-ORDER-SYMB-COMPUT = "Higher-Order and Symbolic Computation"}

@String{j-IBM-JRD               = "IBM Journal of Research and Development"}

@String{j-IBM-SYS-J             = "IBM Systems Journal"}

@String{j-IEEE-CGA              = "IEEE Computer Graphics and Applications"}

@String{j-IEEE-COMPUT-ARCHIT-LETT = "IEEE Computer Architecture Letters"}

@String{j-IEEE-COMPUT-SCI-ENG   = "IEEE Computational Science \& Engineering"}

@String{j-IEEE-CONCURR          = "IEEE Concurrency"}

@String{j-IEEE-DISTRIB-SYST-ONLINE = "IEEE Distributed Systems Online"}

@String{j-IEEE-INT-SYMP-HIGH-PERF-DIST-COMP-PROC = "IEEE International Symposium
                                  on High Performance Distributed Computing,
                                  Proceedings"}

@String{j-IEEE-MICRO            = "IEEE Micro"}

@String{j-IEEE-PAR-DIST-TECH    = "IEEE parallel and distributed technology:
                                  systems and applications"}

@String{j-IEEE-SOFTWARE         = "IEEE Software"}

@String{j-IEEE-SPECTRUM         = "IEEE Spectrum"}

@String{j-IEEE-TRANS-BIG-DATA   = "IEEE Transactions on Big Data"}

@String{j-IEEE-TRANS-COMPUT     = "IEEE Transactions on Computers"}

@String{j-IEEE-TRANS-PAR-DIST-SYS = "IEEE Transactions on Parallel and
                                  Distributed Systems"}

@String{j-IEEE-TRANS-SOFTW-ENG  = "IEEE Transactions on Software Engineering"}

@String{j-IEEE-TRANS-VIS-COMPUT-GRAPH = "IEEE Transactions on Visualization
                                   and Computer Graphics"}

@String{j-IJHPCA                = "The International Journal of High
                                  Performance Computing Applications"}

@String{j-IJQC                  = "International Journal of Quantum Chemistry"}

@String{j-INFO-PROC-LETT        = "Information Processing Letters"}

@String{j-INT-J-COMPUT-APPL     = "International Journal of Computer
                                   Applications"}

@String{j-INT-J-COMPUT-SYST-SCI-ENG = "International Journal of Computer
                                  Systems Science and Engineering"}

@String{j-INT-J-HIGH-SPEED-COMPUTING = "International Journal of High
                        Speed Computing (IJHSC)"}

@String{j-INT-J-PAR-EMER-DIST-SYS = "International Journal of Parallel,
                               Emergent and Distributed Systems: IJPEDS"}

@String{j-INT-J-PARALLEL-PROG   = "International Journal of Parallel
                                  Programming"}

@String{j-INT-J-SOFTW-TOOLS-TECHNOL-TRANSFER = "International Journal on
                                  Software Tools for Technology Transfer
                                  (STTT)"}

@String{j-INTEL-TECH-J          = "Intel Technology Journal"}

@String{j-J-ACM                 = "Journal of the ACM"}

@String{j-J-AUTOM-REASON        = "Journal of Automated Reasoning"}

@String{j-J-COMP-SECUR          = "Journal of Computer Security"}

@String{j-J-COMPUT-BIOL         = "Journal of Computational Biology"}

@String{j-J-COMPUT-CHEM         = "Journal of Computational Chemistry"}

@String{j-J-COMPUT-PHYS         = "Journal of Computational Physics"}

@String{j-J-GRAPHICS-TOOLS      = "Journal of Graphics Tools: JGT"}

@String{j-J-GRID-COMP            = "Journal of Grid Computing"}

@String{j-J-OPEN-SOURCE-SOFT    = "Journal of Open Source Software"}

@String{j-J-PAR-DIST-COMP       = "Journal of Parallel and Distributed
                                  Computing"}

@String{j-J-STAT-SOFT           = "Journal of Statistical Software"}

@String{j-J-SUPERCOMPUTING      = "The Journal of Supercomputing"}

@String{j-J-SYMBOLIC-COMP       = "Journal of Symbolic Computation"}

@String{j-J-SYST-SOFTW          = "The Journal of Systems and Software"}

@String{j-J-UCS                 = "J.UCS: Journal of Universal Computer
                                  Science"}

@String{j-JAVA-REPORT           = "{Java} Report: The Source for {Java}
                                  Development"}

@String{j-JAVAWORLD             = "JavaWorld: IDG's magazine for the Java
                                  community"}

@String{j-JERIC                 = "ACM Journal on Educational Resources in
                                  Computing (JERIC)"}

@String{j-JETC                  = "ACM Journal on Emerging Technologies in
                                  Computing Systems (JETC)"}

@String{j-LECT-NOTES-COMP-SCI   = "Lecture Notes in Computer Science"}

@String{j-LINUX-J               = "Linux Journal"}

@String{j-LOGIN                 = ";login: the USENIX Association newsletter"}

@String{j-MICROPROC-MICROSYS    = "Microprocessors and Microsystems"}

@String{j-MICROSOFT-SYS-J       = "Microsoft Systems Journal"}

@String{j-NORDIC-J-COMPUT       = "Nordic Journal of Computing"}

@String{j-NUMER-ALGORITHMS      = "Numerical Algorithms"}

@String{j-ONLINE-CDROM-REV      = "Online \& CDROM review: the international
                                  journal of online \& optical
                                  information systems"}

@String{j-OPEN-SYSTEMS-TODAY    = "Open Systems Today"}

@String{j-OPER-SYS-REV          = "Operating Systems Review"}

@String{j-PACMPL                = "Proceedings of the ACM on Programming
                                   Languages (PACMPL)"}

@String{j-PARALLEL-COMPUTING    = "Parallel Computing"}

@String{j-PARALLEL-DIST-COMP-PRACT = "Parallel and Distributed Computing
                                  Practices"}

@String{j-PARALLEL-PROCESS-LETT = "Parallel Processing Letters"}

@String{j-POMACS                = "Proceedings of the ACM on Measurement and
                                   Analysis of Computing Systems (POMACS)"}

@String{j-PROC-REAL-TIME-SYS-SYMP = "Proceedings --- Real-Time Systems
                                  Symposium"}

@String{j-PROC-VLDB-ENDOWMENT   = "Proceedings of the VLDB Endowment"}

@String{j-QUEUE                 = "ACM Queue: Tomorrow's Computing Today"}

@String{j-REAL-TIME-SYST        = "Real-Time Systems"}

@String{j-SCI-COMPUT-PROGRAM    = "Science of Computer Programming"}

@String{j-SCI-PROG              = "Scientific Programming"}

@String{j-SCPE                  = "Scalable Computing: Practice and Experience"}

@String{j-SIAM-J-COMPUT         = "SIAM Journal on Computing"}

@String{j-SIAM-J-SCI-COMP       = "SIAM Journal on Scientific Computing"}

@String{j-SIGADA-LETTERS        = "ACM SIGADA Ada Letters"}

@String{j-SIGAPP                = "ACM SIGAPP Applied Computing Review"}

@String{j-SIGCSE                = "SIGCSE Bulletin (ACM Special Interest Group
                                  on Computer Science Education)"}

@String{j-SIGMETRICS            = "ACM SIGMETRICS Performance Evaluation
                                  Review"}

@String{j-SIGMICRO              = "ACM SIGMICRO Newsletter"}

@String{j-SIGMOD                = "SIGMOD Record (ACM Special Interest Group
                                  on Management of Data)"}

@String{j-SIGPLAN               = "ACM SIG{\-}PLAN Notices"}

@String{j-SIGSOFT               = "ACM SIGSOFT Software Engineering Notes"}

@String{j-SPE                   = "Soft{\-}ware\emdash Prac{\-}tice and
                                  Experience"}

@String{j-SUPERCOMPUTER         = "Supercomputer"}

@String{j-TACO                  = "ACM Transactions on Architecture and Code
                                  Optimization"}

@String{j-TCBB                  = "IEEE/ACM Transactions on Computational
                                  Biology and Bioinformatics"}

@String{j-TECS                  = "ACM Transactions on Embedded Computing
                                  Systems"}

@String{j-THEOR-COMP-SCI        = "Theoretical Computer Science"}

@String{j-TISSEC                = "ACM Transactions on Information and System
                                  Security"}

@String{j-TIST                  = "ACM Transactions on Intelligent Systems and
                                  Technology (TIST)"}

@String{j-TKDD                  = "ACM Transactions on Knowledge
                                  Discovery from Data (TKDD)"}

@String{j-TOCHI                 = "ACM Transactions on Computer-Human
                                  Interaction"}

@String{j-TOCL                  = "ACM Transactions on Computational Logic"}

@String{j-TOCS                  = "ACM Transactions on Computer Systems"}

@String{j-TODAES                = "ACM Transactions on Design Automation of
                                  Electronic Systems"}

@String{j-TODS                  = "ACM Transactions on Database Systems"}

@String{j-TOG                   = "ACM Transactions on Graphics"}

@String{j-TOIS                  = "ACM Transactions on Information Systems"}

@String{j-TOMACS                = "ACM Transactions on Modeling and
                                  Computer Simulation"}

@String{j-TOMPECS               = "ACM Transactions on Modeling and Performance
                                  Evaluation of Computing Systems (TOMPECS)"}

@String{j-TOMS                  = "ACM Transactions on Mathematical Software"}

@String{j-TOPC                  = "ACM Transactions on Parallel Computing
                                  (TOPC)"}

@String{j-TOPLAS                = "ACM Transactions on Programming Languages
                                  and Systems"}

@String{j-TOSEM                 = "ACM Transactions on Software Engineering
                                  and Methodology"}

@String{j-UNIX-REVIEW           = "UNIX review"}

@String{j-UNIXWORLD-OPEN-COMP   = "UnixWorld's Open Computing"}

@String{j-VLDB-J                = "VLDB Journal: Very Large Data Bases"}

@String{j-WEB-TECHNIQUES        = "Web Techniques"}

@String{j-X-RESOURCE            = "{The X Resource}"}

%%% ====================================================================
%%% Publisher abbreviations:
@String{pub-ACM                 = "ACM Press"}
@String{pub-ACM:adr             = "New York, NY 10036, USA"}

@String{pub-AP                  = "Academic Press"}
@String{pub-AP:adr              = "New York, USA"}

@String{pub-APRESS              = "Apress"}
@String{pub-APRESS:adr          = "Berkeley, CA, USA"}

@String{pub-AW                  = "Ad{\-d}i{\-s}on-Wes{\-l}ey"}
@String{pub-AW:adr              = "Reading, MA, USA"}

@String{pub-AWDP                = "Ad{\-d}i{\-s}on-Wes{\-l}ey Developers
                                  Press"}
@String{pub-AWDP:adr            = "Reading, MA, USA"}

@String{pub-EYROLLES            = "Editions Eyrolles"}
@String{pub-EYROLLES:adr        = "Paris, France"}

@String{pub-HERMES              = "Hermes"}
@String{pub-HERMES:adr          = "Paris, France"}

@String{pub-IEEE                = "IEEE Computer Society Press"}
@String{pub-IEEE:adr            = "1109 Spring Street, Suite 300, Silver
                                  Spring, MD 20910, USA"}

@String{pub-KLUWER              = "Kluwer Academic Publishers"}
@String{pub-KLUWER:adr          = "Dordrecht, The Netherlands; Boston, MA,
                                  USA"}

@String{pub-LEARNED-INF         = "Learned Information"}
@String{pub-LEARNED-INF:adr     = "Medford, NJ, USA"}

@String{pub-MCGRAW-HILL         = "Mc{\-}Graw-Hill"}
@String{pub-MCGRAW-HILL:adr     = "New York, NY, USA"}

@String{pub-MIT                 = "MIT Press"}
@String{pub-MIT:adr             = "Cambridge, MA, USA"}

@String{pub-MORGAN-KAUFMANN     = "Morgan Kaufmann Publishers"}
@String{pub-MORGAN-KAUFMANN:adr = "Los Altos, CA 94022, USA"}
@String{pub-MORGAN-KAUFMANN:adrnew = "2929 Campus Drive, Suite 260, San
                                  Mateo, CA 94403, USA"}

@String{pub-NO-STARCH           = "No Starch Press"}
@String{pub-NO-STARCH:adr       = "San Francisco, CA, USA"}

@String{pub-NTIS                = "National Technical Information Service"}
@String{pub-NTIS:adr            = "Washington, DC, USA"}

@String{pub-ORA                 = "O'Reilly \& Associates, Inc."}
@String{pub-ORA:adr             = "981 Chestnut Street, Newton, MA 02164, USA"}

@String{pub-ORA-MEDIA           = "O'Reilly Media, Inc."}
@String{pub-ORA-MEDIA:adr       = "1005 Gravenstein Highway North, Sebastopol,
                                  CA 95472, USA"}

@String{pub-PACKT               = "Packt Publishing"}
@String{pub-PACKT:adr           = "Birmingham, UK"}

@String{pub-PH                  = "Pren{\-}tice-Hall"}
@String{pub-PH:adr              = "Englewood Cliffs, NJ 07632, USA"}

@String{pub-PHI                 = "Pren{\-}tice-Hall International"}
@String{pub-PHI:adr             = "Englewood Cliffs, NJ 07632, USA"}

@String{pub-PHPTR               = "P T R Pren{\-}tice-Hall"}
@String{pub-PHPTR:adr           = "Englewood Cliffs, NJ 07632, USA"}

@String{pub-SAMS                = "Howard W. Sams"}
@String{pub-SAMS:adr            = "Indianapolis, IN 46268, USA"}

@String{pub-SUN                 = "Sun Microsystems"}
@String{pub-SUN:adr             = "2550 Garcia Avenue, Mountain View, CA
                                  94043, USA"}

@String{pub-SUN-MICROSYSTEMS-PRESS = "Sun Microsystems Press"}
@String{pub-SUN-MICROSYSTEMS-PRESS:adr = "Palo Alto, CA, USA"}

@String{pub-SUNSOFT             = "SunSoft Press"}
@String{pub-SUNSOFT:adr         = "Mountainview, CA, USA"}

@String{pub-SV                  = "Spring{\-}er-Ver{\-}lag"}
@String{pub-SV:adr              = "Berlin, Germany~/ Heidelberg,
                                  Germany~/ London, UK~/ etc."}

@String{pub-UKUUG               = "UK Unix Users Group"}
@String{pub-UKUUG:adr           = "Buntingford, Herts, UK"}

@String{pub-USENIX              = "USENIX Association"}
@String{pub-USENIX:adr          = "Berkeley, CA, USA"}

@String{pub-WILEY               = "John Wiley and Sons"}
@String{pub-WILEY:adr           = "New York, NY, USA; London, UK; Sydney,
                                  Australia"}

@String{pub-WORLD-SCI           = "World Scientific Publishing Co."}
@String{pub-WORLD-SCI:adr       = "Singapore; Philadelphia, PA, USA; River
                                  Edge, NJ, USA"}

%%% ====================================================================
%%% Series abbreviations:
%%% NOTE(review): ser-LNCS duplicates the value of the journal string
%%% j-LECT-NOTES-COMP-SCI above; presumably entries reference LNCS
%%% either as a journal or as a book series --- confirm before merging.
@String{ser-LNCS                = "Lecture Notes in Computer Science"}

%%% ====================================================================
%%% Bibliography entries, sorted by year, and then by citation label,
%%% with ``bibsort -byyear'':
%%% Historical curiosity: despite the title and its place in a
%%% multithreading bibliography, this 1973 reprint concerns
%%% screw-thread standardization (see the remark field), not software
%%% threads.
@Article{Bettcher:1973:TSR,
  author =       "C. W. Bettcher",
  title =        "Thread standardization and relative cost",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "1",
  pages =        "9--9",
  month =        jan,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:28 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "This is a reprint of an article published in the {\em
                 Journal of the Society of Automotive Engineers}, Volume
                 XVIII, Number 2, p. 131, February 1926, about the cost
                 of the lack of standardization of screw threads. {\em
                 Computer Architecture News\/} Editor-in-Chief Caxton C.
                 Foster has added a hand-written note ``of course, there
                 is no message here for {\em us}.''",
}

%%% NOTE(review): CODEN = "????" is this bibliography's placeholder for
%%% an unknown CODEN; retained as-is.
@Article{Smith:1980:ASD,
  author =       "Connie Smith and J. C. Browne",
  title =        "Aspects of software design analysis: {Concurrency} and
                 blocking",
  journal =      j-SIGMETRICS,
  volume =       "9",
  number =       "2",
  pages =        "245--253",
  month =        "Summer",
  year =         "1980",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1009375.806169",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Thu Jun 26 10:54:53 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper extends previous work on development of a
                 methodology for the prediction of the performance of
                 computer software systems from design level
                 specifications and continuing through implementation.
                 The effects of synchronized behavior, such as results
                 from data reservation in multi-thread executions of
                 data base systems, and competition for host system
                 resources are incorporated. The previous methodology
                 uses hierarchical graphs to represent the execution of
                 software on some host computer system (or on some
                 abstract machine). Performance metrics such as response
                 time were obtained from analysis of these graphs
                 assuming execution of a single copy on a dedicated
                 host. This paper discusses the mapping of these
                 execution graphs upon queueing network models of the
                 host computing environment to yield performance metric
                 estimates for more complex and realistic processing
                 environments.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
}

%%% NOTE(review): the ``threaded code'' of this entry's keywords is the
%%% Forth implementation technique, not multithreading in the
%%% concurrency sense --- confirm the entry belongs in this
%%% bibliography's scope.
@Article{Jonak:1986:EFL,
  author =       "J. E. Jonak",
  title =        "Experience with a {FORTH}-like language",
  journal =      j-SIGPLAN,
  volume =       "21",
  number =       "2",
  pages =        "27--36",
  month =        feb,
  year =         "1986",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:14:55 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110 (Systems analysis and programming); C6140D
                 (High level languages)",
  corpsource =   "Sperry Network Syst., London, UK",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "FORTH; languages; programming; threaded code
                 language",
  pubcountry =   "USA A03",
  subject =      "D.3.2 Software, PROGRAMMING LANGUAGES, Language
                 Classifications, FORTH",
  treatment =    "P Practical",
}

%%% NOTE(review): reclassified from @Book to @TechReport --- this is
%%% Digital Equipment Corporation, Systems Research Center, Research
%%% Report 21 (28 September 1987).  The original entry stored the report
%%% number in a `volume' field and split the issuing institution across
%%% `publisher' and `series'; those fields are folded into the standard
%%% type/number/institution fields of @TechReport.  Citation key is
%%% unchanged.
@TechReport{McJones:1987:EUS,
  author =       "Paul R. McJones and Garret Frederick Swart",
  title =        "Evolving the {UNIX} system interface to support
                 multithreaded programs: The {Topaz Operating System}
                 programmer's manual",
  type =         "Research Report",
  number =       "21",
  institution =  "Digital Equipment Corporation, Systems Research
                 Center",
  address =      "Palo Alto, CA, USA",
  pages =        "100",
  day =          "28",
  month =        sep,
  year =         "1987",
  LCCN =         "QA76.76.O63M42 1987",
  bibdate =      "Fri Aug 7 08:29:38 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "computer networks; Computer networks; electronic data
                 processing -- distributed processing; Electronic data
                 processing -- Distributed processing; multithreaded
                 operating system interface -- Topaz operating;
                 Operating systems (Computers); operating systems
                 (computers); system; UNIX (computer file); UNIX
                 (Computer operating system)",
}

%%% Early (1987) article on multi-threaded input handling in computer
%%% graphics systems.
@Article{Tanner:1987:MTI,
  author =       "P. P. Tanner",
  title =        "Multi-thread input",
  journal =      j-COMP-GRAPHICS,
  volume =       "21",
  number =       "2",
  pages =        "142--145",
  month =        apr,
  year =         "1987",
  CODEN =        "CGRADI, CPGPBZ",
  ISSN =         "0097-8930 (print), 1558-4569 (electronic)",
  ISSN-L =       "0097-8930",
  bibdate =      "Tue Mar 12 17:52:38 MST 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Graphics",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J166",
}

@Article{Gilbert:1988:DVN,
  author =       "P. D. Gilbert",
  title =        "Development of the {VAX NOTES} system",
  journal =      j-DEC-TECH-J,
  volume =       "1",
  number =       "6",
  pages =        "117--124",
  month =        feb,
  year =         "1988",
  CODEN =        "DTJOEL",
  ISSN =         "0898-901X",
  bibdate =      "Thu Mar 20 18:15:43 MST 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes =   "C6110B (Software engineering techniques); C7410F
                 (Communications)",
  corpsource =   "Digital Equipment Corp., Hudson, MA, USA",
  fjournal =     "Digital Technical Journal",
  keywords =     "callable interface; communications tool; computer
                 conferencing; DEC; DEC computers; discussions; human
                 factors; human-factors engineering; interfaces; medium;
                 multiprogramming; multitasking; multithreaded server;
                 online; program; program testing; software engineering;
                 storage; technical writer; teleconferencing; testing;
                 user; user interface; VAX NOTES",
  treatment =    "P Practical",
}

@Article{Halstead:1988:MMP,
  author =       "R. H. {Halstead, Jr.} and T. Fujita",
  title =        "{MASA}: a multithreaded processor architecture for
                 parallel symbolic computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "2",
  pages =        "443--451",
  month =        may,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@TechReport{Agarwal:1989:PTM,
  author =       "Anant Agarwal",
  title =        "Performance tradeoffs in multithreaded processors",
  number =       "89-566",
  institution =  "Massachusetts Institute of Technology, Microsystems
                 Program Office",
  address =      "Cambridge, MA, USA",
  pages =        "30",
  year =         "1989",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "VLSI memo",
  acknowledgement = ack-nhfb,
}

@Article{Amamiya:1989:DFC,
  author =       "M. Amamiya",
  title =        "Data Flow Computing and Parallel Reduction Machine",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "4",
  number =       "??",
  pages =        "53--67",
  month =        "????",
  year =         "1989",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Wed Feb 27 18:37:19 2002",
  bibsource =    "ftp://ftp.ira.uka.de/bibliography/Compiler/Functional.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
  keywords =     "functional cell token flow multi-thread control flow
                 architecture",
}

@TechReport{Birrell:1989:IPT,
  author =       "Andrew D. Birrell",
  title =        "An introduction to programming with threads",
  type =         "SRC reports",
  number =       "35",
  institution =  "Digital Systems Research Center",
  address =      "Palo Alto, CA, USA",
  pages =        "35",
  day =          "6",
  month =        jan,
  year =         "1989",
  LCCN =         "QA76.6.B5729 1989",
  bibdate =      "Fri May 10 12:18:17 MDT 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "parallel programming (computer science);
                 synchronization",
}

@Article{Briot:1989:OAS,
  author =       "Jean-Pierre Briot",
  title =        "From objects to actors: study of a limited symbiosis
                 in {Smalltalk-80}",
  journal =      j-SIGPLAN,
  volume =       "24",
  number =       "4",
  pages =        "69--72",
  month =        apr,
  year =         "1989",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:15:37 MST 2003",
  bibsource =    "Compendex database; http://portal.acm.org/;
                 http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/plan/67386/p69-briot/",
  abstract =     "In this paper we describe an implementation of actors
                 in Smalltalk-80, named Actalk. This attempt is designed
                 as a minimal extension preserving the Smalltalk-80
                 language. Actors are active and autonomous objects, as
                 opposed to standard passive Smalltalk-80 objects. An
                 actor is built from a standard Smalltalk-80 object by
                 associating a process with it and by serializing the
                 messages it could receive into a queue. We will study
                 the cohabitation and synergy between the two models of
                 computations: transfer of active messages (message and
                 thread of activity) between passive objects, and
                 exchange of passive messages between active objects. We
                 propose a sketch of methodology in order to have a safe
                 combination between these two programming paradigms.",
  acknowledgement = ack-nhfb,
  affiliation =  "Univ Paris VI",
  affiliationaddress = "Paris, Fr",
  classification = "723",
  conference =   "Proceedings of the ACM SIGPLAN Workshop on
                 Object-Based Concurrent Programming",
  confname =     "Proceedings of the ACM SIGPLAN workshop on
                 Object-based concurrent programming, September 26--27
                 1988, San Diego, CA",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  journalabr =   "SIGPLAN Not",
  keywords =     "Actor Based Systems; Computer Metatheory--Programming
                 Theory; Computer Programming Languages; Concurrent
                 Programming; Design; design; languages; Object-Based
                 Programming; Smalltalk-80",
  meetingaddress = "San Diego, CA, USA",
  meetingdate =  "Sep 26--27 1988",
  meetingdate2 = "09/26--27/88",
  subject =      "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language
                 Classifications, Smalltalk-80. {\bf D.1.3} Software,
                 PROGRAMMING TECHNIQUES, Concurrent Programming. {\bf
                 D.4.1} Software, OPERATING SYSTEMS, Process Management,
                 Concurrency.",
}

@Article{Caromel:1989:GMC,
  author =       "Denis Caromel",
  title =        "A general model for concurrent and distributed
                 object-oriented programming",
  journal =      j-SIGPLAN,
  volume =       "24",
  number =       "4",
  pages =        "102--104",
  month =        apr,
  year =         "1989",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:15:37 MST 2003",
  bibsource =    "Compendex database; http://portal.acm.org/;
                 http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/plan/67386/p102-caromel/",
  abstract =     "This paper presents a general model supporting
                 object-oriented programming in concurrent as well as
                 distributed environments. The model combines the
                 advantages of remote procedure calls with those of
                 message passing. It relies on the following concepts:
                 All objects are not active but the active entities are
                 objects, Asynchronous Message Passing with Data-driven
                 synchronization, and Service mechanism allowing an
                 explicit thread of control.",
  acknowledgement = ack-nhfb,
  affiliation =  "CNRS",
  affiliationaddress = "Vandoeuvres-les-Nancy, Fr",
  classification = "722; 723",
  conference =   "Proceedings of the ACM SIGPLAN Workshop on
                 Object-Based Concurrent Programming",
  confname =     "Proceedings of the ACM SIGPLAN workshop on
                 Object-based concurrent programming, September 26--27
                 1988, San Diego, CA",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  journalabr =   "SIGPLAN Not",
  keywords =     "Computer Systems Programming; Computer Systems,
                 Digital--Distributed; Concurrent Programming; design;
                 Multiprocessing Programs; Object-Oriented Programming",
  meetingaddress = "San Diego, CA, USA",
  meetingdate =  "Sep 26--27 1988",
  meetingdate2 = "09/26--27/88",
  subject =      "{\bf D.1.3} Software, PROGRAMMING TECHNIQUES,
                 Concurrent Programming. {\bf D.1.m} Software,
                 PROGRAMMING TECHNIQUES, Miscellaneous. {\bf D.4.7}
                 Software, OPERATING SYSTEMS, Organization and Design,
                 Distributed systems. {\bf D.4.1} Software, OPERATING
                 SYSTEMS, Process Management, Concurrency.",
}

@MastersThesis{CarrerasVaquer:1989:APE,
  author =       "Carlos {Carreras Vaquer}",
  title =        "Architecture and performance evaluation of a
                 multithreaded cache design",
  type =         "Thesis ({M.S. in Engineering})",
  school =       "University of Texas at Austin",
  address =      "Austin, TX, USA",
  pages =        "xii + 108",
  year =         "1989",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Cache memory; Computer architecture; Computer storage
                 devices; Integrated circuits -- Very large scale
                 integration; Microprocessors",
}

@TechReport{Caswell:1989:IMD,
  author =       "Deborah L. Caswell and David L. Black",
  title =        "Implementing a {Mach} debugger for multithreaded
                 applications",
  type =         "Research paper",
  number =       "CMU-CS-89-154",
  institution =  "Carnegie Mellon University, Computer Science Dept.",
  address =      "Pittsburgh, PA, USA",
  pages =        "13",
  month =        nov,
  year =         "1989",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "To appear in the Conference Proceedings of Winter 1990
                 USENIX Technical Conference and Exhibition, Washington,
                 DC, January, 1990.",
  abstract =     "Multiple threads of control add new challenges to the
                 task of application debugging, and require the
                 development of new debuggers to meet these challenges.
                 This paper describes the design and implementation of
                 modifications to an existing debugger (gdb) for
                 debugging multithreaded applications under the Mach
                 operating system. It also describes the operating
                 system facilities that support it. Although certain
                 implementation details are specific to Mach, the
                 underlying design principles are applicable to other
                 systems that support threads in a Unix compatible
                 environment.",
  acknowledgement = ack-nhfb,
  annote =       "Supported by the Space and Naval Warfare Systems
                 Command.",
  keywords =     "Debugging in computer science -- Computer programs",
}

@InProceedings{Korty:1989:SLL,
  author =       "Joseph A. Korty",
  title =        "{Sema}: a {Lint-like} Tool for Analyzing Semaphore
                 Usage in a Multithreaded {UNIX} Kernel",
  crossref =     "USENIX:1989:PWU",
  institution =  "MODCOMP",
  pages =        "113--123",
  month =        "Winter",
  year =         "1989",
  bibdate =      "Wed Aug 13 10:48:45 MDT 1997",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/usenix1980.bib;
                 http://www.usenix.org/cgi-bin/sortbib.pl?-sA",
  acknowledgement = ack-nhfb,
  affiliation =  "MODCOMP",
}

@Article{Massalin:1989:TIO,
  author =       "H. Massalin and C. Pu",
  title =        "Threads and input\slash output in the synthesis
                 kernel",
  journal =      j-OPER-SYS-REV,
  volume =       "23",
  number =       "5",
  pages =        "191--201",
  month =        dec,
  year =         "1989",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 12:47:29 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@InProceedings{McJones:1989:EUS,
  author =       "Paul R. McJones and Garret F. Swart",
  title =        "Evolving the {UNIX} System Interface to Support
                 Multithreaded Programs",
  crossref =     "USENIX:1989:PWU",
  pages =        "393--404",
  month =        "Winter",
  year =         "1989",
  bibdate =      "Fri Oct 18 07:24:24 MDT 1996",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "DEC Systems Research Center",
}

@MastersThesis{Plyler:1989:AMC,
  author =       "Kevin Brian Plyler",
  title =        "Adding multithreaded capabilities to the process
                 manager of the {BIGSAM} distributed operating system",
  type =         "Thesis ({M.S.})",
  school =       "Arizona State University",
  address =      "Tempe, AZ, USA",
  pages =        "x + 105 + 2",
  year =         "1989",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Electronic data processing -- Distributed processing;
                 Multiprocessors; Operating systems (Computers)",
}

@InProceedings{Rashid:1989:MFO,
  author =       "R. Rashid and R. Baron and A. Forin and D. Golub and
                 M. Jones and D. Orr and R. Sanzi",
  title =        "{Mach}: a foundation for open systems (operating
                 systems)",
  crossref =     "IEEE:1989:WOS",
  pages =        "109--113",
  year =         "1989",
  bibdate =      "Sat Sep 28 20:21:01 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/mach.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Sch. of Comput. Sci., Carnegie-Mellon Univ.,
                 Pittsburgh, PA, USA",
  classification = "C6110B (Software engineering techniques); C6150J
                 (Operating systems)",
  keywords =     "Hardware resources; Mach kernel; Multiserver Unix;
                 Multithreaded Unix server; Operating system; OS
                 emulation; Software development",
  thesaurus =    "File servers; Open systems; Operating systems
                 [computers]; Software engineering; Unix",
}

@Article{Schonberg:1989:FDA,
  author =       "Edith Schonberg",
  title =        "On-the-fly detection of access anomalies",
  journal =      j-SIGPLAN,
  volume =       "24",
  number =       "7",
  pages =        "285--297",
  month =        jul,
  year =         "1989",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:15:41 MST 2003",
  bibsource =    "http://www.acm.org/pubs/contents/proceedings/pldi/73141/index.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/pldi/73141/p285-schonberg/",
  abstract =     "Access anomalies are a common class of bugs in
                 shared-memory parallel programs. An access anomaly
                 occurs when two concurrent execution threads both write
                 (or one thread reads and the other writes) the same
                 shared memory location without coordination. Approaches
                 to the detection of access anomalies include static
                 analysis, post-mortem trace analysis, and on-the-fly
                 monitoring. A general on-the-fly algorithm for access
                 anomaly detection is presented, which can be applied to
                 programs with both nested fork-join and synchronization
                 operations. The advantage of on-the-fly detection over
                 post-mortem analysis is that the amount of storage used
                 can be greatly reduced by data compression techniques
                 and by discarding information as soon as it becomes
                 obsolete. In the algorithm presented, the amount of
                 storage required at any time depends only on the number
                 V of shared variables being monitored and the number N
                 of threads, not on the number of synchronizations. Data
                 compression is achieved by the use of two techniques
                 called merging and subtraction. Upper bounds on storage
                 are shown to be $V \times N^2$ for merging and $V
                 \times N$ for subtraction.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "New York, NY, USA",
  annote =       "Published as part of the Proceedings of PLDI'89.",
  classification = "722; 723",
  conference =   "Proceedings of the SIGPLAN '89 Conference on
                 Programming Language Design and Implementation",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  journalabr =   "SIGPLAN Not",
  keywords =     "Access Anomalies; algorithms; Computer Operating
                 Systems; Computer Programming Languages--Design;
                 Computer Systems, Digital--Parallel Processing;
                 languages; Parallel Programs; Program Processors",
  meetingaddress = "Portland, OR, USA",
  meetingdate =  "Jun 21--23 1989",
  meetingdate2 = "06/21--23/89",
  sponsor =      "ACM, Special Interest Group on Programming Languages,
                 New York, NY, USA",
  subject =      "{\bf D.1.3} Software, PROGRAMMING TECHNIQUES,
                 Concurrent Programming. {\bf D.3.2} Software,
                 PROGRAMMING LANGUAGES, Language Classifications, Ada.
                 {\bf D.2.2} Software, SOFTWARE ENGINEERING, Design
                 Tools and Techniques, Flow charts.",
}

@InProceedings{Caswell:1990:IMD,
  author =       "D. Caswell and D. Black",
  title =        "Implementing a {Mach} debugger for multithreaded
                 applications",
  crossref =     "Anonymous:1990:PWU",
  pages =        "25--39",
  year =         "1990",
  bibdate =      "Sat Sep 28 20:03:34 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Hewlett Packard Labs., Palo Alto, CA, USA",
  classification = "C6150G (Diagnostic, testing, debugging and
                 evaluating systems); C6150J (Operating systems)",
  keywords =     "Application debugging; Mach debugger; Mach operating
                 system; Multithreaded applications; Operating system
                 facilities; Underlying design principles; Unix
                 compatible environment",
  thesaurus =    "Operating systems [computers]; Program debugging;
                 Unix",
}

@Article{Colvin:1990:CTS,
  author =       "Gregory Colvin",
  title =        "{CUG306} Thread and Synapsys",
  journal =      j-CUJ,
  volume =       "8",
  type =         "CUG New Release",
  number =       "3",
  pages =        "131--??",
  month =        mar,
  year =         "1990",
  ISSN =         "0898-9788",
  bibdate =      "Fri Aug 30 16:52:23 MDT 1996",
  bibsource =    "http://www.cuj.com/cbklist.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "C Users Journal",
}

@Article{Colvin:1990:MLT,
  author =       "Gregory Colvin",
  title =        "Multitasking With Lightweight Threads",
  journal =      j-CUJ,
  volume =       "8",
  number =       "3",
  pages =        "55--??",
  month =        mar,
  year =         "1990",
  ISSN =         "0898-9788",
  bibdate =      "Fri Aug 30 16:52:23 MDT 1996",
  bibsource =    "http://www.cuj.com/cbklist.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "C Users Journal",
}

@Article{Eggers:1990:TEI,
  author =       "S. J. Eggers and David R. Keppel and Eric J. Koldinger
                 and Henry M. Levy",
  title =        "Techniques for efficient inline tracing on a
                 shared-memory multiprocessor",
  journal =      j-SIGMETRICS,
  volume =       "18",
  number =       "1",
  pages =        "37--47",
  month =        may,
  year =         "1990",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/98457.98501",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Thu Jun 26 11:09:08 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "While much current research concerns multiprocessor
                 design, few traces of parallel programs are available
                 for analyzing the effect of design trade-offs. Existing
                 trace collection methods have serious drawbacks:
                 trap-driven methods often slow down program execution
                 by more than 1000 times, significantly perturbing
                 program behavior; microcode modification is faster, but
                 the technique is neither general nor portable. This
                 paper describes a new tool, called MPTRACE, for
                 collecting traces of multithreaded parallel programs
                 executing on shared-memory multiprocessors. MPTRACE
                 requires no hardware or microcode modification; it
                 collects complete program traces; it is portable; and
                 it reduces execution-time dilation to less than a
                 factor 3. MPTRACE is based on inline tracing, in which
                 a program is automatically modified to produce trace
                 information as it executes. We show how the use of
                 compiler flow analysis techniques can reduce the amount
                 of data collected and therefore the runtime dilation of
                 the traced program. We also discuss problematic issues
                 concerning buffering and writing of trace data on a
                 multiprocessor.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
}

@Article{Faust:1990:POO,
  author =       "John E. Faust and Henry M. Levy",
  title =        "The performance of an object-oriented threads
                 package",
  journal =      j-SIGPLAN,
  volume =       "25",
  number =       "10",
  pages =        "278--288",
  month =        oct,
  year =         "1990",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:15:57 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Gonzalez:1990:MSC,
  author =       "Dean W. Gonzalez",
  title =        "Multitasking Software Components",
  journal =      j-SIGADA-LETTERS,
  volume =       "10",
  number =       "1",
  pages =        "92--96",
  month =        jan # "\slash " # feb,
  year =         "1990",
  CODEN =        "AALEE5",
  ISSN =         "1094-3641 (print), 1557-9476 (electronic)",
  ISSN-L =       "1094-3641",
  bibdate =      "Thu Sep 28 07:33:23 MDT 2000",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 http://www.adahome.com/Resources/Bibliography/articles.ref;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes =   "C6110B (Software engineering techniques); C6120 (File
                 organisation)",
  fjournal =     "ACM SIGADA Ada Letters",
  keywords =     "Ada; Ada parameter passing semantics; concurrency,
                 tasking, reuse; concurrent forms; data integrity; data
                 structure manipulation routines; data structures;
                 multiple; parallel programming; reusability; semaphore
                 calls; software; threads of control",
  treatment =    "P Practical",
}

@InProceedings{Hansen:1990:EPA,
  author =       "G. J. Hansen and C. A. Linthicum and G. Brooks",
  title =        "Experience with a performance analyzer for
                 multithreaded applications",
  crossref =     "IEEE:1990:PSN",
  pages =        "124--131",
  year =         "1990",
  bibdate =      "Wed Apr 15 18:34:48 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C5470 (Performance evaluation and testing); C6150E
                 (General utility programs); C6150G (Diagnostic,
                 testing, debugging and evaluating systems)",
  corpsource =   "CONVEX Comput. Corp., Richardson, TX, USA",
  keywords =     "CONVEX C200 series computers; Convex OS V8.0; CONVEX
                 performance analyzer, CX/sub pa/; loops;
                 multiprocessing systems; multithreaded applications;
                 operating system facilities; parallel code monitoring;
                 performance evaluation; profiling data; profiling
                 information; time-sharing environment; time-sharing
                 systems; Unix; UNIX based operating system",
  sponsororg =   "IEEE; ACM; Lawrence Livermore Nat. Lab.; Los Alamos
                 Nat. Lab.; NASA Ames Res. Center; Nat. Center Atmos.
                 Res.; NSF; SIAM; Supercomput. Res. Center",
  treatment =    "P Practical; X Experimental",
}

@Article{Miastkowski:1990:PGG,
  author =       "Stan Miastkowski",
  title =        "{PC GUIs} Go Head to Head",
  journal =      j-BYTE,
  volume =       "15",
  number =       "11",
  pages =        "82--87",
  month =        "Fall",
  year =         "1990",
  CODEN =        "BYTEDJ",
  ISSN =         "0360-5280 (print), 1082-7838 (electronic)",
  ISSN-L =       "0360-5280",
  bibdate =      "Thu Sep 12 18:39:30 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6130B (Graphics techniques); C6150J (Operating
                 systems); C6180 (User interfaces)",
  fjournal =     "BYTE Magazine",
  keywords =     "Graphical DOS shell; Multithreading operating system;
                 OS/2; PC GUIs; User interface differences; Windows
                 3.0",
  thesaurus =    "Computer graphics; Operating systems [computers]; User
                 interfaces",
}

@Article{Nordstrom:1990:TL,
  author =       "D. J. Nordstrom",
  title =        "Threading {Lisp}",
  journal =      j-SIGPLAN,
  volume =       "25",
  number =       "2",
  pages =        "17--24",
  month =        feb,
  year =         "1990",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:15:50 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@TechReport{Polychronopoulos:1990:ASC,
  author =       "C. D. (Constantine D.) Polychronopoulos",
  title =        "Auto scheduling: control flow and data flow come
                 together",
  type =         "Technical Report",
  number =       "CSRD 1058",
  institution =  inst-UIUC-CSRD,
  address =      inst-UIUC-CSRD:adr,
  pages =        "28",
  month =        dec,
  year =         "1990",
  bibdate =      "Fri Aug 30 08:01:51 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper presents a framework we term
                 auto-scheduling, which brings together the control flow
                 and data flow models by combining most of the
                 advantages and excluding the major disadvantages of the
                 two familiar models. Auto-scheduling can be viewed
                 either as an abstract architectural model or as a
                 parallel program compilation framework. While in
                 ordinary environments parallel task creation and
                 scheduling is done by the operating system, or at best
                 the run-time library, in auto-scheduling task creation
                 and scheduling is performed by the user program itself,
                 making parallel processing affordable at
                 fine-granularity levels. Under auto-scheduling the
                 compiler does not only generate object code, but it
                 `lends' its knowledge about a program to the parallel
                 instruction threads of that program, allowing them to
                 manage, activate, and schedule themselves at run-time,
                 without the need of an external monitor. This is done
                 by means of special drive-code injected by the compiler
                 to each schedulable unit of a program (task, thread,
                 etc). We argue that auto-scheduling offers an optimal
                 approach for exploiting parallelism on real parallel
                 computer systems.",
  acknowledgement = ack-nhfb,
  annote =       "Title on P. 1: Auto-scheduling: control flow and data
                 flow come together. Supported in part by the National
                 Science Foundation. Supported in part by the U.S.
                 Department of Energy. Supported in part by Digital
                 Equipment Corporation.",
  keywords =     "Parallel processing (Electronic computers); Scheduling
                 (Management)",
}

@InProceedings{Presotto:1990:MSP,
  author =       "D. L. Presotto",
  booktitle =    "UKUUG. UNIX - The Legend Evolves. Proceedings of the
                 Summer 1990 UKUUG Conference",
  title =        "Multiprocessor Streams for {Plan 9}",
  publisher =    pub-UKUUG,
  address =      pub-UKUUG:adr,
  pages =        "11--19 (of xi + 260)",
  month =        "????",
  year =         "1990",
  ISBN =         "0-9513181-7-9",
  ISBN-13 =      "978-0-9513181-7-1",
  LCCN =         "????",
  bibdate =      "Sat Mar 22 15:10:17 MST 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes =   "C6150J (Operating systems)",
  conflocation = "London, UK; 9-13 July 1990",
  corpsource =   "AT\&T Bell Lab., Murray Hill, NJ, USA",
  keywords =     "abstraction; input-output programs; kernel;
                 multi-threaded; multiprocessing programs;
                 multiprocessor; Plan 9 kernel; Streams; system call
                 interface; Unix",
  treatment =    "P Practical",
}

@TechReport{Saavedra-Barrera:1990:AMA,
  author =       "Rafael H. Saavedra-Barrera and David E. Culler and
                  Thorsten von Eicken",
  title =        "Analysis of multithreaded architectures for parallel
                 computing",
  type =         "Report",
  number =       "UCB/CSD 90/569",
  institution =  "University of California, Berkeley, Computer Science
                 Division",
  address =      "Berkeley, CA, USA",
  pages =        "10",
  month =        apr,
  year =         "1990",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "To appear in the 2nd Annual ACM Symposium on Parallel
                 Algorithms and Architectures, Crete, Greece, July
                 1990.",
  abstract =     "Multithreading has been proposed as an architectural
                 strategy for tolerating latency in multiprocessors and,
                 through limited empirical studies, shown to offer
                 promise. This paper develops an analytical model of
                 multithreaded processor behavior based on a small set
                 of architectural and program parameters. The model
                 gives rise to a large Markov chain, which is solved to
                 obtain a formula for processor efficiency in terms of
                 the number of threads per processor, the remote
                 reference rate, the latency, and the cost of switching
                 between threads. It is shown that a multithreaded
                 processor exhibits three operating regimes: linear
                 (efficiency is proportional to the number of threads),
                 transition, and saturation (efficiency depends only on
                 the remote reference rate and switch cost). Formulae
                 for regime boundaries are derived. The model is
                 embellished to reflect cache degradation due to
                 multithreading, using an analytical model of cache
                 behavior, demonstrating that returns diminish as the
                  number of threads becomes large. Predictions from the
                 embellished model correlate well with published
                 empirical measurements. Prescriptive use of the model
                 under various scenarios indicates that multithreading
                 is effective, but the number of useful threads per
                 processor is fairly small.",
  acknowledgement = ack-nhfb,
  annote =       "Supported in part by NASA. Supported in part by the
                 National Science Foundation through the UCB Mammoth
                 project.",
  keywords =     "Computer architecture; Multiprocessors",
}

@Article{Schmitt:1990:CEM,
  author =       "David A. Schmitt",
  title =        "{C} Extensions For Multi-Threading",
  journal =      j-CUJ,
  volume =       "8",
  number =       "8",
  pages =        "33--??",
  month =        aug,
  year =         "1990",
  ISSN =         "0898-9788",
  bibdate =      "Fri Aug 30 16:52:23 MDT 1996",
  bibsource =    "http://www.cuj.com/cbklist.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "C Users Journal",
}

@MastersThesis{Stapleton:1990:DSS,
  author =       "Joseph Francis Stapleton",
  title =        "Dynamic server selection in a multithreaded network
                 computing environment",
  type =         "Thesis ({M.S.})",
  school =       "Iowa State University",
  address =      "Ames, IA, USA",
  pages =        "66",
  year =         "1990",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@TechReport{Agarwal:1991:PTM,
  author =       "Anant Agarwal",
  title =        "Performance tradeoffs in multithreaded processors",
  type =         "Technical report",
  number =       "MIT/LCS/TR 501; VLSI memo no. 89-566",
  institution =  "Laboratory for Computer Science, Massachusetts
                 Institute of Technology",
  address =      "Cambridge, MA, USA",
  pages =        "39",
  year =         "1991",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Balter:1991:AIG,
  author =       "R. Balter and J. Bernadat and D. Decouchant and A.
                 Duda and A. Freyssinet and S. Krakowiak and M.
                 Meysembourg and P. Le Dot and H. Nguyen Van and E.
                 Paire and M. Riveill and C. Roison and X. Rousset de
                 Pina and R. Scioville and G. Vand{\^o}me",
  title =        "Architecture and Implementation of Guide, an
                 Object-Oriented Distributed System",
  journal =      j-COMP-SYS,
  volume =       "4",
  number =       "1",
  pages =        "31--67",
  month =        "Winter",
  year =         "1991",
  CODEN =        "CMSYE2",
  ISSN =         "0895-6340",
  bibdate =      "Fri Sep 13 08:51:08 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110 (Systems analysis and programming); C6150J
                 (Operating systems)",
  fjournal =     "Computing Systems",
  keywords =     "Class; Distributed object memory; Dynamic links;
                 Execution structures; Execution units; Grenoble
                 Universities integrated distributed environment; Guide;
                 Job sharing; Language; Multi-threaded virtual machines;
                 Nodes; Object model; Object-oriented distributed
                 operating system; Persistent objects storage; Single
                 inheritance; Synchronized objects; Synchronized
                 transactions; Type; UNIX",
  thesaurus =    "Distributed processing; Object-oriented programming;
                 Operating systems [computers]",
}

@Article{Beddow:1991:MTC,
  author =       "A. J. M. Beddow",
  title =        "Multi-Threaded {C} Functions",
  journal =      j-CUJ,
  volume =       "9",
  number =       "1",
  pages =        "57--??",
  month =        jan,
  year =         "1991",
  ISSN =         "0898-9788",
  bibdate =      "Fri Aug 30 16:52:23 MDT 1996",
  bibsource =    "http://www.cuj.com/cbklist.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "C Users Journal",
}

@InProceedings{Bolinger:1991:PSH,
  author =       "D. Bolinger and S. Mangalat",
  title =        "Parallelizing signal handling and process management
                 in {OSF/1}",
  crossref =     "USENIX:1991:PUM",
  pages =        "105--122",
  year =         "1991",
  bibdate =      "Sat Sep 28 19:47:51 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/mach.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Encore Computer Corp., Marlborough, MA, USA",
  classification = "C6110P (Parallel programming); C6150J (Operating
                 systems)",
  keywords =     "Mach kernel; Multi-threaded programming model;
                 Multi-threaded tasks; Multiprocessor-efficient; OSF/1
                 operating system; Parallelization; Performance
                 improvements; Process management; Races; Signal
                 handling; Synchronization problems; System calls; Unix
                 emulation; Unix process-oriented abstractions",
  thesaurus =    "Interrupts; Operating systems [computers]; Parallel
                 programming; Unix",
}

@Article{Canetti:1991:PCP,
  author =       "R. Canetti and L. P. Fertig and S. A. Kravitz and D.
                 Malki and R. Y. Pinter and S. Porat and A. Teperman",
  title =        "The parallel {C} ({pC}) programming language",
  journal =      j-IBM-JRD,
  volume =       "35",
  number =       "5/6",
  pages =        "727--741",
  month =        sep # "\slash " # nov,
  year =         "1991",
  CODEN =        "IBMJAE",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Tue Mar 25 14:26:59 MST 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The authors describe pC (parallel C), an extension of
                 the ANSI C programming language to support medium- to
                 large-grain parallel programming in both shared- and
                 distributed-memory environments. pC aims to make
                 programming for parallel processors accessible to the C
                 community by enriching the C programming model with a
                 small set of constructs supporting parallelism. pC
                 supports shared- and distributed-memory environments
                 via a hierarchical computational model. A pC
                 application comprises a static collection of tasks with
                 disjoint memory spaces. A dynamic collection of threads
                 runs within each task, sharing the data and code of the
                 task. Language constructs specify concurrent execution
                 of threads within a single task. Additional language
                 constructs specify the interactions between threads
                 through the following mechanisms: initiation of threads
                 in remote tasks by remote function call, mailbox-based
                 message passing, and synchronization primitives. The
                 paper introduces the computational model and language
                 constructs of pC and describes a prototype pC compiler
                 and run-time system for the Mach operating system.
                 Several program examples illustrate the utility of pC
                 constructs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Technion-Israel Inst. of
                 Technol., Haifa, Israel",
  classcodes =   "C6140D (High level languages); C6110P (Parallel
                 programming); C6150C (Compilers, interpreters and other
                 processors)",
  classification = "C6110P (Parallel programming); C6140D (High level
                 languages); C6150C (Compilers, interpreters and other
                 processors)",
  corpsource =   "Dept. of Comput. Sci., Technion-Israel Inst. of
                 Technol., Haifa, Israel",
  fjournal =     "IBM Journal of Research and Development",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
  keywords =     "ANSI C programming language; C language; C
                 programming; C programming model; Disjoint memory
                 spaces; disjoint memory spaces; Distributed-memory;
                 distributed-memory; function call; Hierarchical
                 computational model; hierarchical computational model;
                 Language constructs; language constructs; Mach; Mach
                 operating system; Mailbox-based message passing;
                 mailbox-based message passing; model; operating system;
                 Parallel C; parallel C; parallel languages; Parallel
                 programming; parallel programming; Parallelism;
                 parallelism; PC; pC; PC compiler; pC compiler; program
                 compilers; remote; Remote function call; Run-time
                 system; run-time system; Shared memory; shared memory;
                 Synchronization; synchronization; Tasks; tasks;
                 Threads; threads",
  thesaurus =    "C language; Parallel languages; Program compilers",
  treatment =    "P Practical",
}

@Article{Ching:1991:EAP,
  author =       "W.-M. Ching and D. Ju",
  title =        "Execution of automatically parallelized {APL} programs
                 on {RP3}",
  journal =      j-IBM-JRD,
  volume =       "35",
  number =       "5/6",
  pages =        "767--777",
  month =        sep # "\slash " # nov,
  year =         "1991",
  CODEN =        "IBMJAE",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Tue Mar 25 14:26:59 MST 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The authors have implemented an experimental APL/C
                 compiler, which accepts ordinary APL programs and
                 produces C programs. They have also implemented a
                 run-time environment that supports the parallel
                 execution of these C programs on the RP3 computer, a
                 shared-memory, 64-way MIMD machine built at the IBM
                 Thomas J. Watson Research Center. The APL/C compiler
                 uses the front end of the APL/370 compiler and imposes
                 the same restrictions, but requires no parallelization
                 directives from the user. The run-time environment is
                 based on simple synchronization primitives and is
                 implemented using Mach threads. They report the
                 speedups of several compiled programs running on RP3
                 under the Mach operating system. The current
                 implementation exploits only data parallelism. They
                 discuss the relationship between the style of an APL
                 program and its expected benefit from the automatic
                 parallel execution provided by the compiler.",
  acknowledgement = ack-nhfb,
  affiliation =  "IBM Thomas J. Watson Res. Center, Yorktown Heights,
                 NY, USA",
  classcodes =   "C6150C (Compilers, interpreters and other processors);
                 C6150N (Distributed systems); C6140D (High level
                 languages)",
  classification = "C6140D (High level languages); C6150C (Compilers,
                 interpreters and other processors); C6150N (Distributed
                 systems)",
  corpsource =   "IBM Thomas J. Watson Res. Center, Yorktown Heights,
                 NY, USA",
  fjournal =     "IBM Journal of Research and Development",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
  keywords =     "APL; APL/370 compiler; APL/C; APL/C compiler;
                 Automatically parallelized APL programs; automatically
                 parallelized APL programs; C language; C programs;
                 compiler; compilers; Data parallelism; data
                 parallelism; Mach operating; Mach operating system;
                 Mach threads; multiprocessing programs; program; RP3;
                 Shared-memory; shared-memory; synchronisation;
                 Synchronization primitives; synchronization primitives;
                 system",
  thesaurus =    "APL; C language; Multiprocessing programs; Program
                 compilers; Synchronisation",
  treatment =    "P Practical",
}

@Article{Chiueh:1991:MTV,
  author =       "Tzi-cker Chiueh",
  title =        "Multi-threaded vectorization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "352--361",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Culler:1991:FGPa,
  author =       "David E. Culler and Anurag Sah and Klaus E. Schauser
                 and Thorsten von Eicken and John Wawrzynek",
  title =        "Fine-grain parallelism with minimal hardware support:
                 a compiler-controlled threaded abstract machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "164--175",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Culler:1991:FGPb,
  author =       "David E. Culler and Anurag Sah and Klaus E. Schauser
                 and Thorsten von Eicken and John Wawrzynek",
  title =        "Fine-Grain Parallelism with Minimal Hardware Support:
                 a Compiler-Controlled Threaded Abstract Machine",
  journal =      j-SIGPLAN,
  volume =       "26",
  number =       "4",
  pages =        "164--175",
  month =        apr,
  year =         "1991",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat May 01 18:50:04 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Culler:1991:FGPc,
  author =       "David E. Culler and Anurag Sah and Klaus E. Schauser
                 and Thorsten von Eicken and John Wawrzynek",
  title =        "Fine-grain parallelism with minimal hardware support:
                 a compiler-controlled threaded abstract machine",
  journal =      j-OPER-SYS-REV,
  volume =       "25",
  number =       "3S",
  pages =        "164--175",
  month =        apr,
  year =         "1991",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 15:24:15 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Draves:1991:UCI,
  author =       "Richard P. Draves and Brian N. Bershad and Richard F.
                 Rashid and Randall W. Dean",
  title =        "Using continuations to implement thread management and
                 communication in operating systems",
  journal =      j-OPER-SYS-REV,
  volume =       "25",
  number =       "5",
  pages =        "122--136",
  month =        oct,
  year =         "1991",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@InProceedings{Faulkner:1991:PFS,
  author =       "Roger Faulkner and Ron Gomes",
  title =        "The Process File System and Process Model in {UNIX
                 System V}",
  crossref =     "USENIX:1991:PWU",
  pages =        "243--252",
  year =         "1991",
  bibdate =      "Mon Jan 02 08:29:13 2017",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/unix.bib;
                 https://www.math.utah.edu/pub/tex/bib/usenix1990.bib",
  URL =          "http://obits.mlive.com/obituaries/grandrapids/obituary.aspx?pid=180588279;
                 http://thenewstack.io/remembering-roger-faulkner/;
                 https://www.usenix.org/memoriam-roger-faulkner;
                 https://www.usenix.org/sites/default/files/usenix_winter91_faulkner.pdf",
  abstract =     "We describe the process file system {\bf /proc} in
                 UNIX System V Release 4 and its relationship to the
                 UNIX process model abstraction. {\bf /proc} began as a
                 debugger interface superseding {\em ptrace(2)\/} but
                 has evolved into a general interface to the process
                 model. It provides detailed process information and
                 control mechanisms that are independent of operating
                 system implementation details and portable to a large
                 class of real architectures. Control is thorough.
                 Processes can be stopped and started on demand and can
                 be instructed to stop on events of interest: specific
                 machine faults, specific signals, and entry to or exit
                 from specific system calls. Complete encapsulation of a
                 process's execution environment is possible, as well as
                 non-intrusive inspection. Breakpoint debugging is
                 relieved from the ambiguities of signals. Security
                 provisions are complete and non-destructive.\par

                 The addition of multi-threading to the process model
                 motivates a proposal for a substantial change to the
                 {\bf /proc} interface that would replace the
                 single-level flat structure with a hierarchy of
                 directories containing status and control files. This
                 restructuring would eliminate all {\em ioctl(2)\/}
                 operations in favor of {\em read(2)\/} and {\em
                 write(2)\/} operations, which generalize more easily to
                  networks.",
  acknowledgement = ack-nhfb,
  author-dates = "Roger Faulkner (8 April 1940--2 July 2016)",
}

@Article{Gallmeister:1991:EEP,
  author =       "Bill O. Gallmeister and Chris Lanier",
  title =        "Early experience with {POSIX 1003.4} and {POSIX
                  1003.4a}",
  journal =      j-PROC-REAL-TIME-SYS-SYMP,
  pages =        "190--198 (of ix + 307)",
  year =         "1991",
  CODEN =        "PRSYEA",
  ISBN =         "0-8186-2450-7",
  ISBN-13 =      "978-0-8186-2450-6",
  LCCN =         "QA 76.54 R43 1991",
  bibdate =      "Mon Dec 22 09:06:02 1997",
  bibsource =    "Compendex database;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "IEEE catalog number 91CH3090-8.",
  abstract =     "Two proposed IEEE standards for real-time operating
                 systems support, POSIX.4 and POSIX.4a, are proceeding
                 towards IEEE approval and will eventually become
                 international standards. The authors provide a brief
                 overview of the facilities of POSIX.4 and POSIX.4a.
                 They concentrate on a few of the critical features that
                 POSIX.4 and POSIX.4a provide and describe the POSIX.4
                 scheduling interface. The POSIX.4a support for multiple
                 threads of control is also described. The features
                 found in POSIX.4 and POSIX.4a for synchronization of
                 multiple threads, are discussed, and the POSIX.4
                 interprocess communication facility is presented. The
                 performance numbers are given to allow comparisons of
                 the facilities of traditional UNIX systems, the
                 facilities of a representative hard real-time system
                 (LynxOS), and the facilities of POSIX.4 and POSIX.4a.",
  acknowledgement = ack-nhfb,
  classification = "722; 723; 902",
  conference =   "Proceedings of the 12th Real-Time Systems Symposium",
  conferenceyear = "1991",
  fjournal =     "Proceedings --- Real-Time Systems Symposium",
  journalabr =   "Proc Real Time Syst Symp",
  keywords =     "Computer Operating Systems--Standards; Computer
                 Systems, Digital; POSIX.4a Standards; Real Time
                 Operation; Real-Time Operating Systems",
  meetingaddress = "San Antonio, TX, USA",
  meetingdate =  "Dec 4--6 1991",
  meetingdate2 = "12/04--06/91",
  publisherinfo = "IEEE Service Center",
  sponsor =      "IEEE Computer Soc",
}

@TechReport{Glenn:1991:CMH,
  author =       "Ray R. Glenn",
  title =        "Characterizing memory hot spots in a shared memory
                 {MIMD} machine",
  type =         "Technical report",
  number =       "SRC-TR-91-039",
  institution =  inst-SRC-IDA,
  address =      inst-SRC-IDA:adr,
  pages =        "24",
  day =          "15",
  month =        oct,
  year =         "1991",
  bibdate =      "Fri Aug 30 08:01:51 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper analyzes two memory hot spot problems
                 associated with massively parallel MIMD computers. The
                 first is the memory stride problem, which is similar to
                 stride problems found in existing supercomputers. The
                 second hot spot problem occurs in designs that use two
                 separate memory accesses to lock and unlock critical
                 sections (split transaction) and employ a first
                 come/first serve queuing mechanism for shared memory
                 locations. A bistability in throughput brought about by
                 these conditions is analyzed and experimentally
                 demonstrated. Simple equations are presented which
                 predict the throughput at a critical section of code as
                 a function of the number of applied threads. In
                 particular, the mean size of the work items that can be
                 executed in parallel without the possibility of
                 stalling is proportional to the square of the number of
                 threads applied.",
  acknowledgement = ack-nhfb,
  keywords =     "Multiprocessors",
}

@InProceedings{Hirata:1991:MPA,
  author =       "H. Hirata and Y. Mochizuki and A. Nishimura and Y.
                 Nakase",
  title =        "A Multithreaded Processor Architecture with
                 Simultaneous Instruction Issuing",
  crossref =     "Anonymous:1991:PIS",
  pages =        "87--96",
  year =         "1991",
  bibdate =      "Mon Aug 26 10:38:41 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Hironaka:1991:SVP,
  author =       "T. Hironaka and T. Hashimoto and K. Okazaki and K.
                 Murakami",
  title =        "A Single-Chip Vector-Processor Prototype Based on
                 Multithreaded Streaming\slash {FIFO} ({MSFV})
                 Architecture",
  crossref =     "Anonymous:1991:PIS",
  pages =        "77--86",
  year =         "1991",
  bibdate =      "Mon Aug 26 10:38:41 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Horiguchi:1991:PEP,
  author =       "Susumu Horiguchi and Takeo Nakada",
  title =        "Performance Evaluation of Parallel Fast {Fourier}
                 Transform on a Multiprocessor Workstation",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "12",
  number =       "2",
  pages =        "158--163",
  month =        jun,
  year =         "1991",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Sat Apr 12 17:13:17 MDT 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C4190 (Other numerical methods); C4240 (Programming
                 and algorithm theory); C5440 (Multiprocessor systems
                 and techniques)",
  corpsource =   "Dept. of Inf. Sci., Tohoku Univ., Sendai, Japan",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "algorithms; cache protocols; fast Fourier transform;
                 fast Fourier transforms; FFT; floating-; multiprocess
                 operating system; multiprocessing systems;
                 multiprocessor workstation; multithread operating
                 system; operating systems; parallel; parallel FFT;
                 performance; performance evaluation; point
                 coprocessors",
  treatment =    "P Practical",
}

@Article{Hum:1991:NHS,
  author =       "H. H. J. Hum and G. R. Gao",
  title =        "A Novel High-Speed Memory Organization for Fine-Grain
                 Multi-Thread Computing",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "505",
  pages =        "34--??",
  year =         "1991",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon May 13 08:51:55 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Jolitz:1991:PUB,
  author =       "W. F. Jolitz and L. G. Jolitz",
  title =        "Porting {UNIX} to the 386. {The} basic kernel
                 Multiprogramming and multitasking. {II}",
  journal =      j-DDJ,
  volume =       "16",
  number =       "10",
  pages =        "62, 64, 66, 68, 70, 72, 118--120",
  month =        oct,
  year =         "1991",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Sep 10 09:11:02 MDT 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110 (Systems analysis and programming); C6150J
                 (Operating systems)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "386BSD kernel; Multiple simultaneous process
                 execution; Multiprogramming; Multitasking; Multithread
                 operations; Operating systems; Porting; Sleep( ); Swch(
                 ); Switching mechanisms; UNIX; Wakeup( )",
  thesaurus =    "C listings; Microprocessor chips; Multiprogramming;
                 Software portability; Unix",
}

@InProceedings{Jones:1991:BCL,
  author =       "Michael B. Jones",
  title =        "Bringing the {C} Libraries with Us into a
                 Multi-Threaded Future",
  crossref =     "USENIX:1991:PWU",
  pages =        "81--92",
  day =          "21--25",
  month =        jan,
  year =         "1991",
  bibdate =      "Fri Oct 18 07:24:24 MDT 1996",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Carnegie Mellon University",
}

@InProceedings{Kuchlin:1991:MCI,
  author =       "Wolfgang K{\"u}chlin",
  title =        "On the multi-threaded computation of integral
                 polynomial greatest common divisors",
  crossref =     "Watt:1991:IPI",
  pages =        "333--342",
  year =         "1991",
  bibdate =      "Thu Mar 12 08:38:03 MST 1998",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/issac/120694/p333-kuchlin/",
  abstract =     "Reports experiences and practical results from
                 parallelizing the Brown--Collins polynomial g.c.d.
                 algorithm, starting from Collins' SAC-2 implementation
                 IPGCDC. The parallelization environment is PARSAC-2, a
                 multi-threaded version of SAC-2 programmed in C with
                 the parallelization constructs of the C Threads
                 library. IPGCDC computes the g.c.d. and its co-factors
                 of two polynomials in $ Z(x_1, \ldots {}, x_r) $, by
                 first reducing the problem to multiple calculations of
                 modular polynomial g.c.d.'s in $ Z_p(x_1, \ldots {},
                 x_r) $, and then recovering the result by Chinese
                 remaindering. After studying timings of the SAC-2
                 algorithm, the author first parallelizes the Chinese
                 remainder algorithm, and then parallelizes the main
                 loop of IPGCDC by executing the modular g.c.d.
                 computations concurrently. Finally, he determines
                 speed-up's and speed-up efficiencies of our parallel
                 algorithms over a wide range of polynomials. The
                 experiments were conducted on a 12 processor Encore
                 Multimax under Mach.",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. and Inf. Sci., Ohio State Univ.,
                 Columbus, OH, USA",
  classification = "C4240 (Programming and algorithm theory); C7310
                 (Mathematics)",
  keywords =     "algorithms; Brown--Collins polynomial g.c.d.
                 algorithm; Chinese remaindering; Encore Multimax;
                 Multi-threaded computation; PARSAC-2; Polynomial
                 greatest common divisors",
  subject =      "{\bf G.1.0} Mathematics of Computing, NUMERICAL
                 ANALYSIS, General, Parallel algorithms. {\bf F.2.1}
                 Theory of Computation, ANALYSIS OF ALGORITHMS AND
                 PROBLEM COMPLEXITY, Numerical Algorithms and Problems,
                 Computations on polynomials. {\bf I.1.0} Computing
                 Methodologies, SYMBOLIC AND ALGEBRAIC MANIPULATION,
                 General. {\bf I.1.3} Computing Methodologies, SYMBOLIC
                 AND ALGEBRAIC MANIPULATION, Languages and Systems. {\bf
                 D.3.2} Software, PROGRAMMING LANGUAGES, Language
                 Classifications, C.",
  thesaurus =    "Mathematics computing; Parallel algorithms; Symbol
                 manipulation",
}

@InProceedings{Malan:1991:MA,
  author =       "G. Malan and R. Rashid and D. Golub and R. Baron",
  title =        "{DOS} as a {Mach 3.0} application",
  crossref =     "USENIX:1991:PUM",
  pages =        "27--40",
  year =         "1991",
  bibdate =      "Sat Sep 28 19:47:51 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/mach.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Sch. of Comput. Sci., Carnegie Mellon Univ.,
                 Pittsburgh, PA, USA",
  classification = "C6150J (Operating systems); C7430 (Computer
                 engineering)",
  keywords =     "Common DOS functions; Common DOS software; DOS
                 functionality; DOS operating system; Frequently loaded
                 DOS drivers; High memory area; High-speed space combat
                 simulation system; I/O devices; I386/i486 architecture;
                 Latency demands; Mach features; Machine-dependent
                 kernel modifications; Multiple virtual DOS
                 environments; Multithreaded emulation; PC architecture;
                 Performance sensitive PC entertainment software;
                 Timing; Unix emulation; Unix Server; VGA display;
                 Virtual 8086 mode; Virtual machine environment; Wing
                 Commander",
  thesaurus =    "IBM computers; Microcomputer applications; Supervisory
                 programs; Unix; Virtual machines",
}

@Article{Man:1991:MLC,
  author =       "Richard F. Man",
  title =        "A Multithreading Library In {C} For Subsumption
                 Architecture",
  journal =      j-CUJ,
  volume =       "9",
  number =       "11",
  pages =        "42--??",
  month =        nov,
  year =         "1991",
  ISSN =         "0898-9788",
  bibdate =      "Fri Aug 30 16:52:23 MDT 1996",
  bibsource =    "http://www.cuj.com/cbklist.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "C Users Journal",
}

@Article{Marsh:1991:FCU,
  author =       "Brian D. Marsh and Michael L. Scott and Thomas J.
                 LeBlanc and Evangelos P. Markatos",
  title =        "First-class user-level threads",
  journal =      j-OPER-SYS-REV,
  volume =       "25",
  number =       "5",
  pages =        "110--121",
  month =        oct,
  year =         "1991",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@MastersThesis{Mennemeier:1991:HMS,
  author =       "Lawrence Mennemeier",
  title =        "Hardware mechanisms to support concurrent threads on
                 {RISC} and superscalar multiprocessors",
  type =         "Thesis ({M.S.})",
  school =       "University of California, Santa Cruz",
  pages =        "vii + 39",
  year =         "1991",
  LCCN =         "QA76.5.M44 1991",
  bibdate =      "Fri May 10 12:18:17 MDT 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Masters theses -- University of California, Santa Cruz
                 -- 1991; multiprocessors; parallel processing
                 (electronic computers)",
}

@Article{Papadopoulos:1991:MRV,
  author =       "Gregory M. Papadopoulos and Kenneth R. Traub",
  title =        "Multithreading: a revisionist view of dataflow
                 architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "342--351",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@PhdThesis{Park:1991:PTM,
  author =       "Won Woo Park",
  title =        "Performance-area trade-offs in multithreaded
                 processing unit",
  type =         "Thesis ({Ph.D.})",
  school =       "University of Texas at Austin",
  address =      "Austin, TX, USA",
  pages =        "xvii + 165",
  year =         "1991",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Computer architecture; Multiprocessors; Parallel
                 processing (Electronic computers)",
}

@MastersThesis{Pham:1991:EMD,
  author =       "Thuan Quang Pham",
  title =        "The experimental migration of a distributed
                 application to a multithreaded environment",
  type =         "Thesis ({M.S.})",
  school =       "Massachusetts Institute of Technology, Department of
                 Electrical Engineering and Computer Science",
  address =      "Cambridge, MA, USA",
  pages =        "51",
  year =         "1991",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Ponamgi:1991:DMP,
  author =       "M. Krish Ponamgi and Wenwey Hseush and Gail E.
                 Kaiser",
  title =        "Debugging Multithreaded Programs with {MPD}",
  journal =      j-IEEE-SOFTWARE,
  volume =       "8",
  number =       "3",
  pages =        "37--43",
  month =        may,
  year =         "1991",
  CODEN =        "IESOEG",
  ISSN =         "0740-7459 (print), 1937-4194 (electronic)",
  ISSN-L =       "0740-7459",
  bibdate =      "Sat Jan 25 07:35:26 MST 1997",
  bibsource =    "Compendex database;
                 https://www.math.utah.edu/pub/tex/bib/ieeesoft.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Misc/IMMD_IV.bib; Parallel/debug_3.1.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept of Comput Sci, Columbia Univ, New York, NY, USA",
  classification = "723",
  fjournal =     "IEEE Software",
  journal-URL =  "http://www.computer.org/portal/web/csdl/magazines/software",
  journalabr =   "IEEE Software",
  keywords =     "Computer Programming; Computer Systems, Digital ---
                 Multiprocessing; Event Recognition; Multiprocessor
                 Debugger; Multithreaded Software; Pattern Recognition;
                 Program Debugging",
}

@InProceedings{Powell:1991:SMT,
  author =       "M. L. Powell and S. R. Kleiman and S. Barton and D.
                 Shah and D. Stein and M. Weeks",
  title =        "{SunOS} Multi-thread Architecture",
  crossref =     "USENIX:1991:PWU",
  institution =  "Sun Microsystems, Inc.",
  pages =        "65--80",
  day =          "21--25",
  month =        jan,
  year =         "1991",
  bibdate =      "Wed Aug 13 10:48:45 MDT 1997",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.usenix.org/cgi-bin/sortbib.pl?-sA",
  acknowledgement = ack-nhfb,
  affiliation =  "Sun Microsystems, Inc.",
}

@Article{Richman:1991:EHC,
  author =       "Scott Richman",
  title =        "Examining the {Hamilton C} shell ({Unix} power for
                 {OS/2})",
  journal =      j-DDJ,
  volume =       "16",
  number =       "1",
  pages =        "98, 100, 102, 104--106",
  month =        jan,
  year =         "1991",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Sep 10 09:11:02 MDT 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "Doug Hamilton's C Shell helps you create more powerful
                 OS/2 programs.",
  acknowledgement = ack-nhfb,
  classification = "C6115 (Programming support); C6150E (General utility
                 programs); C6150J (Operating systems)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "C shell environment; C++ programs; High-performance
                 file system; Large command lines; Long filenames; OS/2
                 features; Pipes; Presentation Manager; Script language;
                 Script program; Shell scripts; Text windows; Threads;
                 Utilities",
  thesaurus =    "C listings; Software packages; Software tools; Utility
                 programs",
}

@TechReport{Saavedra-Barrera:1991:ASM,
  author =       "Rafael H. Saavedra-Barrera and David E. Culler",
  title =        "An analytical solution for a {Markov} chain modeling
                 multithreaded execution",
  type =         "Report",
  number =       "UCB/CSD 91/623",
  institution =  "University of California, Berkeley, Computer Science
                 Division",
  address =      "Berkeley, CA, USA",
  pages =        "24",
  month =        apr,
  year =         "1991",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multithreading is an architectural technique aimed at
                 maintaining high processor utilization in the presence
                 of large memory or interprocessor communication
                 latency. While waiting for a remote reference to
                 complete, the processor switches to another execution
                 thread. Several realizations of this concept have been
                 proposed, but little data is available on the actual
                 costs and benefits. This paper presents an analytical
                 model of multithreaded execution, which may serve to
                 guide and explain empirical studies. The model is based
                 on three key parameters: thread run-length, switch
                 cost, and latency. A closed-form expression for
                 processor utilization is obtained for deterministic and
                 stochastic run-lengths. The derivation involves
                 identifying specific patterns in the very large set of
                 equations forming the Markov chain. Using this result,
                 three operating regimes are identified for a
                 multithreaded processor subject to long latencies:
                 linear, where utilization is proportional to the number
                 of threads per processor, saturation, where utilization
                 is determined only by the run-length and switch cost,
                 and transition between the other regimes. The model can
                 be used to estimate the effects of several
                 architectural variations.",
  acknowledgement = ack-nhfb,
  annote =       "Supported in part by NASA under consortium agreement
                 NCA2-128 and cooperative agreement NCC2-550. Supported
                 in part by the National Science Foundation.",
  keywords =     "Computer architecture; Markov chains",
}

@Article{Schauser:1991:CCM,
  author =       "Klaus Erik Schauser and David E. Culler and Thorsten
                 {von Eicken}",
  title =        "Compiler-Controlled Multithreading for Lenient
                 Parallel Languages",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "523",
  pages =        "50--??",
  year =         "1991",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon May 13 08:51:55 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@MastersThesis{Schauser:1991:CDT,
  author =       "Klaus Erik Schauser",
  title =        "Compiling dataflow into threads: efficient
                 compiler-controlled multithreading for lenient parallel
                 languages",
  type =         "Thesis ({M.S.})",
  school =       "University of California, Berkeley, Computer Science
                 Division",
  address =      "Berkeley, CA, USA",
  pages =        "71",
  day =          "2",
  month =        jul,
  year =         "1991",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Also available as Report UCB/CSD 91/644",
  abstract =     "Powerful non-strict parallel languages require fast
                 dynamic scheduling. This thesis explores how the need
                 for multithreaded execution can be addressed as a
                 compilation problem, to achieve switching rates
                 approaching what hardware mechanisms might provide.
                 Compiler-controlled multithreading is examined through
                 compilation of a lenient parallel language, ID90, for a
                 threaded abstract machine, TAM. A key feature of TAM is
                 that synchronization is explicit and occurs only at the
                 start of a thread, so that a simple cost model can be
                 applied. A scheduling hierarchy allows the compiler to
                 schedule logically related threads closely together in
                 time and to use registers across threads. Remote
                 communication is via message sends and split-phase
                 memory accesses. Messages and memory replies are
                 received by compiler-generated message handlers which
                 rapidly integrate these events with thread scheduling.
                 To compile ID90 for TAM, we employ a new parallel
                 intermediate form, dual-graphs, with distinct control
                 and data arcs. This provides a clean framework for
                 partitioning the program into threads, scheduling
                 threads, and managing registers under asynchronous
                 execution. The compilation process is described and
                 preliminary measurements of the effectiveness of the
                 approach are discussed. Previous to this work,
                 execution of Id90 programs was limited to specialized
                 architectures or dataflow graph interpreters. By
                 compiling via TAM, we have achieved more than two
                 orders of magnitude performance improvement over graph
                 interpreters on conventional machines, making this Id90
                 implementation competitive with machines supporting
                 dynamic instruction scheduling in hardware. Timing
                 measurements show that our Id90 implementation on a
                 standard RISC can achieve a performance close to Id90
                 on one processor of the recent dataflow machine
                 Monsoon. It can be seen that the TAM partitioning
                 presented in this thesis reduces the control overhead
                 substantially and that more aggressive partitioning
                 would yield modest additional benefit. There is,
                 however, considerable room for improvement in
                 scheduling and register management.",
  acknowledgement = ack-nhfb,
  annote =       "Supported in part by the National Science Foundation.
                 Supported in part by Motorola Inc., the TRW Foundation,
                 and the International Computer Science Institute",
  keywords =     "Compilers (Computer programs); Parallel programming
                 (Computer science)",
}

@TechReport{Schauser:1991:CML,
  author =       "Klaus Erik Schauser and David E. Culler and Thorsten
                 {von Eicken}",
  title =        "Compiler-controlled multithreading for lenient
                 parallel languages",
  type =         "Report",
  number =       "UCB/CSD 91/640",
  institution =  "University of California, Berkeley, Computer Science
                 Division",
  address =      "Berkeley, CA, USA",
  pages =        "21",
  day =          "30",
  month =        jul,
  year =         "1991",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "A version of this report is to appear in the
                 Proceedings of FPCA '91 Conference on Functional
                 Programming Languages and Computer Architecture, Aug.
                 1991, Springer-Verlag",
  abstract =     "Tolerance to communication latency and inexpensive
                 synchronization are critical for general-purpose
                 computing on large multiprocessors. Fast dynamic
                 scheduling is required for powerful nonstrict parallel
                 languages. However, machines that support rapid
                 switching between multiple execution threads remain a
                 design challenge. This paper explores how multithreaded
                 execution can be addressed as a compilation problem, to
                 achieve switching rates approaching what hardware
                 mechanisms might provide. Compiler-controlled
                 multithreading is examined through compilation of a
                 lenient parallel language, Id90, for a threaded
                 abstract machine, TAM. A key feature of TAM is that
                 synchronization is explicit and occurs only at the
                 start of a thread, so that a simple cost model can be
                 applied. A scheduling hierarchy allows the compiler to
                 schedule logically related threads closely together in
                 time and to use registers across threads. Remote
                 communication is via message sends and split-phase
                 memory accesses. Messages and memory replies are
                 received [sic] by compiler-generated message handlers
                 which rapidly integrate these events with thread
                 scheduling. To compile Id90 for TAM, we employ a new
                 parallel intermediate form, dual-graphs, with distinct
                 control and data arcs. This provides a clean framework
                 for partitioning the program into threads, scheduling
                 threads, and managing registers under asynchronous
                 execution. The compilation process is described and
                 preliminary measurements of its effectiveness are
                 discussed. Dynamic execution measurements are obtained
                 via a second compilation step, which translates TAM
                 into native code for existing machines with
                 instrumentation incorporated. These measurements show
                 that the cost of compiler-controlled multithreading is
                 within a small factor of the cost of control flow in
                 sequential languages.",
  acknowledgement = ack-nhfb,
  annote =       "Supported in part by the National Science Foundation
                 PYI Award. Supported in part by Motorola Inc., the TRW
                 Foundation and the Semiconductor Research Corporation
                 Supported in part by J. Wawrzynek's PYI Award.
                 Supported in part by NSF Infrastructure Grant.",
  keywords =     "Compilers (Computer programs); Parallel programming
                 (Computer science)",
}

@Article{Schwan:1991:RTT,
  author =       "Karsten Schwan and Hongyi Zhou and Ahmed Gheith",
  title =        "Real-time threads",
  journal =      j-OPER-SYS-REV,
  volume =       "25",
  number =       "4",
  pages =        "35--46",
  month =        oct,
  year =         "1991",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Speer:1991:DTP,
  author =       "Thomas G. Speer and Mark W. Storm",
  title =        "{Digital}'s Transaction Processing Monitors",
  journal =      j-DEC-TECH-J,
  volume =       "3",
  number =       "1",
  pages =        "18--32",
  month =        "Winter",
  year =         "1991",
  CODEN =        "DTJOEL",
  ISSN =         "0898-901X",
  bibdate =      "Thu Mar 20 18:15:43 MST 1997",
  bibsource =    "/usr/local/src/bib/bibliography/Database/Graefe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "ftp://ftp.digital.com/pub/Digital/info/DTJ/v3n1/Digitals_Transaction_Processi_01oct1991DTJ102P8.ps;
                 http://www.digital.com:80/info/DTJ102/DTJ102SC.TXT",
  abstract =     "Digital provides two transaction processing (TP)
                 monitor products --- ACMS (Application Control and
                 Management System) and DECintact (Integrated
                 Application Control). Each monitor is a unified set of
                 transaction processing services for the application
                 environment. These services are layered on the VMS
                 operating system. Although there is a large functional
                 overlap between the two, both products achieve similar
                 goals by means of some significantly different
                 implementation strategies. Flow control and
                 multithreading in the ACMS monitor is managed by means
                 of a fourth-generation language (4GL) task definition
                 language. Flow control and multithreading in the
                 DECintact monitor is managed at the application level
                 by third-generation language (3GL) calls to a library
                 of services. The ACMS monitor supports a deferred task
                 model of queuing, and the DECintact monitor supports a
                 message-based model. Over time, the persistent
                 distinguishing feature between the two monitors will be
                 their different application programming interfaces.",
  acknowledgement = ack-nhfb,
  affiliation =  "Digital Equipment Corp., Maynard, MA, USA",
  classcodes =   "C6150J (Operating systems)",
  classification = "C6150J (Operating systems)",
  corpsource =   "Digital Equipment Corp., Maynard, MA, USA",
  fjournal =     "Digital Technical Journal",
  keywords =     "ACMS; Application; Application Control; Application
                 Control and Management System; Application programming
                 interfaces; application programming interfaces; Control
                 and Management System; DECintact; Digital; Integrated;
                 Integrated Application Control; message-based model;
                 Message-based model; monitors; Monitors;
                 Multithreading; multithreading; Queuing; queuing;
                 supervisory programs; task definition language; Task
                 definition language; transaction processing;
                 Transaction processing; transaction processing; VMS
                 operating system",
  thesaurus =    "Supervisory programs; Transaction processing",
  treatment =    "P Practical",
}

@Article{Traub:1991:MTC,
  author =       "Kenneth R. Traub",
  title =        "Multi-thread Code Generation for Dataflow
                 Architectures from Non-Strict Programs",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "523",
  pages =        "73--??",
  year =         "1991",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon May 13 08:51:55 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Agarwal:1992:PTM,
  author =       "Anant Agarwal",
  title =        "Performance tradeoffs in multithreaded processors",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "3",
  number =       "5",
  pages =        "525--539",
  month =        sep,
  year =         "1992",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/71.159037",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Fri Apr 11 15:20:39 MDT 1997",
  bibsource =    "Compendex database;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Lab for Comput Sci, MIT, Cambridge, MA, USA",
  classification = "722.1; 722.4; C4230M (Multiprocessor
                 interconnection); C4240P (Parallel programming and
                 algorithm theory); C5220P (Parallel architecture);
                 C5320G (Semiconductor storage); C5440 (Multiprocessor
                 systems and techniques); C5470 (Performance evaluation
                 and testing); C6120 (File organisation)",
  corpsource =   "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
  journalabr =   "IEEE Trans Parallel Distrib Syst",
  keywords =     "buffer storage; cache interference; Cache memories;
                 caches; contention; context-switching overhead;
                 data-sharing; Digital storage; interconnection
                 networks; Interconnection networks; multiprocessing
                 systems; multiprocessor; multithreaded processors;
                 network; network bandwidth; parallel; parallel
                 algorithms; Parallel processing systems; Performance;
                 Performance analysis; performance evaluation; Pipeline
                 processing systems; programming; storage management;
                 switching theory",
  treatment =    "P Practical; T Theoretical or Mathematical",
}

@InProceedings{Alverson:1992:EHP,
  author =       "G. A. Alverson and R. Alverson and D. Callahan and B.
                 Koblenz",
  title =        "Exploiting Heterogeneous Parallelism on a
                 Multi-threaded Multiprocessor",
  crossref =     "ACM:1992:CPI",
  pages =        "188--197",
  year =         "1992",
  bibdate =      "Mon Aug 26 10:38:41 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Anderson:1992:SAE,
  author =       "Thomas E. Anderson and Brian N. Bershad and Edward D.
                 Lazowska and Henry M. Levy",
  title =        "Scheduler Activations: Effective Kernel Support for
                 the User-Level Management of Parallelism",
  journal =      j-TOCS,
  volume =       "10",
  number =       "1",
  pages =        "53--79",
  month =        feb,
  year =         "1992",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1992-10-1/p53-anderson/",
  abstract =     "{\em Threads\/} are the vehicle for concurrency in
                 many approaches to parallel programming. Threads can be
                 supported either by the operating system kernel or by
                 user-level library code in the application address
                 space, but neither approach has been fully
                 satisfactory. This paper addresses this dilemma. First,
                 we argue that the performance of kernel threads is {\em
                 inherently\/} worse than that of user-level threads,
                 rather than this being an artifact of existing
                 implementations; managing parallelism at the user level
                 is essential to high-performance parallel computing.
                 Next, we argue that the problems encountered in
                 integrating user-level threads with other system
                 services is a consequence of the lack of kernel support
                 for user-level threads provided by contemporary
                 multiprocessor operating systems; kernel threads are
                 the {\em wrong abstraction\/} on which to support
                 user-level management of parallelism. Finally, we
                 describe the design, implementation, and performance of
                 a new kernel interface and user-level thread package
                 that together provide the same functionality as kernel
                 threads without compromising the performance and
                 flexibility advantages of user-level management of
                 parallelism.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; measurement; performance",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Scheduling. {\bf D.4.4} Software, OPERATING
                 SYSTEMS, Communications Management, Input/output. {\bf
                 D.4.7} Software, OPERATING SYSTEMS, Organization and
                 Design. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance.",
}

%% Part 1 of a two-part BYTE series; Part 2 is entry Anonymous:1992:MWPb.
%% End page unknown ("289--??"), per this file's convention for open ranges.
@Article{Anonymous:1992:MWPa,
  author =       "Anonymous",
  title =        "It's a Multithreaded World, Part 1: Multithreaded
                  operating systems are becoming the norm. {Here}'s how
                  your applications can exploit them",
  journal =      j-BYTE,
  volume =       "17",
  number =       "5",
  pages =        "289--??",
  month =        may,
  year =         "1992",
  CODEN =        "BYTEDJ",
  ISSN =         "0360-5280 (print), 1082-7838 (electronic)",
  ISSN-L =       "0360-5280",
  bibdate =      "Tue Jan 2 10:01:41 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "BYTE Magazine",
}

%% Part 2 of a two-part BYTE series; Part 1 is entry Anonymous:1992:MWPa.
@Article{Anonymous:1992:MWPb,
  author =       "Anonymous",
  title =        "It's a Multithreaded World, Part 2: Multithreaded
                  operating systems are taking over. {Are} your
                  applications ready?",
  journal =      j-BYTE,
  volume =       "17",
  number =       "6",
  pages =        "351--??",
  month =        jun,
  year =         "1992",
  CODEN =        "BYTEDJ",
  ISSN =         "0360-5280 (print), 1082-7838 (electronic)",
  ISSN-L =       "0360-5280",
  bibdate =      "Tue Jan 2 10:01:41 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "BYTE Magazine",
}

%% M.S. thesis (UT Austin, 1992) evaluating a multithreaded microprocessor
%% built on the MIPS R3000 instruction set.
@MastersThesis{Arunachalam:1992:EMM,
  author =       "Prakash Arunachalam",
  title =        "Evaluation of a multithreaded microprocessor with
                  {MIPS R3000} instruction set",
  type =         "Thesis ({M.S. in Engineering})",
  school =       "University of Texas at Austin",
  address =      "Austin, TX, USA",
  pages =        "vii + 45",
  year =         "1992",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Computer architecture; MIPS R3000 series
                  microprocessors; Parallel processing (Electronic
                  computers); Reduced instruction set computers; RISC
                  microprocessors",
}

%% Dr. Dobb's Journal article on the Silicon Graphics IRIS Power C
%% parallel extensions; the non-contiguous page list is as published.
@Article{Bauer:1992:PCE,
  author =       "Barr E. Bauer",
  title =        "Parallel {C} extensions",
  journal =      j-DDJ,
  volume =       "17",
  number =       "8",
  pages =        "110, 112--114, 124, 127",
  month =        aug,
  year =         "1992",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Sep 10 10:06:23 MDT 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Schering-Plough Res. Inst., Bloomfield, NJ, USA",
  classification = "C6110P (Parallel programming); C6140D (High level
                  languages); C6150C (Compilers, interpreters and other
                  processors)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "C extensions; C programs; Parallel execution regions;
                  Parallel execution threads; Parallelized program;
                  Serial program; Silicon Graphics IRIS Power C
                  compiler",
  thesaurus =    "C language; C listings; Parallel languages; Program
                  compilers",
}

%% NOTE(review): fixed an apparent transcription typo in the abstract
%% ("do no support" -> "do not support"); all other text is unchanged.
%% The URL path suggests this SIGPLAN Notices issue is the ASPLOS-V
%% proceedings -- TODO confirm against the ACM DL record.
@Article{Bershad:1992:FME,
  author =       "Brian N. Bershad and David D. Redell and John R.
                  Ellis",
  title =        "Fast mutual exclusion for uniprocessors",
  journal =      j-SIGPLAN,
  volume =       "27",
  number =       "9",
  pages =        "223--233",
  month =        sep,
  year =         "1992",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                  (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:16:26 MST 2003",
  bibsource =    "http://portal.acm.org/; http://www.acm.org/pubs/toc/;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/asplos/143365/p223-bershad/",
  abstract =     "In this paper we describe restartable atomic
                  sequences, an {\em optimistic\/} mechanism for
                  implementing simple atomic operations (such as {\em
                  Test-And-Set\/}) on a uniprocessor. A thread that is
                  suspended within a restartable atomic sequence is
                  resumed by the operating system at the beginning of the
                  sequence, rather than at the point of suspension. This
                  guarantees that the thread eventually executes the
                  sequence {\em atomically\/}. A restartable atomic
                  sequence has significantly less overhead than other
                  software-based synchronization mechanisms, such as
                  kernel emulation or software reservation. Consequently,
                  it is an attractive alternative for use on
                  uniprocessors that do not support atomic operations.
                  Even on processors that do support atomic operations in
                  hardware, restartable atomic sequences can have lower
                  overhead. We describe different implementations of
                  restartable atomic sequences for the Mach 3.0 and Taos
                  operating systems. These systems' thread management
                  packages rely on atomic operations to implement
                  higher-level mutual exclusion facilities. We show that
                  improving the performance of low-level atomic
                  operations, and therefore mutual exclusion mechanisms,
                  improves application performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "design; languages; measurement; performance",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                  Management, Mutual exclusion.",
}

%% MIT M.S. thesis; also issued as report MIT/LCS/TR 552 (see note field).
@MastersThesis{Blumofe:1992:MSM,
  author =       "Robert D. (Robert David) Blumofe",
  title =        "Managing storage for multithreaded computations",
  type =         "Thesis ({M.S.})",
  school =       "Massachusetts Institute of Technology, Laboratory for
                  Computer Science, Department of Electrical Engineering
                  and Computer Science",
  address =      "Cambridge, MA, USA",
  pages =        "83",
  year =         "1992",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Also available as Report MIT/LCS/TR 552.",
  acknowledgement = ack-nhfb,
}

%% SIGARCH Computer Architecture News 20(2), May 1992 -- presumably the
%% ISCA '92 proceedings issue (compare Hirata:1992:EPA, same issue);
%% TODO confirm.
@Article{Boothe:1992:IMT,
  author =       "Bob Boothe and Abhiram Ranade",
  title =        "Improved multithreading techniques for hiding
                  communication latency in multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "214--223",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

%% SIGPLAN Notices article describing a C threads library.
@Article{Cattaneo:1992:ACT,
  author =       "G. Cattaneo and G. Di Giore and M. Ruotolo",
  title =        "Another {C} Threads Library",
  journal =      j-SIGPLAN,
  volume =       "27",
  number =       "12",
  pages =        "81--90",
  month =        dec,
  year =         "1992",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                  (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:16:30 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

%% M.S. thesis (UT Austin, 1992) on an instruction cache for a
%% multithreaded RISC processor.
@MastersThesis{Chowdhury:1992:PEA,
  author =       "Indranil Chowdhury",
  title =        "Performance evaluation and architecture of an
                  instruction cache for multithreaded {RISC} processor",
  type =         "Thesis ({M.S. in Engineering})",
  school =       "University of Texas at Austin",
  address =      "Austin, TX, USA",
  pages =        "x + 93",
  year =         "1992",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Cache memory -- Evaluation -- Simulation methods;
                  Computer architecture; Microprocessors; Reduced
                  instruction set computers",
}

%% Berkeley technical report; a one-page item with the same title and
%% authors appears in CAN 20(2) as entry Culler:1992:AMMb.
@TechReport{Culler:1992:AMMa,
  author =       "David E. Culler and Michial Gunter and James C. Lee",
  title =        "Analysis of multithreaded microprocessors under
                  multiprogramming",
  type =         "Report",
  number =       "UCB/CSD 92/687",
  institution =  "University of California, Berkeley, Computer Science
                  Division",
  address =      "Berkeley, CA, USA",
  pages =        "17",
  month =        may,
  year =         "1992",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multithreading has been proposed as a means of
                  tolerating long memory latencies in multiprocessor
                  systems. Fundamentally, it allows multiple concurrent
                  subsystems (cpu, network, and memory) to be utilized
                  simultaneously. This is advantageous on uniprocessor
                  systems as well, since the processor is utilized while
                  the memory system services misses. We examine
                  multithreading on high-performance uniprocessors as a
                  means of achieving better cost/performance on multiple
                  processes. Processor utilization and cache behavior are
                  studied both analytically and through simulation of
                  timesharing and multithreading using interleaved
                  reference traces. Multithreading is advantageous when
                  one has large on-chip caches (32 kilobytes),
                  associativity of two, and a memory access cost of
                  roughly 50 instruction times. At this point, a small
                  number of threads (2-4) is sufficient, the thread
                  switch need not be extraordinarily fast, and the memory
                  system need support only one or two outstanding misses.
                  The increase in processor real-estate to support
                  multithreading is modest, given the size of the cache
                  and floating-point units. A surprising observation is
                  that miss ratios may be lower with multithreading than
                  with timesharing under a steady-state load. This occurs
                  because switch-on-miss multithreading introduces unfair
                  thread scheduling, giving more CPU cycles to processes
                  with better cache behavior.",
  acknowledgement = ack-nhfb,
  annote =       "Supported in part by the National Science Foundation.
                  Supported in part by Motorola Inc. and the TRW
                  Foundation",
  keywords =     "Microprocessors; Multiprogramming (Electronic
                  computers)",
}

%% One-page item (pages = "438--438") in CAN 20(2); same title and
%% authors as the Berkeley technical report in entry Culler:1992:AMMa.
@Article{Culler:1992:AMMb,
  author =       "David E. Culler and Michial Gunter and James C. Lee",
  title =        "Analysis of multithreaded microprocessors under
                  multiprogramming",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "438--438",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

%% NOTE(review): entry Day:1992:INC is a byte-for-byte duplicate of this
%% entry (same article, volume, number, pages); one of the two keys
%% should eventually be retired.
@Article{Day:1992:INB,
  author =       "Michael Day",
  title =        "Implementing {NLM-Based} Client\slash Server
                  Architectures",
  journal =      j-DDJ,
  volume =       "17",
  number =       "10",
  pages =        "78--84",
  month =        oct,
  year =         "1992",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Sep 03 09:15:34 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                  UnCover database",
  abstract =     "NetWare NLMs take full advantage of the multitasking,
                  multithreaded architecture of the operating system.
                  Michael presents a distributed file manager made up of
                  two modules: ENGINE.NLM, an NLM running on a NetWare
                  3.x server, and CLIENT.EXE, a DOS-based front end
                  running on the client.",
  acknowledgement = ack-nhfb,
  classification = "C6150N (Distributed systems)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "32-Bit protected-mode programs; Client/server
                  architectures; Distributed file manager; DOS-based
                  front end; Multitasking; Multithreaded architecture;
                  NetWare 3.x operating system; Netware Loadable Modules;
                  Networked system",
  thesaurus =    "Distributed processing; File servers",
}

%% NOTE(review): this entry is byte-for-byte identical to Day:1992:INB
%% (same article, volume, number, pages).  The duplicate key is retained
%% in case it is already cited elsewhere, and the duplication is flagged
%% via the (style-ignored) remark field below; one of the two entries
%% should eventually be removed.
@Article{Day:1992:INC,
  author =       "Michael Day",
  title =        "Implementing {NLM-Based} Client\slash Server
                  Architectures",
  journal =      j-DDJ,
  volume =       "17",
  number =       "10",
  pages =        "78--84",
  month =        oct,
  year =         "1992",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Sep 03 09:15:34 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                  UnCover database",
  abstract =     "NetWare NLMs take full advantage of the multitasking,
                  multithreaded architecture of the operating system.
                  Michael presents a distributed file manager made up of
                  two modules: ENGINE.NLM, an NLM running on a NetWare
                  3.x server, and CLIENT.EXE, a DOS-based front end
                  running on the client.",
  acknowledgement = ack-nhfb,
  classification = "C6150N (Distributed systems)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "32-Bit protected-mode programs; Client/server
                  architectures; Distributed file manager; DOS-based
                  front end; Multitasking; Multithreaded architecture;
                  NetWare 3.x operating system; Netware Loadable Modules;
                  Networked system",
  remark =       "Duplicate of entry Day:1992:INB.",
  thesaurus =    "Distributed processing; File servers",
}

%% IEEE TPDS article with MR classification data.  The keywords value is
%% a verbatim Compendex/INSPEC index dump; fragments such as "n-;" and
%% "primitive;" are split index terms, preserved as received.
@Article{DHollander:1992:PLL,
  author =       "Erik H. D'Hollander",
  title =        "Partitioning and labeling of loops by unimodular
                  transformations",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "3",
  number =       "4",
  pages =        "465--476",
  month =        jul,
  year =         "1992",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/71.149964",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  MRclass =      "68Q10 (68Q22)",
  MRnumber =     "93f:68030",
  bibdate =      "Mon Apr 14 07:37:07 1997",
  bibsource =    "Compendex database;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept of Electr Eng, State Univ of Ghent, Belgium",
  classification = "722; 723; C4240P (Parallel programming and algorithm
                  theory); C6110P (Parallel programming); C6150C
                  (Compilers, interpreters and other processors)",
  corpsource =   "Dept. of Electr. Eng., State Univ. of Ghent, Belgium",
  fjournal =     "IEEE Transactions on Parallel and Distributed
                  Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
  journalabr =   "IEEE Trans Parallel Distrib Syst",
  keywords =     "computational complexity; Computer Programming ---
                  Algorithms; Computer Systems Programming; constant
                  dependence vectors; dependence matrix; dependent
                  iterations; do-loops; fold nested loop; independent
                  subsets; invariant dependence; join; labelling
                  algorithm; loop labelling; loop partitioning;
                  Multiprocessing Programs; multithreaded dynamic
                  scheduling; n-; parallel; parallel algorithms; parallel
                  DO-ALL loops; partitioning algorithm; Partitioning
                  Algorithms; primitive; program compilers; Program
                  Transformations; programming; programming theory;
                  relation; scheduling; serial loop; transformation;
                  unimodular; Unimodular Transformations; unimodular
                  transformations",
  treatment =    "T Theoretical or Mathematical",
}

%% M.S. thesis (UC Santa Barbara, 1992) on the DISC multi-thread
%% architecture; includes a library LCCN call number.
@MastersThesis{Donalson:1992:DDP,
  author =       "Douglas Dale Donalson",
  title =        "{DISC}: a dynamic performance evaluation of a
                  multi-thread architecture",
  type =         "Thesis ({M.S.})",
  school =       "Electrical and Computer Engineering Department,
                  University of California, Santa Barbara",
  address =      "Santa Barbara, CA, USA",
  pages =        "ix + 88",
  year =         "1992",
  LCCN =         "TK174.C2 S25 DOND 1992",
  bibdate =      "Sat Apr 20 11:18:53 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

%% USENIX Summer 1992 paper on multithreading the SVR4 kernel (SunOS 5 /
%% Solaris work).  Venue data is inherited via crossref USENIX:1992:PSU,
%% which must be defined later in this file (not visible in this
%% section) -- classic BibTeX requires the parent to follow its children.
@InProceedings{Eykholt:1992:BMM,
  author =       "J. R. Eykholt and S. R. Kleiman and S. Barton and R.
                  Faulkner and D. Stein and M. Smith and A. Shivalingiah
                  and J. Voll and M. Weeks and D. Williams",
  title =        "Beyond Multiprocessing: Multithreading the {System V
                  Release} 4 Kernel",
  crossref =     "USENIX:1992:PSU",
  pages =        "11--18",
  month =        "Summer",
  year =         "1992",
  bibdate =      "Fri Oct 18 07:24:24 MDT 1996",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "SunSoft Inc.",
}

%% University of Washington technical report on overlapping computation
%% and communication via multithreading in message-passing programs.
@TechReport{Felten:1992:IPM,
  author =       "Edward W. Felten and Dylan James McNamee",
  title =        "Improving the performance of message-passing
                  applications by multithreading",
  type =         "Technical report",
  number =       "92-09-07",
  institution =  "University of Washington, Dept. of Computer Science
                  and Engineering",
  address =      "Seattle, WA, USA",
  pages =        "6",
  year =         "1992",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Achieving maximum performance in message-passing
                  programs requires that calculation and communication be
                  overlapped. However, the program transformations
                  required to achieve this overlap are error-prone and
                  add significant complexity to the application program.
                  We argue that calculation/communication overlap can be
                  achieved easily and consistently by executing multiple
                  threads of control on each processor, and that this
                  approach is practical on message-passing architectures
                  without any special hardware support. We present timing
                  data for a typical message-passing application, to
                  demonstrate the advantages of our scheme.",
  acknowledgement = ack-nhfb,
  annote =       "Supported in part by the National Science Foundation.
                  Supported in part by the Washington Technology Center,
                  Digital Equipment Corporation, Apple Computer Company,
                  a Mercury Seven Fellowship and an AT\&T Ph.D.
                  Scholarship",
  keywords =     "Operating systems",
}

%% Survey report on compilation for parallel machines.  The institution
%% and address use the @string macros inst-SRC-IDA / inst-SRC-IDA:adr,
%% which must be defined elsewhere in this file (not visible in this
%% section); an undefined macro would silently expand to empty text.
@TechReport{Gokhale:1992:ICI,
  author =       "Maya B. Gokhale and William W. Carlson",
  title =        "An introduction to compilation issues for parallel
                  machines",
  type =         "Technical report",
  number =       "SRC-TR-92-062",
  institution =  inst-SRC-IDA,
  address =      inst-SRC-IDA:adr,
  pages =        "38",
  day =          "8",
  month =        sep,
  year =         "1992",
  bibdate =      "Fri Aug 30 08:01:51 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The exploitation of today's high-performance computer
                  systems requires the effective use of parallelism in
                  many forms and at numerous levels. This survey article
                  discusses program analysis and restructuring techniques
                  that target parallel architectures. We first describe
                  various categories of architectures that are oriented
                  toward parallel computation models: vector
                  architectures, shared memory multiprocessors, massively
                  parallel machines, message-passing architectures,
                  VLIWs, and multithreaded architectures. We then
                  describe a variety of optimization techniques that can
                  be applied to sequential programs to effectively
                  utilize the vector and parallel processing units. After
                  an overview of basic dependence analysis, we present
                  restructuring transformations on DO loops targeted both
                  to vectorization and to concurrent execution,
                  interprocedural and pointer analysis, task scheduling,
                  instruction level parallelization, and
                  compiler-assisted data placement. We conclude that
                  although tremendous advances have been made in
                  dependence theory and in the development of a `toolkit'
                  of transformations, parallel systems are used most
                  effectively when the programmer interacts in the
                  optimization process.",
  acknowledgement = ack-nhfb,
  keywords =     "Compilers (Computer programs); Computer architecture;
                  Parallel processing (Electronic computers)",
}

%% LNCS volume 634 paper; end page unknown ("423--??"), per this file's
%% convention for open ranges.
@Article{Govindarajan:1992:LCM,
  author =       "R. Govindarajan and S. S. Nemawarkar",
  title =        "A Large Context Multithreaded Architecture",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "634",
  pages =        "423--??",
  year =         "1992",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon May 13 11:46:24 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

%% Colorado State technical report on the VISA runtime system.  The
%% second author's surname is commonly spelled B{\"o}hm; it is left as
%% published here ("Bohm") -- TODO confirm against the original report.
@TechReport{Haines:1992:SMC,
  author =       "Matt Haines and Anton Pedro Willem Bohm",
  title =        "Software multithreading in a conventional distributed
                  memory multiprocessor",
  type =         "Technical report",
  number =       "CS-92-126",
  institution =  "Colorado State University, Dept. of Computer Science",
  address =      "Fort Collins, CO, USA",
  pages =        "25",
  day =          "25",
  month =        sep,
  year =         "1992",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Today's most powerful computers are distributed memory
                  multiprocessors. Although they possess massive amounts
                  of available resources, it is often difficult to
                  exploit these resources efficiently. Compilers that can
                  cope with the complexities of these systems are being
                  constructed, but their scope of effect is often limited
                  due to the complexity of the analysis and the lack of
                  runtime information. Novel architectures that can
                  better tolerate latencies are under construction, but
                  their effectiveness is unproven, and they do little to
                  ease the burden on current commercial machines.
                  Therefore we are designing a runtime system, called
                  VISA, that attempts to avoid and tolerate latencies on
                  conventional distributed memory multiprocessors, as
                  well as provide a single addressing space to ease the
                  burden of programming or code generation. The goal of
                  our runtime system is to serve as a tool for studying
                  the effects of latency avoidance and latency tolerance
                  on programs running on these conventional
                  architectures. In this paper we describe the design and
                  implementation of multithreading in the VISA runtime
                  system for the purpose of latency tolerance. In
                  particular, we examine machine-independent designs for
                  thread representation, thread switching, and
                  split-phased transactions. We quantify the cost of
                  multithreading for our environment, present a test
                  program for which multithreading degrades performance,
                  and present a program for which multithreading enhances
                  performance.",
  acknowledgement = ack-nhfb,
  annote =       "Supported in part by a grant from Sandia National
                  Laboratories",
  keywords =     "Multiprocessors",
}

%% C Users Journal article; end page unknown ("73--??"), per this file's
%% convention for open ranges.
@Article{Halladay:1992:PUM,
  author =       "Steve Halladay and Michael Wiebel",
  title =        "A Practical Use For Multiple Threads",
  journal =      j-CUJ,
  volume =       "10",
  number =       "1",
  pages =        "73--??",
  month =        jan,
  year =         "1992",
  ISSN =         "0898-9788",
  bibdate =      "Fri Aug 30 16:52:23 MDT 1996",
  bibsource =    "http://www.cuj.com/cbklist.htm;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "C Users Journal",
}

%% CAN 20(2) paper with full given names for the Matsushita group; the
%% related entry Hirata:1992:MPA lists a subset of these authors with
%% initials only.
@Article{Hirata:1992:EPA,
  author =       "Hiroaki Hirata and Kozo Kimura and Satoshi Nagamine
                  and Yoshiyuki Mochizuki and Akio Nishimura and
                  Yoshimori Nakase and Teiji Nishizawa",
  title =        "An elementary processor architecture with simultaneous
                  instruction issuing from multiple threads",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "136--145",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

%% NOTE(review): author "A. Nishmura" corrected to "A. Nishimura"; the
%% companion entry Hirata:1992:EPA (same Matsushita group, closely
%% related title) spells the same author "Akio Nishimura".  The keyword
%% "VLW machines" looks like it should be "VLIW machines", but it is
%% left verbatim pending a check against the INSPEC indexing source.
@Article{Hirata:1992:MPA,
  author =       "H. Hirata and Y. Mochizuki and A. Nishimura and Y.
                  Nakase and T. Nishizawa",
  title =        "A multithreaded processor architecture with
                  simultaneous instruction issuing",
  journal =      j-SUPERCOMPUTER,
  volume =       "9",
  number =       "3",
  pages =        "23--39",
  month =        may,
  year =         "1992",
  CODEN =        "SPCOEL",
  ISSN =         "0168-7875",
  bibdate =      "Wed Mar 18 08:37:01 MST 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Media Res. Lab., Matsushita Electr. Ind. Co., Osaka,
                  Japan",
  classification = "C5220P (Parallel architecture); C6110P (Parallel
                  programming); C6150J (Operating systems)",
  corpsource =   "Media Res. Lab., Matsushita Electr. Ind. Co., Osaka,
                  Japan",
  fjournal =     "Supercomputer",
  keywords =     "functional unit; independent instruction streams;
                  multiprogramming; multithreaded processor architecture;
                  parallel processing; scheduling; simultaneous
                  instruction issuing; vector machines; VLW machines",
  pubcountry =   "Netherlands",
  treatment =    "P Practical",
}

%% Proceedings paper; venue data is inherited via crossref ACM:1992:CPI,
%% which must be defined later in this file (not visible in this
%% section).
@InProceedings{Hironaka:1992:BVP,
  author =       "T. Hironaka and T. Hashimoto and K. Okazaki and K.
                  Murakami",
  title =        "Benchmarking a Vector-Processor Prototype Based on
                  Multithreaded Streaming\slash {FIFO} Vector ({MSFV})
                  Architecture",
  crossref =     "ACM:1992:CPI",
  pages =        "272--281",
  year =         "1992",
  bibdate =      "Mon Aug 26 10:38:41 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

%% Future Generation Computer Systems article on the register-cache
%% memory organization for a multithreaded hybrid dataflow/von Neumann
%% architecture.
@Article{Hum:1992:HSM,
  author =       "Herbert H. J. Hum and Guang R. Gao",
  title =        "A high-speed memory organization for hybrid
                  dataflow\slash {von Neumann} computing",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "8",
  number =       "4",
  pages =        "287--301",
  month =        sep,
  year =         "1992",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Fri Jul 15 09:06:02 MDT 2005",
  bibsource =    "ftp://ftp.ira.uka.de/bibliography/Os/threads.bib;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                  http://www.sciencedirect.com/science/journal/0167739X",
  abstract =     "The paper proposes a novel organization of high-speed
                  memories, known as the register-cache, for a
                  multi-threaded architecture. Viewed from the execution
                  unit, its contents are addressable as ordinary CPU
                  registers using relatively short addresses. From the
                  main memory perspective, it is content addressable. In
                  this register-cache organization, a number of registers
                  are grouped into a block of registers where a register
                  in a block is accessed using an offset from the address
                  of the block, an offset value which is embedded in the
                  compiler generated code. The binding of register block
                  locations to register-cache line addresses is
                  adaptively performed at runtime, thus resulting in a
                  dynamically allocated register file. In this execution
                  model, a program is compiled into a number of
                  instruction threads called super-actors. A super-actor
                  becomes ready for execution only when its input data
                  are physically residing in the register-cache and space
                  is reserved in the register-cache to store its
                  result.",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
}

%%% PLDI '92 paper (SIGPLAN Notices 27(7)): STING, a customizable
%%% Scheme-based substrate for implementing concurrency paradigms.
@Article{Jagannathan:1992:CSC,
  author =       "Suresh Jagannathan and Jim Philbin",
  title =        "A customizable substrate for concurrent languages",
  journal =      j-SIGPLAN,
  volume =       "27",
  number =       "7",
  pages =        "55--67",
  month =        jul,
  year =         "1992",
  CODEN =        "SINODQ",
  ISBN =         "0-89791-475-9",
  ISBN-13 =      "978-0-89791-475-8",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  LCCN =         "QA76.7.S53 1992",
  bibdate =      "Sun Dec 14 09:16:22 MST 2003",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/proceedings/pldi/143095/index.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/pldi/143095/p55-jagannathan/",
  abstract =     "We describe an approach to implementing a wide-range
                 of concurrency paradigms in high-level (symbolic)
                 programming languages. The focus of our discussion is
                 STING, a dialect of Scheme, that supports lightweight
                 threads of control and virtual processors as
                 first-class objects. Given the significant degree to
                 which the behavior of these objects may be customized,
                 we can easily express a variety of concurrency
                 paradigms and linguistic structures within a common
                 framework without loss of efficiency. Unlike parallel
                 systems that rely on operating system services for
                 managing concurrency, STING implements concurrency
                 management entirely in terms of Scheme objects and
                 procedures. It, therefore, permits users to optimize
                 the runtime behavior of their applications without
                 requiring knowledge of the underlying runtime system.
                 This paper concentrates on (a) the implications of the
                 design for building asynchronous concurrency
                 structures, (b) organizing large-scale concurrent
                 computations, and (c) implementing robust programming
                 environments for symbolic computing.",
  acknowledgement = ack-nhfb,
  affiliation =  "NEC Research Inst",
  affiliationaddress = "Princeton, NJ, USA",
  annote =       "Published as part of the Proceedings of PLDI'92.",
  classification = "723.1",
  conference =   "Proceedings of the ACM SIGPLAN '92 Conference on
                 Programming Language Design and Implementation",
  conferenceyear = "1992",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  journalabr =   "SIGPLAN Not",
  keywords =     "algorithms; Computer programming languages;
                 Concurrency paradigms; Concurrency structures; design;
                 languages; Parallel processing systems; performance;
                 Robust programming; Symbolic programming languages",
  meetingaddress = "San Francisco, CA, USA",
  meetingdate =  "Jun 17--19 1992",
  meetingdate2 = "06/17--19/92",
  sponsor =      "ACM",
  subject =      "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language
                 Classifications, Concurrent, distributed, and parallel
                 languages. {\bf D.3.2} Software, PROGRAMMING LANGUAGES,
                 Language Classifications, SCHEME. {\bf D.1.3} Software,
                 PROGRAMMING TECHNIQUES, Concurrent Programming,
                 Parallel programming.",
}

%%% TOPLAS 14(2): cache behavior of TIGRE combinator graph reduction.
%%% NOTE(review): the note field below cites this entry's OWN key
%%% (Koopman:1992:CBC), a circular cross-reference; it presumably was
%%% meant to point to a companion entry (e.g., an earlier conference or
%%% report version of the TIGRE work) --- confirm the intended key.
@Article{Koopman:1992:CBC,
  author =       "Philip J. {Koopman, Jr.} and Peter Lee and Daniel P.
                 Siewiorek",
  title =        "Cache Behavior of Combinator Graph Reduction",
  journal =      j-TOPLAS,
  volume =       "14",
  number =       "2",
  pages =        "265--297",
  month =        apr,
  year =         "1992",
  CODEN =        "ATPSDT",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Sat Jan 06 14:28:31 1996",
  bibsource =    "Compiler/Compiler.Lins.bib;
                 Compiler/garbage.collection.bib; Compiler/Heaps.bib;
                 Compiler/TOPLAS.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Theory/CLiCS.bib",
  note =         "Also see~\cite{Koopman:1992:CBC}.",
  URL =          "http://www.acm.org/pubs/toc/Abstracts/0164-0925/128867.html",
  abstract =     "The results of cache-simulation experiments with an
                 abstract machine for reducing combinator graphs are
                 presented. The abstract machine, called TIGRE, exhibits
                 reduction rates that, for similar kinds of combinator
                 graphs on similar kinds of hardware, compare favorably
                 with previously reported techniques. Furthermore, TIGRE
                 maps easily and efficiently onto standard computer
                 architectures, particularly those that allow a
                 restricted form of self-modifying code. This provides
                 some indication that the conventional ``stored
                 program'' organization of computer systems is not
                 necessarily an inappropriate one for functional
                 programming language implementations.\par

                 This is not to say, however, that present day computer
                 systems are well equipped to reduce combinator graphs.
                 In particular, the behavior of the cache memory has a
                 significant effect on performance. In order to study
                 and quantify this effect, trace-driven cache
                 simulations of a TIGRE graph reducer running on a
                 reduced instruction-set computer are conducted. The
                 results of these simulations are presented with the
                 following hardware-cache parameters varied: cache size,
                 block size, associativity, memory update policy, and
                 write-allocation policy. To begin with, the cache
                 organization of a commercially available system is used
                 and then the performance sensitivity with respect to
                 variations of each parameter are measured. From the
                 results of the simulation study, a conclusion is made
                 that combinator-graph reduction using TIGRE runs most
                 efficiently when using a cache memory with an
                 allocate-on-write-miss strategy, moderately large block
                 size (preferably with subblock placement), and
                 copy-back memory updates.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
  keywords =     "algorithms; languages; performance; theory;
                 threading",
  sjb =          "In amongst all the cache stuff is a description of how
                 subroutine threading can form the basis for a
                 relatively efficient method of performing combinator
                 graph reduction.",
  subject =      "{\bf B.3.2}: Hardware, MEMORY STRUCTURES, Design
                 Styles, Cache memories. {\bf B.3.3}: Hardware, MEMORY
                 STRUCTURES, Performance Analysis and Design Aids,
                 Simulation. {\bf D.1.1}: Software, PROGRAMMING
                 TECHNIQUES, Applicative (Functional) Programming. {\bf
                 D.3.2}: Software, PROGRAMMING LANGUAGES, Language
                 Classifications, Applicative languages. {\bf D.3.4}:
                 Software, PROGRAMMING LANGUAGES, Processors, Compilers.
                 {\bf D.3.4}: Software, PROGRAMMING LANGUAGES,
                 Processors, Interpreters. {\bf G.2.1}: Mathematics of
                 Computing, DISCRETE MATHEMATICS, Combinatorics.",
}

%%% LNCS 591: multithreaded computation of modular polynomial GCDs.
%%% Author is Wolfgang K{\"u}chlin; the umlaut is restored below using
%%% the brace-wrapped BibTeX special-character form so that sorting and
%%% label generation treat {\"u} as a single letter (the file is
%%% ISO/ASCII, so a literal u-umlaut cannot be used). The citation key
%%% is left unchanged to avoid breaking existing \cite commands.
@Article{Kuchlin:1992:MTC,
  author =       "W. K{\"u}chlin",
  title =        "On the Multi-Threaded Computation of Modular
                 Polynomial Greatest Common Divisors",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "591",
  pages =        "369--??",
  year =         "1992",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon May 13 11:46:24 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

%%% SIGMICRO Newsletter 23(1--2): exploiting instruction-level
%%% parallelism via the multithreaded approach.
@Article{Lenir:1992:EIL,
  author =       "Philip Lenir and R. Govindarajan and S. S.
                 Nemawarkar",
  title =        "Exploiting instruction-level parallelism: the
                 multithreaded approach",
  journal =      j-SIGMICRO,
  volume =       "23",
  number =       "1--2",
  pages =        "189--192",
  month =        dec,
  year =         "1992",
  DOI =          "https://doi.org/10.1145/144965.145798",
  bibdate =      "Fri Apr 16 10:27:43 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigmicro.bib",
  URL =          "https://dl.acm.org/doi/10.1145/144965.145798",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMICRO Newsletter",
  journal-URL =  "https://dl.acm.org/loi/sigmicro",
}

%%% LNCS 637: incremental multi-threaded garbage collection on
%%% virtually shared memory architectures.
@Article{LeSergent:1992:IMT,
  author =       "T. {Le Sergent} and B. Berthomieu",
  title =        "Incremental Multi-Threaded Garbage Collection on
                 Virtually Shared Memory Architectures",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "637",
  pages =        "179--??",
  year =         "1992",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon May 13 11:46:24 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

%%% SIGARCH CAN 20(2) / ISCA '92: the *T (``start'') multithreaded
%%% massively parallel architecture by Nikhil, Papadopoulos, and
%%% Arvind. The machine name is ``*T''; the braced title below restores
%%% the leading asterisk, which had been dropped (title read bare
%%% ``{T}:''), and protects the name from style-driven downcasing.
@Article{Nikhil:1992:MMP,
  author =       "R. S. Nikhil and G. M. Papadopoulos and Arvind",
  title =        "{*T}: a multithreaded massively parallel architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "156--167",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

%%% PLDI '92 paper (SIGPLAN Notices 27(7)): HoME, a parallel Smalltalk
%%% mapping Smalltalk processes onto Mach threads.
@Article{Ogata:1992:DIH,
  author =       "Kazuhiro Ogata and Satoshi Kurihara and Mikio Inari
                 and Norihisa Doi",
  title =        "The design and implementation of {HoME}",
  journal =      j-SIGPLAN,
  volume =       "27",
  number =       "7",
  pages =        "44--54",
  month =        jul,
  year =         "1992",
  CODEN =        "SINODQ",
  ISBN =         "0-89791-475-9",
  ISBN-13 =      "978-0-89791-475-8",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  LCCN =         "QA76.7.S53 1992",
  bibdate =      "Sun Dec 14 09:16:22 MST 2003",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/proceedings/pldi/143095/index.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/pldi/143095/p44-ogata/",
  abstract =     "HoME is a version of Smalltalk which can be
                 efficiently executed on a multiprocessor and can be
                 executed in parallel by combining a Smalltalk process
                 with a Mach thread and executing the process on the
                 thread. HoME is nearly the same as ordinary Smalltalk
                 except that multiple processes may execute in parallel.
                 Thus, almost all applications running on ordinary
                 Smalltalk can be executed on HoME without changes in
                 their code. HoME was designed and implemented based on
                 the following fundamental policies: (1) theoretically,
                 an infinite number of processes can become active; (2)
                 the moment a process is scheduled, it becomes active;
                 (3) no process switching occurs; (4) HoME is equivalent
                 to ordinary Smalltalk except for the previous three
                 policies. The performance of the current implementation
                 of HoME running on OMRON LUNA-88K, which had four
                 processors, was measured by benchmarks which execute in
                 parallel with multiple processes. In all benchmarks,
                 the results showed that HoME's performance is much
                 better than HPS on the same workstation.",
  acknowledgement = ack-nhfb,
  affiliation =  "Keio Univ",
  affiliationaddress = "Yokohama, Jpn",
  annote =       "Published as part of the Proceedings of PLDI'92.",
  classification = "723.1",
  conference =   "Proceedings of the ACM SIGPLAN '92 Conference on
                 Programming Language Design and Implementation",
  conferenceyear = "1992",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  journalabr =   "SIGPLAN Not",
  keywords =     "Computer programming; design; HPS on Mach environment;
                 languages; measurement; Object oriented programming;
                 performance; Smalltalk",
  meetingaddress = "San Francisco, CA, USA",
  meetingdate =  "Jun 17--19 1992",
  meetingdate2 = "06/17--19/92",
  sponsor =      "ACM",
  subject =      "{\bf D.1.3} Software, PROGRAMMING TECHNIQUES,
                 Concurrent Programming. {\bf D.3.4} Software,
                 PROGRAMMING LANGUAGES, Processors. {\bf D.3.2}
                 Software, PROGRAMMING LANGUAGES, Language
                 Classifications, Smalltalk. {\bf D.2.8} Software,
                 SOFTWARE ENGINEERING, Metrics, Performance measures.",
}

%%% Conference paper (inherits venue fields via crossref
%%% IEEE:1992:PSM): overview of multithreaded computer systems.
@InProceedings{Papadopoulos:1992:MCS,
  author =       "G. M. Papadopoulos and A. P. W. Bohm and A. T. Dahbura
                 and R. R. Oldehoeft",
  title =        "Multithreaded computer systems",
  crossref =     "IEEE:1992:PSM",
  pages =        "772--775",
  year =         "1992",
  bibdate =      "Wed Apr 15 15:37:20 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture)",
  corpsource =   "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
  keywords =     "architectural principles; data matching; multithreaded
                 computer systems; parallel architectures; parallel
                 machines; split-phase memory accesses",
  sponsororg =   "IEEE; ACM",
  treatment =    "P Practical",
}

%%% USENIX paper (crossref USENIX:1992:SED): experiences from
%%% multithreading System V Release 4.
@InProceedings{Peacock:1992:EMS,
  author =       "J. Kent Peacock and Sunil Saxena and Dean Thomas and
                 Fred Yang and Wilfred Yu",
  title =        "Experiences from Multithreading System {V} Release 4",
  crossref =     "USENIX:1992:SED",
  pages =        "77--92",
  day =          "26--27",
  month =        mar,
  year =         "1992",
  bibdate =      "Fri Oct 18 07:24:24 MDT 1996",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Intel Multiprocessor Consortium",
}

%%% USENIX paper (crossref USENIX:1992:PSU): file-system
%%% multithreading in System V Release 4 MP.
@InProceedings{Peacock:1992:FSM,
  author =       "J. Kent Peacock",
  title =        "File System Multithreading in {System V Release} 4
                 {MP}",
  crossref =     "USENIX:1992:PSU",
  pages =        "19--30",
  month =        "Summer",
  year =         "1992",
  bibdate =      "Tue Feb 20 15:42:13 MST 1996",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Intel Multi-Processor Consortium",
}

%%% USENIX paper (crossref USENIX:1992:PSU): migrating a distributed
%%% application to a multithreaded environment.
@InProceedings{Pham:1992:MDA,
  author =       "Thuan Q. Pham and Pankaj K. Garg",
  title =        "On Migrating a Distributed Application to a
                 Multithreaded Environment",
  crossref =     "USENIX:1992:PSU",
  pages =        "45--54",
  month =        "Summer",
  year =         "1992",
  bibdate =      "Fri Oct 18 07:24:24 MDT 1996",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Hewlett--Packard Laboratories",
}

%%% SIGARCH CAN 20(2): thread-based programming for the EM-4 hybrid
%%% dataflow machine.
@Article{Sato:1992:TBP,
  author =       "Mitsuhisa Sato and Yuetsu Kodama and Shuichi Sakai and
                 Yoshinori Yamaguchi and Yasuhito Koumura",
  title =        "Thread-based programming for the {EM-4} hybrid
                 dataflow machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "146--155",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

%%% Operating Systems Review 26(1): multiprocessor real-time threads.
@Article{Schwan:1992:MRT,
  author =       "Karsten Schwan and Hongyi Zhou",
  title =        "Multiprocessor real-time threads",
  journal =      j-OPER-SYS-REV,
  volume =       "26",
  number =       "1",
  pages =        "54--65",
  month =        jan,
  year =         "1992",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:36 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

%%% Dr. Dobb's Journal 17(9): debugging real-time systems.
%%% NOTE(review): this entry is a field-for-field duplicate of
%%% Singh:1992:DRT below (only the citation key differs); one of the
%%% pair should be removed, or kept only as a key alias for backward
%%% compatibility --- confirm which key is cited elsewhere.
@Article{Singh:1992:DRS,
  author =       "Gurjot Singh and Moses Joseph and Dave Barnett",
  title =        "Debugging real-time systems",
  journal =      j-DDJ,
  volume =       "17",
  number =       "9",
  pages =        "70, 72, 74, 76--77, 116--117",
  month =        sep,
  year =         "1992",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Sep 10 10:06:23 MDT 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "Modular and incremental development and debugging lead
                 to reliable real-time systems that perform the
                 functions they're designed to. Our authors use this
                 approach when building a simulated data-acquisition
                 system.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lynx Real-Time Syst., Los Gatos, CA, USA",
  classification = "C6150G (Diagnostic, testing, debugging and
                 evaluating systems)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "Correctness; Debugging cycle; Ldb; POSIX; Real-time
                 systems; User-friendly multithreaded debugger;
                 Worst-case performance",
  thesaurus =    "C listings; Program debugging; Real-time systems",
}

%%% Dr. Dobb's Journal 17(9): debugging real-time systems.
%%% NOTE(review): this entry is a field-for-field duplicate of
%%% Singh:1992:DRS above (only the citation key differs); one of the
%%% pair should be removed, or kept only as a key alias for backward
%%% compatibility --- confirm which key is cited elsewhere.
@Article{Singh:1992:DRT,
  author =       "Gurjot Singh and Moses Joseph and Dave Barnett",
  title =        "Debugging real-time systems",
  journal =      j-DDJ,
  volume =       "17",
  number =       "9",
  pages =        "70, 72, 74, 76--77, 116--117",
  month =        sep,
  year =         "1992",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Sep 10 10:06:23 MDT 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "Modular and incremental development and debugging lead
                 to reliable real-time systems that perform the
                 functions they're designed to. Our authors use this
                 approach when building a simulated data-acquisition
                 system.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lynx Real-Time Syst., Los Gatos, CA, USA",
  classification = "C6150G (Diagnostic, testing, debugging and
                 evaluating systems)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "Correctness; Debugging cycle; Ldb; POSIX; Real-time
                 systems; User-friendly multithreaded debugger;
                 Worst-case performance",
  thesaurus =    "C listings; Program debugging; Real-time systems",
}

%%% The X Resource 1(1): the multi-threaded X server.
%%% NOTE(review): unlike sibling entries, no bibdate field is present
%%% --- confirm whether that omission is intentional.
@Article{Smith:1992:MTX,
  author =       "John Allen Smith",
  title =        "The Multi-Threaded {X} Server",
  journal =      j-X-RESOURCE,
  volume =       "1",
  number =       "1",
  pages =        "73--89",
  month =        jan,
  year =         "1992",
  CODEN =        "XRESEA",
  ISBN =         "0-937175-96-X",
  ISBN-13 =      "978-0-937175-96-5",
  ISSN =         "1058-5591",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The X Resource",
}

%%% Ph.D. thesis (University of Maryland at College Park): performance
%%% study of multithreaded database transitive-closure algorithms.
@PhdThesis{Young-Myers:1992:DTC,
  author =       "Helene Wen-Hsin Young-Myers",
  title =        "Database transitive closure: a performance study of
                 multithreaded algorithms",
  type =         "Thesis ({Ph.D.})",
  school =       "College of Business and Management, University of
                 Maryland at College Park",
  address =      "College Park, MD, USA",
  pages =        "ix + 198",
  year =         "1992",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

%%% LNCS 748: Sparcle, a multithreaded VLSI processor for parallel
%%% computing.
@Article{Agarwal:1993:SMV,
  author =       "Anant Agarwal and Jonathan Babb and David Chaiken and
                 Godfrey D'Souza and Kirk Johnson and David Kranz and
                 John Kubiatowicz and Beng-Hong Lim and Gino Maa and Ken
                 Mackenzie",
  title =        "Sparcle: a Multithreaded {VLSI} Processor for
                 Parallel Computing",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "748",
  pages =        "359--??",
  year =         "1993",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon May 13 11:49:00 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

%%% JPDC 19(3): the EM-4 under implicit parallelism (Data-Distributed
%%% Execution).
@Article{Bic:1993:EUI,
  author =       "Lubomir Bic and Mayez Al-Mouhamed",
  title =        "The {EM-4} under Implicit Parallelism",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "19",
  number =       "3",
  pages =        "255--261",
  month =        nov,
  year =         "1993",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1993.1109",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:18:53 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1109/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1109/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C6110P (Parallel
                 programming)",
  corpsource =   "Dept. of Inf. and Comput. Sci., California Univ.,
                 Irvine, CA, USA",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "analysis; benchmark programs; data distribution;
                 data-dependency; Data-Distributed Execution; DDE; EM-4;
                 implicit parallelism; interprocessor communication;
                 iteration-level parallelism; loops; multithreading;
                 parallel architectures; parallel programming;
                 parallelization",
  treatment =    "P Practical; T Theoretical or Mathematical",
}

%%% STOC '93 paper (crossref ACM:1993:PTF): space-efficient scheduling
%%% of multithreaded computations.
@InProceedings{Blumofe:1993:SES,
  author =       "Robert D. Blumofe and Charles E. Leiserson",
  title =        "Space-efficient scheduling of multithreaded
                 computations",
  crossref =     "ACM:1993:PTF",
  pages =        "362--371",
  year =         "1993",
  bibdate =      "Wed Feb 20 18:34:01 MST 2002",
  bibsource =    "http://www.acm.org/pubs/contents/proceedings/series/stoc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/articles/proceedings/stoc/167088/p362-blumofe/p362-blumofe.pdf;
                 http://www.acm.org/pubs/citations/proceedings/stoc/167088/p362-blumofe/",
  acknowledgement = ack-nhfb,
}

%%% Ph.D. thesis (UC Berkeley): evaluation of multithreading and
%%% caching in large shared-memory parallel computers.
@PhdThesis{Boothe:1993:EMC,
  author =       "Bob Boothe",
  title =        "Evaluation of multithreading and caching in large
                 shared memory parallel computers",
  type =         "Thesis ({Ph.D.})",
  school =       "University of California, Berkeley, Computer Science
                 Division",
  address =      "Berkeley, CA, USA",
  pages =        "ix + 169",
  month =        jul,
  year =         "1993",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Also available as Report UCB/CSD 93/766.",
  acknowledgement = ack-nhfb,
  annote =       "Supported in part by the Air Force Office of
                 Scientific Research (AFOSR/JSEP), by the NSF, and by an
                 NSF Infrastructure Grant.",
  keywords =     "Multiprocessors",
}

%%% M.S. thesis (USC): effects of memory consistency models on
%%% multithreaded multiprocessor performance.
@MastersThesis{Chong:1993:EMC,
  author =       "Yong-Kim Chong",
  title =        "Effects of memory consistency models on multithreaded
                 multiprocessor performance",
  type =         "Thesis ({M.S.})",
  school =       "University of Southern California",
  address =      "Los Angeles, CA, USA",
  pages =        "viii + 89",
  year =         "1993",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

%%% JPDC 18(3): TAM, the compiler-controlled Threaded Abstract Machine.
%%% The fourth author's surname carries the lowercase Dutch/German
%%% particle ``von'': BibTeX's name parser treats the lowercase word as
%%% the von-part automatically, so the previous forced-capital braced
%%% form ``{Von Eicken}'' (which would print ``Von'') is replaced with
%%% the plain particle form below.
@Article{Culler:1993:TCC,
  author =       "David E. Culler and Seth Copen Goldstein and Klaus
                 Erik Schauser and Thorsten von Eicken",
  title =        "{TAM} -- a Compiler Controlled {Threaded Abstract
                 Machine}",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "18",
  number =       "3",
  pages =        "347--370",
  month =        jul,
  year =         "1993",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1993.1070",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:18:52 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1070/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1070/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture)",
  corpsource =   "Div. of Comput. Sci., California Univ., Berkeley, CA,
                 USA",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "dataflow execution models; parallel architectures;
                 parallel programming; parallel threads; self-scheduled
                 machine language; Threaded Abstract Machine",
  treatment =    "P Practical",
}

%%% TOSEM 2(4): a visual (contour-model) execution model for Ada
%%% tasking.
@Article{Dillon:1993:VEM,
  author =       "Laura K. Dillon",
  title =        "A visual execution model for {Ada} tasking",
  journal =      j-TOSEM,
  volume =       "2",
  number =       "4",
  pages =        "311--345",
  month =        oct,
  year =         "1993",
  CODEN =        "ATSMER",
  ISSN =         "1049-331X (print), 1557-7392 (electronic)",
  ISSN-L =       "1049-331X",
  bibdate =      "Fri Apr 20 08:21:35 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/articles/journals/tosem/1993-2-4/p311-dillon/p311-dillon.pdf;
                 http://www.acm.org/pubs/citations/journals/tosem/1993-2-4/p311-dillon/",
  abstract =     "A visual execution model for Ada tasking can help
                 programmers attain a deeper understanding of the
                 tasking semantics. It can illustrate subtleties in
                 semantic definitions that are not apparent in natural
                 language design. We describe a contour model of Ada
                 tasking that depicts asynchronous tasks (threads of
                 control), relationships between the environments in
                 which tasks execute, and the manner in which tasks
                 interact. The use of this high-level execution model
                 makes it possible to see what happens during execution
                 of a program. The paper provides an introduction to the
                 contour model of Ada tasking and demonstrates its
                 use.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Software Engineering and
                 Methodology",
  generalterms = "Algorithms; Design; Languages",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J790",
  keywords =     "contour model; visual execution model",
  subject =      "Software --- Software Engineering --- Design Tools and
                 Techniques (D.2.2); Software --- Software Engineering
                 --- Programming Environments (D.2.6); Software ---
                 Programming Languages --- Formal Definitions and Theory
                 (D.3.1): {\bf Semantics}; Software --- Programming
                 Languages --- Language Classifications (D.3.2): {\bf
                 Ada}; Software --- Programming Languages --- Language
                 Constructs and Features (D.3.3): {\bf Concurrent
                 programming structures}; Software --- Programming
                 Techniques --- Concurrent Programming (D.1.3); Theory
                 of Computation --- Logics and Meanings of Programs ---
                 Semantics of Programming Languages (F.3.2): {\bf
                 Operational semantics}; Software --- Programming
                 Languages --- Processors (D.3.4): {\bf Interpreters}",
}

@InProceedings{Doligez:1993:CGG,
  author =       "Damien Doligez and Xavier Leroy",
  title =        "A concurrent, generational garbage collector for a
                 multithreaded implementation of {ML}",
  crossref =     "ACM:1993:CRT",
  pages =        "113--123",
  year =         "1993",
  bibdate =      "Mon May 3 12:45:53 MDT 1999",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/plan/158511/p113-doligez/",
  abstract =     "This paper presents the design and implementation of a
                 ``quasi real-time'' garbage collector for Concurrent
                 Caml Light, an implementation of ML with threads. This
                 two-generation system combines a fast, asynchronous
                 copying collector on the young generation with a
                 non-disruptive concurrent marking collector on the old
                 generation. This design crucially relies on the ML
                 compile-time distinction between mutable and immutable
                 objects.",
  acknowledgement = ack-nhfb,
  keywords =     "algorithms; design; experimentation; languages;
                 performance",
  subject =      "{\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language
                 Constructs and Features, Concurrent programming
                 structures. {\bf D.3.4} Software, PROGRAMMING
                 LANGUAGES, Processors, Compilers. {\bf D.3.2} Software,
                 PROGRAMMING LANGUAGES, Language Classifications, LML.",
}

@Article{Eager:1993:CER,
  author =       "Derek L. Eager and John Zahorjan",
  title =        "Chores: Enhanced Run-Time Support for Shared-Memory
                 Parallel Computing",
  journal =      j-TOCS,
  volume =       "11",
  number =       "1",
  pages =        "1--32",
  month =        feb,
  year =         "1993",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-1/p1-eager/",
  abstract =     "Parallel computing is increasingly important in the
                 solution of large-scale numerical problems. The
                 difficulty of efficiently hand-coding parallelism, and
                 the limitations of parallelizing compilers, have
                 nonetheless restricted its use by scientific
                 programmers. In this paper we propose a new paradigm,
                 {\em chores}, for the run-time support of parallel
                 computing on shared-memory multiprocessors. We consider
                 specifically uniform memory access shared-memory
                 environments, although the chore paradigm should also
                 be appropriate for use within the clusters of a
                 large-scale nonuniform memory access machine. We argue
                 that chore systems attain both the high efficiency of
                 compiler approaches for the common case of data
                 parallelism, and the flexibility and performance of
                 user-level thread approaches for functional
                 parallelism. These benefits are achieved within a
                 single, simple conceptual model that almost entirely
                 relieves the programmer and compiler from concerns of
                 granularity, scheduling, and enforcement of
                 synchronization constraints. Measurements of a
                 prototype implementation demonstrate that the chore
                 model can be supported more efficiently than can
                 traditional approaches to either data or functional
                 parallelism alone.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; measurement; performance",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management. {\bf D.4.9} Software, OPERATING SYSTEMS,
                 Systems Programs and Utilities. {\bf D.4.7} Software,
                 OPERATING SYSTEMS, Organization and Design, Distributed
                 systems. {\bf C.3} Computer Systems Organization,
                 SPECIAL-PURPOSE AND APPLICATION-BASED SYSTEMS. {\bf
                 C.4} Computer Systems Organization, PERFORMANCE OF
                 SYSTEMS.",
}

@MastersThesis{Estep:1993:LMM,
  author =       "James L. Estep",
  title =        "Lightweight multithreaded multimedia conference
                 server",
  type =         "Thesis ({M.S.})",
  school =       "West Virginia University",
  address =      "Morgantown, WV, USA",
  pages =        "vi + 57",
  year =         "1993",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Electronic data processing -- Distributed processing;
                 Multimedia systems",
}

@PhdThesis{Fan:1993:LMC,
  author =       "Xiaoming Fan",
  title =        "Latency-directed multithreaded computation and its
                 architectural support",
  type =         "Thesis ({Ph.D.})",
  school =       "Universit{\"a}t Hamburg",
  address =      "Aachen, Germany",
  pages =        "xi + 174 + 22 + 11",
  year =         "1993",
  ISBN =         "3-8265-0021-0",
  ISBN-13 =      "978-3-8265-0021-3",
  ISSN =         "0945-0807",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Summary in German.",
  series =       "Berichte aus der Informatik",
  acknowledgement = ack-nhfb,
  keywords =     "Computer architecture; Parallel processing (Electronic
                 computers)",
}

@Article{Gao:1993:DMA,
  author =       "Guang Gao and Jean-Luc Gaudiot and Lubomir Bic",
  title =        "Dataflow and Multithreaded Architectures: {Guest
                 Editors}' Introduction",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "18",
  number =       "3",
  pages =        "271--??",
  month =        jul,
  year =         "1993",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Sat Apr 12 16:10:59 MDT 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  xxnote =       "Issue missing from UofUtah Marriott Library??",
}

@Article{Gao:1993:EHD,
  author =       "G. R. Gao",
  title =        "An Efficient Hybrid Dataflow Architecture Model",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "19",
  number =       "4",
  pages =        "293--307",
  month =        dec,
  year =         "1993",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1993.1113",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:18:53 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1113/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1113/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C6110P (Parallel
                 programming)C6150N (Distributed systems); C6150C
                 (Compilers, interpreters and other processors)",
  corpsource =   "Adv. Comput. Archit. and Program Structures Group,
                 Montreal Univ., Que., Canada",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "architecture technique; compiling paradigm; concurrent
                 operation; conventional; data-driven instruction;
                 data-driven scheduling scheme; dataflow computers;
                 dataflow software pipelining; efficient hybrid dataflow
                 architecture model; execution; fast pipelined
                 instruction; fine-grain parallelism; hybrid; limited
                 balancing; loop parallelism; multiple instruction;
                 parallel architectures; parallel programming; pipeline;
                 processing; program compilers; scheduling; simple
                 greedy runtime; space efficiency; threads",
  treatment =    "P Practical",
}

@Book{Gao:1993:SID,
  author =       "Guang R. Gao and Jean-Luc Gaudiot and Lubomir Bic",
  title =        "Special issue on dataflow and multithreaded
                 architectures",
  publisher =    pub-AP,
  address =      pub-AP:adr,
  pages =        "271--389",
  year =         "1993",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "Journal of parallel and distributed computing; v. 18,
                 no. 3",
  acknowledgement = ack-nhfb,
}

@InProceedings{Giering:1993:IAF,
  author =       "E. W. Giering and F. Mueller and T. P. Baker",
  title =        "Implementing {Ada 9X} Features using {POSIX} Threads:
                 Design Issues",
  crossref =     "ACM:1993:TCS",
  pages =        "214--228",
  year =         "1993",
  bibdate =      "Sat Jul 05 17:12:34 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Gildea:1993:MTX,
  author =       "Stephen Gildea",
  title =        "Multi-Threaded {Xlib}",
  journal =      j-X-RESOURCE,
  volume =       "5",
  number =       "1",
  pages =        "159--166",
  month =        jan,
  year =         "1993",
  CODEN =        "XRESEA",
  ISBN =         "1-56592-020-1",
  ISBN-13 =      "978-1-56592-020-0",
  ISSN =         "1058-5591",
  bibdate =      "Tue Mar 23 12:38:27 1993",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The X Resource",
}

@Article{Hauser:1993:UTI,
  author =       "Carl Hauser and Christian Jacobi and Marvin Theimer
                 and Brent Welch and Mark Weiser",
  title =        "Using threads in interactive systems: a case study",
  journal =      j-OPER-SYS-REV,
  volume =       "27",
  number =       "5",
  pages =        "94--105",
  month =        dec,
  year =         "1993",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Hayden:1993:BIC,
  author =       "Charles Hayden",
  title =        "A brief introduction to {Concurrent Pascal}",
  journal =      j-SIGPLAN,
  volume =       "28",
  number =       "3",
  pages =        "353--354",
  month =        mar,
  year =         "1993",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:16:34 MST 2003",
  bibsource =    "http://portal.acm.org/; http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/plan/154766/p353-hayden/",
  abstract =     "Concurrent Pascal is designed for writing concurrent
                 programs such as operating systems and real-time
                 monitoring systems on shared-memory computers. A
                 separate language, Sequential Pascal, is used as the
                 language for applications programs run by operating
                 systems written in Concurrent Pascal. Both languages
                 are extensions of Wirth's Pascal, and share a common
                 threaded code interpreter. The article describes how
                 Concurrent Pascal differs from Wirth's Pascal.",
  acknowledgement = ack-nhfb,
  affiliation =  "AT and T Bell Labs., Middletown, NJ, USA",
  classification = "C6110P (Parallel programming); C6140D (High level
                 languages)",
  confdate =     "20-23 April 1993",
  conflocation = "Cambridge, MA, USA",
  confname =     "HOPL-II. The second ACM SIGPLAN conference on History
                 of programming languages, April 20--23, 1993,
                 Cambridge, MA",
  confsponsor =  "ACM",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "Concurrent Pascal; languages; Operating systems;
                 Real-time monitoring systems; Sequential Pascal;
                 Shared-memory computers; Threaded code interpreter",
  subject =      "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language
                 Classifications, Concurrent Pascal. {\bf D.3.2}
                 Software, PROGRAMMING LANGUAGES, Language
                 Classifications, Pascal. {\bf D.3.3} Software,
                 PROGRAMMING LANGUAGES, Language Constructs and
                 Features, Procedures, functions, and subroutines.",
  thesaurus =    "Parallel languages; Pascal",
}

@Article{Hidaka:1993:MTC,
  author =       "Yasuo Hidaka and Hanpei Koike and Hidehiko Tanaka",
  title =        "Multiple threads in cyclic register windows",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "131--142",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Hsieh:1993:CME,
  author =       "Wilson C. Hsieh and Paul Wang and William E. Weihl",
  title =        "Computation migration: enhancing locality for
                 distributed-memory parallel systems",
  journal =      j-SIGPLAN,
  volume =       "28",
  number =       "7",
  pages =        "239--248",
  month =        jul,
  year =         "1993",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:16:39 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Computation migration is a technique that is based on
                 compile-time program transformation, for accessing
                 remote data in a distributed-memory parallel system. In
                 contrast with RPC-style access, where the access is
                 performed remotely, and with data migration, where the
                 data is moved so that it is local, computation
                  migration moves part of the current thread to the
                 processor where the data resides. The access is
                 performed at the remote processor, and the migrated
                 thread portion continues to run on that same processor;
                 this makes subsequent accesses in the thread portion
                 local. The authors describe an implementation of
                  computation migration that consists of two parts: an
                  implementation that migrates single activation frames,
                 and a high-level language annotation that allows a
                 programmer to express when migration is desired. They
                 performed experiments using two applications; these
                 experiments demonstrate that computation migration is a
                 valuable alternative to RPC and data migration.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lab. of Comput. Sci., MIT, Cambridge, MA, USA",
  classification = "C6110P (Parallel programming); C6120 (File
                 organisation); C6150C (Compilers, interpreters and
                 other processors)",
  confdate =     "19-22 May 1993",
  conflocation = "San Diego, CA, USA",
  confsponsor =  "ACM",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "Compile-time program transformation; Computation
                 migration; Current thread; Distributed-memory parallel
                 system; High-level language annotation; Remote data;
                 Remote processor; Single activation frames",
  thesaurus =    "Distributed memory systems; Parallel programming;
                 Program compilers; Storage management",
}

@Article{Huelsbergen:1993:CCG,
  author =       "Lorenz Huelsbergen and James R. Larus",
  title =        "A concurrent copying garbage collector for languages
                 that distinguish (im)mutable data",
  journal =      j-SIGPLAN,
  volume =       "28",
  number =       "7",
  pages =        "73--82",
  month =        jul,
  year =         "1993",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:16:39 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Wisconsin-Madison Univ., WI,
                 USA",
  classification = "C6110P (Parallel programming); C6120 (File
                 organisation); C6150C (Compilers, interpreters and
                 other processors); C6150N (Distributed systems)",
  confdate =     "19-22 May 1993",
  conflocation = "San Diego, CA, USA",
  confsponsor =  "ACM",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "Concurrent collection; Concurrent compacting garbage
                 collector; Garbage-collection pauses; Immutable data;
                 Minimal mutator/collector synchronization; Multiple
                 mutator threads; Mutable data; Pure functional
                 languages; Shared-memory parallel computers; Standard
                 ML compiler",
  thesaurus =    "Parallel programming; Program compilers; Shared memory
                 systems; Storage allocation; Storage management",
}

@InProceedings{Klarlund:1993:GT,
  author =       "Nils Klarlund and Michael I. Schwartzbach",
  title =        "Graph types",
  crossref =     "ACM:1993:CRT",
  pages =        "196--205",
  year =         "1993",
  bibdate =      "Mon May 3 12:45:53 MDT 1999",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/plan/158511/p196-klarlund/",
  abstract =     "Recursive data structures are abstractions of simple
                 records and pointers. They impose a shape invariant,
                 which is verified at compile-time and exploited to
                 automatically generate code for building, copying,
                 comparing, and traversing values without loss of
                 efficiency. However, such values are always tree
                 shaped, which is a major obstacle to practical use. We
                 propose a notion of graph types, which allow common
                 shapes, such as doubly-linked lists or threaded trees,
                 to be expressed concisely and efficiently. We define
                 regular languages of routing expressions to specify
                 relative addresses of extra pointers in a canonical
                 spanning tree. An efficient algorithm for computing
                 such addresses is developed. We employ a second-order
                 monadic logic to decide well-formedness of graph type
                 specifications. This logic can also be used for
                 automated reasoning about pointer structures.",
  acknowledgement = ack-nhfb,
  keywords =     "algorithms; languages; theory",
  subject =      "{\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS
                 OF PROGRAMS, Studies of Program Constructs, Type
                 structure. {\bf D.3.3} Software, PROGRAMMING LANGUAGES,
                 Language Constructs and Features, Data types and
                 structures. {\bf F.2.2} Theory of Computation, ANALYSIS
                 OF ALGORITHMS AND PROBLEM COMPLEXITY, Nonnumerical
                 Algorithms and Problems, Computations on discrete
                 structures. {\bf G.2.2} Mathematics of Computing,
                 DISCRETE MATHEMATICS, Graph Theory, Trees.",
}

@InProceedings{Koontz:1993:PBM,
  author =       "K. W. Koontz",
  title =        "Port buffers: a {Mach IPC} optimization for handling
                 large volumes of small messages",
  crossref =     "USENIX:1993:PUMb",
  pages =        "89--102",
  year =         "1993",
  bibdate =      "Sat Sep 28 18:52:45 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/mach.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Appl. Phys. Lab., Johns Hopkins Univ., Laurel, MD,
                 USA",
  classification = "C6150N (Distributed systems)",
  keywords =     "Communications mechanism; Context switches;
                 Distributed systems; Ethernet; High-speed networks;
                 Kernel calls; Local transfer rates; Mach IPC
                 optimization; Mach kernel; Multi-threaded support;
                 Network utilization; Nonshared memory parallel
                 architectures; Port buffers; Staleness feature",
  thesaurus =    "Buffer storage; Electronic messaging; Network
                 operating systems; Optimisation; Remote procedure
                 calls",
}

@Article{Lee:1993:TW,
  author =       "David Lee",
  title =        "Threads for {Windows} 3",
  journal =      j-DDJ,
  volume =       "18",
  number =       "10",
  pages =        "84--??",
  month =        "Fall",
  year =         "1993",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Sep 03 09:15:44 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  note =         "Special Issue: Windows Sourcebook.",
  abstract =     "Unlike NT, Windows 3 doesn't provide direct support
                 for threads. With the techniques David illustrates
                 here, you can implement non-preemptive threads in
                 Windows 3.",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Lim:1993:WAS,
  author =       "Beng-Hong Lim and Anant Agarwal",
  title =        "Waiting Algorithms for Synchronization in Large-Scale
                 Multiprocessors",
  journal =      j-TOCS,
  volume =       "11",
  number =       "3",
  pages =        "253--294",
  month =        aug,
  year =         "1993",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-3/p253-lim/",
  abstract =     "Through analysis and experiments, this paper
                 investigates two-phase waiting algorithms to minimize
                 the cost of waiting for synchronization in large-scale
                 multiprocessors. In a two-phase algorithm, a thread
                 first waits by polling a synchronization variable. If
                 the cost of polling reaches a limit {\em Lpoll\/} and
                 further waiting is necessary, the thread is blocked,
                 incurring an additional fixed cost, {\em B}. The choice
                 of {\em Lpoll\/} is a critical determinant of the
                 performance of two-phase algorithms. We focus on
                 methods for statically determining {\em Lpoll\/}
                 because the run-time overhead of dynamically
                 determining {\em Lpoll\/} can be comparable to the cost
                 of blocking in large-scale multiprocessor systems with
                 lightweight threads. Our experiments show that {\em
                 always-block\/} ({\em Lpoll\/} = 0) is a good waiting
                 algorithm with performance that is usually close to the
                 best of the algorithms compared. We show that even
                 better performance can be achieved with a static choice
                 of {\em Lpoll\/} based on knowledge of likely wait-time
                 distributions. Motivated by the observation that
                 different synchronization types exhibit different
                 wait-time distributions, we prove that a static choice
                 of {\em Lpoll\/} can yield close to optimal on-line
                 performance against an adversary that is restricted to
                 choosing wait times from a fixed family of probability
                 distributions. This result allows us to make an optimal
                 static choice of {\em Lpoll\/} based on synchronization
                 type. For exponentially distributed wait times, we
                  prove that setting {\em Lpoll\/} = ln(e-1){\em B\/}
                 results in a waiting cost that is no more than {\em
                 e/(e-1)\/} times the cost of an optimal off-line
                 algorithm. For uniformly distributed wait times, we
                 prove that setting {\em L\/}poll=1/2(square root of 5
                 -1){\em B\/} results in a waiting cost that is no more
                 than (square root of 5 + 1)/2 (the golden ratio) times
                 the cost of an optimal off-line algorithm. Experimental
                 measurements of several parallel applications on the
                 Alewife multiprocessor simulator corroborate our
                 theoretical findings.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; experimentation; performance; theory",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Synchronization. {\bf D.4.1} Software,
                 OPERATING SYSTEMS, Process Management, Mutual
                 exclusion. {\bf C.4} Computer Systems Organization,
                 PERFORMANCE OF SYSTEMS. {\bf C.1.2} Computer Systems
                 Organization, PROCESSOR ARCHITECTURES, Multiple Data
                 Stream Architectures (Multiprocessors), Parallel
                 processors**. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance, Measurements. {\bf D.4.8} Software,
                 OPERATING SYSTEMS, Performance, Stochastic analysis.",
}

@Article{McCann:1993:DPA,
  author =       "Cathy McCann and Raj Vaswani and John Zahorjan",
  title =        "A Dynamic Processor Allocation Policy for
                 Multiprogrammed Shared-Memory Multiprocessors",
  journal =      j-TOCS,
  volume =       "11",
  number =       "2",
  pages =        "146--178",
  month =        may,
  year =         "1993",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-2/p146-mccann/",
  abstract =     "We propose and evaluate empirically the performance of
                 a dynamic processor-scheduling policy for
                 multiprogrammed shared-memory multiprocessors. The
                 policy is dynamic in that it reallocates processors
                 from one parallel job to another based on the currently
                 realized parallelism of those jobs. The policy is
                 suitable for implementation in production systems in
                 that: ---It interacts well with very efficient
                 user-level thread packages, leaving to them many
                 low-level thread operations that do not require kernel
                 intervention. ---It deals with thread blocking due to
                 user I/O and page faults. ---It ensures fairness in
                 delivering resources to jobs. ---Its performance,
                 measured in terms of average job response time, is
                 superior to that of previously proposed schedulers,
                 including those implemented in existing systems. It
                 provides good performance to very short, sequential
                 (e.g., interactive) requests. We have evaluated our
                 scheduler and compared it to alternatives using a set
                 of prototype implementations running on a Sequent
                 Symmetry multiprocessor. Using a number of parallel
                 applications with distinct qualitative behaviors, we
                 have both evaluated the policies according to the major
                 criterion of overall performance and examined a number
                 of more general policy issues, including the advantage
                 of ``space sharing'' over ``time sharing'' the
                 processors of a multiprocessor, and the importance of
                 cooperation between the kernel and the application in
                 reallocating processors between jobs. We have also
                  compared the policies according to other criteria
                 important in real implementations, in particular,
                  fairness and response time to short, sequential
                 requests. We conclude that a combination of performance
                 and implementation considerations makes a compelling
                 case for our dynamic scheduling policy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; measurement; performance",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Scheduling. {\bf D.4.1} Software, OPERATING
                 SYSTEMS, Process Management,
                 Multiprocessing/multiprogramming/multitasking. {\bf
                 C.1.2} Computer Systems Organization, PROCESSOR
                 ARCHITECTURES, Multiple Data Stream Architectures
                 (Multiprocessors).",
}

@Article{Morrisett:1993:PLP,
  author =       "J. Gregory Morrisett and Andrew P. Tolmach",
  title =        "Procs and locks: a portable multiprocessing platform
                 for {Standard ML} of {New Jersey}",
  journal =      j-SIGPLAN,
  volume =       "28",
  number =       "7",
  pages =        "198--207",
  month =        jul,
  year =         "1993",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:16:39 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "A portable platform has been built for running
                 Standard ML of New Jersey programs on multiprocessors.
                 It can be used to implement user-level thread packages
                 for multiprocessors within the ML language with
                 first-class continuations. The platform supports
                 experimentation with different thread scheduling
                 policies and synchronization constructs. It has been
                 used to construct a Modula-3 style thread package and a
                 version of Concurrent ML, and has been ported to three
                 different multiprocessors running variants of Unix. The
                 authors describe the platform's design, implementation,
                 and performance.",
  acknowledgement = ack-nhfb,
  affiliation =  "Carnegie Mellon Univ., Pittsburgh, PA, USA",
  classification = "C6110P (Parallel programming); C6140D (High level
                 languages); C6150C (Compilers, interpreters and other
                 processors)",
  confdate =     "19-22 May 1993",
  conflocation = "San Diego, CA, USA",
  confsponsor =  "ACM",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "Concurrent ML; First-class continuations; Functional
                 language; Modula-3 style thread package; New Jersey
                 programs; Portable multiprocessing platform; Portable
                 platform; Standard ML; Synchronization constructs;
                 Thread scheduling policies; User-level thread
                 packages",
  thesaurus =    "Multiprocessing systems; Parallel languages; Parallel
                 programming; Scheduling",
}

@Article{Najjar:1993:QAD,
  author =       "Walid A. Najjar and A. P. Wim Bohm and W. Marcus
                 Miller",
  title =        "A Quantitative Analysis of Dataflow Program Execution
                 --- Preliminaries to a Hybrid Design",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "18",
  number =       "3",
  pages =        "314--326",
  month =        jul,
  year =         "1993",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1993.1067",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:18:52 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1067/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1067/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C6110B (Software engineering techniques); C6110P
                 (Parallel programming)",
  corpsource =   "Dept. of Comput. Sci., Colorado State Univ., Fort
                 Collins, CO, USA",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "benchmarks; dataflow program execution; dynamic
                 measure; fine grain intrathread locality; instruction
                 level locality; parallel programming; software
                 metrics",
  treatment =    "T Theoretical or Mathematical",
}

@Article{Natarajan:1993:PVM,
  author =       "Venkat Natarajan and Derek Chiou and Boon Seong Ang",
  title =        "Performance visualization on {Monsoon}",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "18",
  number =       "2",
  pages =        "169--180",
  month =        jun,
  year =         "1993",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1993.1054",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:18:52 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1054/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1054/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessor systems and techniques); C5470
                 (Performance evaluation and testing); C7430 (Computer
                 engineering)",
  corpsource =   "Motorola Cambridge Res. Center, MA, USA",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "algorithm; application program; compiler; computer
                 evaluation; data analysis; data collection; data
                 visualisation; MIT; Monsoon; Motorola; multiprocessor
                 machine; multithreaded; operating system; parallel
                 machine; parallel machines; performance evaluation;
                 performance evaluation tool; programming language;
                 visualization",
  treatment =    "P Practical",
}

@InProceedings{Odersky:1993:CNA,
  author =       "Martin Odersky and Dan Rabin and Paul Hudak",
  title =        "Call by name, assignment, and the lambda calculus",
  crossref =     "ACM:1993:CRT",
  pages =        "43--56",
  year =         "1993",
  bibdate =      "Mon May 3 12:45:53 MDT 1999",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/plan/158511/p43-odersky/",
  abstract =     "We define an extension of the call-by-name lambda
                 calculus with additional constructs and reduction rules
                 that represent mutable variables and assignments. The
                 extended calculus has neither a concept of an explicit
                 store nor a concept of evaluation order; nevertheless,
                 we show that programs in the calculus can be
                 implemented using a single-threaded store. We also show
                 that the new calculus has the Church--Rosser property
                 and that it is a conservative extension of classical
                 lambda calculus with respect to operational
                 equivalence; that is, all algebraic laws of the
                 functional subset are preserved.",
  acknowledgement = ack-nhfb,
  keywords =     "languages; theory",
  subject =      "{\bf F.4.1} Theory of Computation, MATHEMATICAL LOGIC
                 AND FORMAL LANGUAGES, Mathematical Logic, Lambda
                 calculus and related systems. {\bf F.3.3} Theory of
                 Computation, LOGICS AND MEANINGS OF PROGRAMS, Studies
                 of Program Constructs, Type structure.",
}

@Article{Plauger:1993:MCS,
  author =       "Dave Plauger",
  title =        "Making {C++} Safe for Threads",
  journal =      j-CUJ,
  volume =       "11",
  number =       "2",
  pages =        "58--??",
  month =        feb,
  year =         "1993",
  ISSN =         "0898-9788",
  bibdate =      "Fri Aug 30 16:52:23 MDT 1996",
  bibsource =    "http://www.cuj.com/cbklist.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "C Users Journal",
}

@InProceedings{Raghunath:1993:DIN,
  author =       "M. T. Raghunath and Abhiram Ranade",
  title =        "Designing Interconnection Networks for Multi-Level
                 Packaging",
  crossref =     "IEEE:1993:PSP",
  pages =        "772--781",
  year =         "1993",
  bibdate =      "Wed Apr 15 12:04:03 MDT 1998",
  bibsource =    "Compendex database;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Univ of California",
  affiliationaddress = "Berkeley, CA, USA",
  classification = "723; C5220P (Parallel architecture); C5440
                 (Multiprocessing systems)",
  corpsource =   "Comput. Sci. Div., California Univ., Berkeley, CA,
                 USA",
  keywords =     "communication bandwidth; complete graphs; Computer
                 networks; generic set; global communication
                 performance; high bandwidth channels; high degree
                 deBruijn graphs; Interconnection network design;
                 interconnection networks design; Large scale parallel
                 machines; large scale parallel machines; latencies;
                 Multilevel packaging; multilevel packaging;
                 multiprocessor interconnection networks;
                 multithreading; network organizations; network
                 topology; packaging; packaging constraints; packaging
                 hierarchy; packaging restrictions; packaging
                 technology; Parallel processing systems; Random traffic
                 model; random traffic model",
  sponsororg =   "IEEE; ACM SIGARCH",
  treatment =    "P Practical",
}

@MastersThesis{Rajagopal:1993:DMI,
  author =       "Arjun Rajagopal",
  title =        "Design of a multithreaded instruction cache for a
                 hyperscalar processor",
  type =         "Thesis ({M.S.})",
  school =       "Department of Electrical Engineering, Texas A\&M
                 University",
  address =      "College Station, TX, USA",
  pages =        "ix + 84",
  year =         "1993",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Major electrical engineering",
}

@InProceedings{Saxena:1993:PMS,
  author =       "Sunil Saxena and J. Kent Peacock and Fred Yang and
                 Vijaya Verma and Mohan Krishnan",
  title =        "Pitfalls in Multithreading {SVR4 STREAMS} and Other
                 Weightless Processes",
  crossref =     "USENIX:1993:PWU",
  pages =        "85--96",
  month =        "Winter",
  year =         "1993",
  bibdate =      "Tue Oct 22 08:14:49 2002",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.usenix.org/publications/library/proceedings/sd93/",
  acknowledgement = ack-nhfb,
  affiliation =  "Intel Multiprocessor Consortium",
}

@InProceedings{Schmidtmann:1993:DIM,
  author =       "Carl Schmidtmann and Michael Tao and Steven Watt",
  title =        "Design and Implementation of a Multi-Threaded {Xlib}",
  crossref =     "USENIX:1993:PWU",
  pages =        "193--203",
  month =        "Winter",
  year =         "1993",
  bibdate =      "Tue Oct 22 08:16:35 2002",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.usenix.org/publications/library/proceedings/sd93/",
  acknowledgement = ack-nhfb,
  affiliation =  "Consultant to Digital Equipment Corporation; Sun
                 Microsystems; Consultant to Xerox Corporation",
}

@MastersThesis{Srinivasan:1993:SDS,
  author =       "Sumathi Srinivasan",
  title =        "System design and simulation for the {Demus-2}
                 multithreaded processor",
  type =         "Thesis ({M. Eng.})",
  school =       "Department of Electrical and Computer Engineering,
                 McMaster University",
  address =      "Hamilton, ON, Canada",
  pages =        "x + 109",
  year =         "1993",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Computer architecture; Computers, Pipeline; McMaster
                 University. -- Dissertations; Parallel processing
                 (Electronic computers)",
}

@Article{Volkman:1993:CCP,
  author =       "Victor R. Volkman",
  title =        "Convert {C} Programs into Multithreaded Applications",
  journal =      j-CUJ,
  volume =       "11",
  type =         "User Report",
  number =       "4",
  pages =        "87--??",
  month =        apr,
  year =         "1993",
  ISSN =         "0898-9788",
  bibdate =      "Fri Aug 30 16:52:23 MDT 1996",
  bibsource =    "http://www.cuj.com/cbklist.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "C Users Journal",
}

@Article{Volkman:1993:CDB,
  author =       "Victor R. Volkman and John English",
  title =        "Class {{\tt DOSThread}}: a Base Class for
                 Multithreaded {DOS} Programs",
  journal =      j-CUJ,
  volume =       "11",
  type =         "CUG library disk documentation",
  number =       "12",
  pages =        "113--??",
  month =        dec,
  year =         "1993",
  ISSN =         "0898-9788",
  bibdate =      "Fri Aug 30 16:52:23 MDT 1996",
  bibsource =    "http://www.cuj.com/cbklist.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "C Users Journal",
}

@Article{Waldspurger:1993:RRF,
  author =       "Carl A. Waldspurger and William E. Weihl",
  title =        "Register relocation: flexible contexts for
                 multithreading",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "120--130",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@TechReport{Young-Myers:1993:ESTa,
  author =       "Helene Young-Myers and Louiqa Raschid",
  title =        "An experimental study of three dataflow paradigms in
                 multithreaded database transitive closure algorithms on
                 shared memory multiprocessors",
  type =         "Technical report",
  number =       "CS-TR-3060; UMIACS-TR-93-33",
  institution =  inst-U-MARYLAND,
  address =      inst-U-MARYLAND:adr,
  pages =        "21",
  month =        apr,
  year =         "1993",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "To appear in a special issue of the Journal of
                 Parallel and Distributed Computing on Dataflow and
                 Multithreaded Architectures, July, 1993.",
  abstract =     "This paper describes an experimental study of three
                 dataflow paradigms, namely, no dataflow, pipelined
                 dataflow, and network dataflow, in multithreaded
                 database transitive closure algorithms on shared memory
                 multiprocessors. This study shows that dataflow
                 paradigm directly influences performance parameters
                 such as the amount of interthread communication, how
                 data are partitioned among the threads, whether access
                 to each page of data is exclusive or shared, whether
                 locks are needed for concurrency control, and how
                 calculation termination is detected. The algorithm
                 designed with no dataflow outperforms the algorithms
                 with dataflow. Approximately linear speedup is achieved
                 by the no dataflow algorithm with sufficient workload
                 and primary memory. An exclusive access working set
                 model and a shared access working set model describe
                 the interactions between two or more threads' working
                 sets when access to each page of data is exclusive or
                 shared among the threads, respectively. These models
                 are experimentally verified.",
  acknowledgement = ack-nhfb,
  annote =       "Supported in part by the National Science
                 Foundation.",
  keywords =     "Data flow computing; Multiprocessors",
}

@Article{Young-Myers:1993:ESTb,
  author =       "Helene Young-Myers and Louiqa Raschid",
  title =        "An Experimental Study of Three Dataflow Paradigms in
                 Multithreaded Database Transitive Closure Algorithms on
                 Shared Memory Multiprocessors",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "18",
  number =       "3",
  pages =        "371--389",
  month =        jul,
  year =         "1993",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1993.1071",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:18:52 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1071/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1071/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5470 (Performance
                 evaluation and testing); C6160 (Database management
                 systems (DBMS))",
  corpsource =   "Maryland Univ., College Park, MD, USA",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "architectures; calculation termination; concurrency
                 control; database management systems; dataflow;
                 dataflow paradigms; exclusive access; interthread
                 communication; linear; network; no dataflow; parallel;
                 performance evaluation; performance parameters;
                 pipelined dataflow; shared access; shared memory
                 systems; speedup",
  treatment =    "P Practical",
}

@InProceedings{Alfieri:1994:EKI,
  author =       "R. A. Alfieri",
  title =        "An Efficient Kernel-Based Implementation of {POSIX}
                 Threads",
  crossref =     "Anonymous:1994:USC",
  pages =        "59--72",
  year =         "1994",
  bibdate =      "Sat May 25 07:59:58 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Anonymous:1994:DCT,
  author =       "Anonymous",
  title =        "On the Design of {Chant}: a Talking Threads
                 Package",
  crossref =     "IEEE:1994:PSW",
  pages =        "350--359",
  year =         "1994",
  bibdate =      "Mon Aug 26 10:38:41 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Anonymous:1994:MDP,
  author =       "Anonymous",
  title =        "{Multiprocessor desktops are proliferating, even
                 though there remains a shortage of multithreaded
                 applications for them}",
  journal =      j-OPEN-SYSTEMS-TODAY,
  volume =       "165",
  pages =        "60--??",
  month =        dec,
  year =         "1994",
  ISSN =         "1061-0839",
  bibdate =      "Fri Jan 26 17:24:01 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Open Systems Today",
}

@Article{Anonymous:1994:SIP,
  author =       "Anonymous",
  title =        "Special issue: panel sessions of the {1991 Workshop on
                 Multithreaded Computers, November 22, 1991,
                 Albuquerque, New Mexico, in conjunction with
                 Supercomputing '91}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "1",
  pages =        "2--33",
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Anonymous:1994:WMC,
  author =       "Anonymous",
  title =        "{Wanted: The Multithreaded CIO}",
  journal =      j-DATAMATION,
  volume =       "40",
  number =       "8",
  pages =        "34--??",
  day =          "15",
  month =        apr,
  year =         "1994",
  CODEN =        "DTMNAT",
  ISSN =         "0011-6963",
  bibdate =      "Sat Jan 27 07:35:21 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Technician or business manager? If you want to be a
                 CIO, you better be both. Add to that a host of
                 communications skills and an ability to travel in
                 diverse circles, and you're on your way to being the
                 Multithreaded CIO of the 1990s.",
  acknowledgement = ack-nhfb,
  fjournal =     "Datamation",
}

@InProceedings{Baker:1994:EPP,
  author =       "T. P. Baker and Frank Mueller and Viresh Rustagi",
  title =        "Experience with a Prototype of the {POSIX} ``Minimal
                 Realtime System Profile''",
  crossref =     "IEEE:1994:ROS",
  pages =        "12--17",
  year =         "1994",
  bibdate =      "Sat May 25 07:59:58 MDT 1996",
  bibsource =    "Compendex database;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper describes experience prototyping the
                 proposed IEEE standard `minimal realtime system
                 profile', whose primary component is support for
                 real-time threads. It provides some background,
                 describes the implementation, and reports preliminary
                 performance measurements.",
  acknowledgement = ack-nhfb,
  affiliation =  "Florida State Univ",
  affiliationaddress = "Tallahassee, FL, USA",
  classification = "722.4; 723.1; 723.1.1; 723.2",
  conference =   "Proceedings of the 11th IEEE Workshop on Real-Time
                 Operating Systems and Software",
  conferenceyear = "1994",
  journalabr =   "Proc IEEE Workshop Real Time Oper Syst Software",
  keywords =     "Computer operating systems; Computer software
                 portability; Data structures; High level languages;
                 Interfaces (computer); Mesa programming language;
                 Minimal real time system profile; Program processors;
                 Real time systems; Thread; Thread management; Thread
                 priority scheduling",
  meetingaddress = "Seattle, WA, USA",
  meetingdate =  "May 18--19 1994",
  meetingdate2 = "05/18--19/94",
  publisherinfo = "Computer Society Press",
  sponsor =      "IEEE Computer Society",
}

@Article{Baquero:1994:CAC,
  author =       "Carlos Baquero and Francisco Moura",
  title =        "Concurrency Annotations in {C++}",
  journal =      j-SIGPLAN,
  volume =       "29",
  number =       "7",
  pages =        "61--67",
  month =        jul,
  year =         "1994",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:16:53 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110J (Object-oriented programming); C6110P
                 (Parallel programming); C6140D (High level languages)",
  corpsource =   "DI/INESC, Minho Univ., Portugal",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "access flag; C language; C++; concurrency annotations;
                 inheritance; inheritance chain; language extension;
                 method code; method invocations; method predicates;
                 multiple threads; object-oriented languages; parallel
                 languages; shared-memory multiprocessor system;
                 synchronisation; synchronization code; synchronization
                 mechanisms",
  treatment =    "P Practical",
}

@InProceedings{Blumofe:1994:SMC,
  author =       "R. D. Blumofe and C. E. Leiserson",
  title =        "Scheduling multithreaded computations by work
                 stealing",
  crossref =     "Goldwasser:1994:PAS",
  pages =        "356--368",
  year =         "1994",
  bibdate =      "Thu Apr 5 06:13:51 MDT 2001",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Buendgen:1994:MAT,
  author =       "R. Buendgen and M. Goebel and W. Kuechlin",
  title =        "Multi-Threaded {AC} Term Rewriting",
  crossref =     "Hong:1994:FIS",
  pages =        "84--93",
  year =         "1994",
  bibdate =      "Thu Mar 12 11:28:58 MST 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/issac.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Buendgen:1994:MTA,
  author =       "R. Buendgen and M. Goebel and W. Kuechlin",
  title =        "Multi-Threaded {AC} Term Rewriting",
  crossref =     "Hong:1994:FIS",
  pages =        "84--93",
  year =         "1994",
  bibdate =      "Thu Mar 12 11:28:58 MST 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/issac.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  remark =       "Apparent duplicate of entry Buendgen:1994:MAT; both
                 keys retained because either may already be cited.",
}

@Article{Buhr:1994:TRM,
  author =       "R. J. A. Buhr and R. S. Casselman",
  title =        "Timethread-Role Maps for Object-Oriented Design of
                 Real-Time-and-Distributed Systems",
  journal =      j-SIGPLAN,
  volume =       "29",
  number =       "10",
  pages =        "301--301",
  month =        oct,
  year =         "1994",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Apr 24 18:36:02 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110J (Object-oriented programming); C6150N
                 (Distributed systems)",
  conflocation = "Portland, OR, USA; 23-27 Oct. 1994",
  conftitle =    "Ninth Annual Conference on Object-Oriented Programming
                 Systems, Languages, and Applications. OOPSLA '94",
  corpsource =   "Dept. of Syst. and Comput. Eng., Carleton Univ.,
                 Ottawa, Ont., Canada",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "concurrency; distributed processing; distributed
                 systems; dynamic structure; end-to-end responsibility
                 paths; object-oriented approach; object-oriented
                 design; object-oriented design methods; object-oriented
                 methods; object-oriented programming; real-time
                 systems; real-time systems oriented programming;
                 responsibility-driven design; timethread-role maps",
  sponsororg =   "ACM",
  treatment =    "P Practical",
}

@InProceedings{Bundgen:1994:FPC,
  author =       "Reinhard B{\"u}ndgen and Manfred G{\"o}bel and
                 Wolfgang K{\"u}chlin",
  title =        "A fine-grained parallel completion procedure",
  crossref =     "ACM:1994:IPI",
  pages =        "269--277",
  year =         "1994",
  bibdate =      "Thu Mar 12 08:41:19 MST 1998",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/issac/190347/p269-bundgen/",
  abstract =     "We present a parallel Knuth--Bendix completion
                 algorithm where the inner loop, deriving the
                 consequences of adding a new rule to the system, is
                 multithreaded. The selection of the best new rule in
                 the outer loop, and hence the completion strategy, is
                 exactly the same as for the sequential algorithm. Our
                 implementation, which is within the PARSAC-2 parallel
                 symbolic computation system, exhibits good parallel
                 speedups on a standard multiprocessor workstation.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wilhelm-Schickard-Inst. fur Inf., Tubingen Univ.,
                 Germany",
  classification = "C4210L (Formal languages and computational
                 linguistics); C4240P (Parallel programming and
                 algorithm theory); C6130 (Data handling techniques);
                 C6150N (Distributed systems software); C7310
                 (Mathematics computing)",
  keywords =     "algorithms; Fine grained parallel completion
                 procedure; Fine-grained parallel completion procedure;
                 Multithreaded inner loop; Parallel Knuth--Bendix
                 completion algorithm; Parallel speedups; PARSAC-2
                 parallel symbolic computation system; Standard
                 multiprocessor workstation",
  subject =      "{\bf I.1.2} Computing Methodologies, SYMBOLIC AND
                 ALGEBRAIC MANIPULATION, Algorithms, Algebraic
                 algorithms. {\bf I.1.0} Computing Methodologies,
                 SYMBOLIC AND ALGEBRAIC MANIPULATION, General. {\bf
                 I.1.3} Computing Methodologies, SYMBOLIC AND ALGEBRAIC
                 MANIPULATION, Languages and Systems. {\bf F.4.2} Theory
                 of Computation, MATHEMATICAL LOGIC AND FORMAL
                 LANGUAGES, Grammars and Other Rewriting Systems,
                 Parallel rewriting systems. {\bf F.1.2} Theory of
                 Computation, COMPUTATION BY ABSTRACT DEVICES, Modes of
                 Computation, Parallelism and concurrency.",
  thesaurus =    "Parallel algorithms; Parallel machines; Rewriting
                 systems; Symbol manipulation",
}

%%% ASPLOS-VI paper on guarded pointers: tagged 64-bit capabilities for a
%%% shared 54-bit virtual address space.  Fixed INSPEC keyword typo
%%% ``54- bit'' -> ``54-bit'' (matches the abstract).
%%% NOTE(review): keyword ``memory bit virtual address space'' looks like an
%%% indexing garble (possibly ``64-bit virtual address space'') --- left as
%%% captured; verify against the INSPEC record.
@Article{Carter:1994:HSF,
  author =       "Nicholas P. Carter and Stephen W. Keckler and William
                 J. Dally",
  title =        "Hardware support for fast capability-based
                 addressing",
  journal =      j-SIGPLAN,
  volume =       "29",
  number =       "11",
  pages =        "319--327",
  month =        nov,
  year =         "1994",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:16:57 MST 2003",
  bibsource =    "http://portal.acm.org/; http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/asplos/195473/p319-carter/",
  abstract =     "Traditional methods of providing protection in memory
                 systems do so at the cost of increased context switch
                 time and/or increased storage to record access
                 permissions for processes. With the advent of computers
                 that supported cycle-by-cycle multithreading,
                 protection schemes that increase the time to perform a
                 context switch are unacceptable, but protecting
                 unrelated processes from each other is still necessary
                 if such machines are to be used in non-trusting
                 environments. This paper examines {\em guarded
                 pointers\/}, a hardware technique which uses tagged
                 64-bit pointer objects to implement capability-based
                 addressing. Guarded pointers encode a segment
                 descriptor into the upper bits of every pointer,
                 eliminating the indirection and related performance
                 penalties associated with traditional implementations
                 of capabilities. All processes share a single 54-bit
                 virtual address space, and access is limited to the
                 data that can be referenced through the pointers that a
                 process has been issued. Only one level of address
                 translation is required to perform a memory reference.
                 Sharing data between processes is efficient, and
                 protection states are defined to allow fast protected
                 subsystem calls and create unforgeable data keys.",
  acknowledgement = ack-nhfb,
  classification = "C5310 (Storage system design); C6120 (File
                 organisation); C6150N (Distributed systems software)",
  conflocation = "San Jose, CA, USA; 4-7 Oct. 1994",
  conftitle =    "Sixth International Conference on Architectural
                 Support for Programming Languages and Operating Systems
                 (ASPLOS-VI)",
  corpsource =   "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "54-bit virtual address space; address translation;
                 capability based addressing; cycle-by-cycle
                 multithreading; design; fast capability-based
                 addressing; fast protected subsystem calls; guarded
                 pointers; hardware support; hardware technique; memory
                 architecture; memory bit virtual address space; memory
                 reference; memory systems; multiprocessing programs;
                 performance; protection schemes; protection states;
                 segment descriptor; storage allocation; tagged 64-bit
                 pointer objects; theory; unforgeable data keys; virtual
                 storage",
  sponsororg =   "ACM; IEEE Comput. Soc",
  subject =      "{\bf C.0} Computer Systems Organization, GENERAL,
                 Instruction set design. {\bf C.4} Computer Systems
                 Organization, PERFORMANCE OF SYSTEMS.",
  treatment =    "P Practical",
}

%%% Book (1994, xxxii + 493 pp.): technical survey of SPARC-based
%%% multiprocessor/multithreaded systems, multilevel buses, and Solaris.
@Book{Catanzaro:1994:MSA,
  author =       "Ben J. Catanzaro",
  title =        "Multiprocessor system architectures: a technical
                 survey of multiprocessor\slash multithreaded systems
                 using {SPARC}, multilevel bus architectures and
                 {Solaris} {(SunOS)}",
  publisher =    pub-PHPTR,
  address =      pub-PHPTR:adr,
  pages =        "xxxii + 493",
  year =         "1994",
  ISBN =         "0-13-089137-1",
  ISBN-13 =      "978-0-13-089137-2",
  LCCN =         "QA76.5.C3864 1994",
  bibdate =      "Fri Aug 7 08:29:38 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "computer architecture; multiprocessors; sun
                 computers",
}

%%% ACM TOCS 12(4) article on Opal, a single-address-space operating
%%% system for 64-bit architectures, prototyped on the Mach 3.0 microkernel.
@Article{Chase:1994:SPS,
  author =       "Jeffrey S. Chase and Henry M. Levy and Michael J.
                 Feeley and Edward D. Lazowska",
  title =        "Sharing and Protection in a Single-Address-Space
                 Operating System",
  journal =      j-TOCS,
  volume =       "12",
  number =       "4",
  pages =        "271--307",
  month =        nov,
  year =         "1994",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1994-12-4/p271-chase/",
  abstract =     "This article explores memory sharing and protection
                 support in Opal, a single-address-space operating
                 system designed for wide-address (64-bit)
                 architectures. Opal threads execute within protection
                 domains in a single shared virtual address space.
                 Sharing is simplified, because addresses are context
                 independent. There is no loss of protection, because
                 addressability and access are independent; the right to
                 access a segment is determined by the protection domain
                 in which a thread executes. This model enables
                 beneficial code-and data-sharing patterns that are
                 currently prohibitive, due in part to the inherent
                 restrictions of multiple address spaces, and in part to
                 Unix programming style. We have designed and
                 implemented an Opal prototype using the Mach 3.0
                 microkernel as a base. Our implementation demonstrates
                 how a single-address-space structure can be supported
                 alongside of other environments on a modern microkernel
                 operating system, using modern wide-address
                 architectures. This article justifies the Opal model
                 and its goals for sharing and protection, presents the
                 system and its abstractions, describes the prototype
                 implementation, and reports experience with integrated
                 applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; measurement; performance",
  subject =      "{\bf D.4.2} Software, OPERATING SYSTEMS, Storage
                 Management. {\bf C.1.3} Computer Systems Organization,
                 PROCESSOR ARCHITECTURES, Other Architecture Styles,
                 Capability architectures**. {\bf D.3.3} Software,
                 PROGRAMMING LANGUAGES, Language Constructs and
                 Features, Modules, packages. {\bf D.4.4} Software,
                 OPERATING SYSTEMS, Communications Management. {\bf
                 D.4.6} Software, OPERATING SYSTEMS, Security and
                 Protection, Access controls. {\bf D.4.6} Software,
                 OPERATING SYSTEMS, Security and Protection, Information
                 flow controls. {\bf D.4.7} Software, OPERATING SYSTEMS,
                 Organization and Design. {\bf D.4.8} Software,
                 OPERATING SYSTEMS, Performance, Measurements. {\bf E.1}
                 Data, DATA STRUCTURES. {\bf E.2} Data, DATA STORAGE
                 REPRESENTATIONS.",
}

%%% ACM SIGARCH Computer Architecture News 22(4) article; no abstract or
%%% keywords were captured for this entry.
@Article{Chaudhry:1994:CMP,
  author =       "Ghulam Chaudhry and Xuechang Li",
  title =        "A case for the multithreaded processor architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "4",
  pages =        "55--59",
  month =        sep,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:12 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

%%% International Journal of Parallel Programming 22(1) survey of parallel
%%% machines and computation models (dataflow, multithreading, shared memory).
@Article{Dennis:1994:MMP,
  author =       "Jack B. Dennis",
  title =        "Machines and Models for Parallel Computing",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "22",
  number =       "1",
  pages =        "47--77",
  month =        feb,
  year =         "1994",
  CODEN =        "IJPPE5",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Sat Apr 26 11:04:14 MDT 1997",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=22&issue=1;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessor systems and techniques); C6110 (Systems
                 analysis and programming); C6150N (Distributed
                 systems)",
  corpsource =   "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  keywords =     "concurrency control; dataflow principles; functional
                 programming; general semantic model; memory latency;
                 microprocessors; modular software construction;
                 multithreading; parallel computation; parallel
                 computing models; parallel machines; parallel
                 programming; processor architecture; processor design;
                 RISC; shared memory systems; shared-memory model;
                 superpipelined; superscalar; synchronization",
  treatment =    "P Practical",
}

%%% Book on OS/2 multithreading (McGraw-Hill, 1994); ships with a companion
%%% disk --- see the annote field for system requirements.
@Book{Dorfman:1994:EMO,
  author =       "Len Dorfman and Marc J. Neuberger",
  title =        "Effective multithreading in {OS/2}",
  publisher =    pub-MCGRAW-HILL,
  address =      pub-MCGRAW-HILL:adr,
  pages =        "xii + 288",
  year =         "1994",
  ISBN =         "0-07-017841-0 (paperback)",
  ISBN-13 =      "978-0-07-017841-0 (paperback)",
  LCCN =         "QA76.76.O63D6694 1994",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  price =        "US\$34.95",
  acknowledgement = ack-nhfb,
  annote =       "System requirements for computer disk: IBM-compatible
                 PC; 4MB RAM (8MB recommended); OS/2; C compiler such as
                 IBM CSet++ or Borland C++ for OS/2; high-density floppy
                 disk drive; hard disk with 3.1MB free space.",
  keywords =     "Microcomputers -- Operating systems; Operating systems
                 (Computers); OS/2 (Computer file)",
}

%%% IBM T. J. Watson research report RC 19549: analytical (Markov) performance
%%% models for multithreaded pipelines, from cycle-by-cycle to
%%% switch-on-latency thread switching.
@TechReport{Dubey:1994:APM,
  author =       "Pradeep Dubey and Arvind Krishna and M. J. (Michael
                 J.) Flynn",
  title =        "Analytical performance modeling for a spectrum of
                 multithreaded machines",
  type =         "Research report",
  number =       "RC 19549 (85007)",
  institution =  "IBM T. J. Watson Research Center",
  address =      "Yorktown Heights, NY, USA",
  pages =        "27",
  day =          "3",
  month =        may,
  year =         "1994",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The throughput of pipelined processors suffers due to
                 delays associated with instruction dependencies and
                 memory latencies. Multithreaded architectures try to
                 tolerate such delays by sharing the pipeline with
                 independent instruction threads. This paper proposes a
                 comprehensive analytical framework to quantitate the
                 performance potential of a wide spectrum of
                 multithreaded machines ranging from those that are
                 capable of switching threads every cycle to those that
                 switch threads only on long inter-instruction
                 latencies. For machines in the former category, the
                 proposed analytic model provides an exact solution for
                 pipeline utilization which is significantly better than
                 lower and upper bounds obtainable from simple
                 approximation techniques. Unlike previously published
                 analytic models of such systems, the Markov model
                 developed here accepts a general distribution for the
                 interlock delays with multiple latencies. For machines
                 in the latter category, the paper provides an
                 approximate analytic model which is simpler than
                 previously published analytic models. The models have
                 been verified using previously published analytical and
                 simulation-based results. As compared to the simulation
                 alternative, the models provide a much quicker estimate
                 of pipeline utilization as a function of a number of
                 threads.",
  acknowledgement = ack-nhfb,
  keywords =     "Computer architecture",
}

%%% M.S. thesis (UT Austin, 1994) on performance limitations of the MTS
%%% multithreaded architecture.
@MastersThesis{Gallagher:1994:PLM,
  author =       "William Lynn Gallagher",
  title =        "Performance limitations of the {MTS} multithreaded
                 architecture",
  type =         "Thesis ({M.S. in Engineering})",
  school =       "University of Texas at Austin",
  address =      "Austin, TX, USA",
  pages =        "xiv + 101",
  year =         "1994",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

%%% LNCS 779 (EDBT) paper on a multi-threaded prefetching architecture for
%%% object bases.
@Article{Gerlhof:1994:MTA,
  author =       "C. A. Gerlhof and A. Kemper",
  title =        "A Multi-Threaded Architecture for Prefetching in
                 Object Bases",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "779",
  pages =        "351--364",
  year =         "1994",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Wed Sep 15 18:44:20 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/lncs1994.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
  keywords =     "database technology; EDBT; extending database
                 technology",
}

%%% Dr. Dobb's Journal 19(5) article: a C++ multitasking class library for
%%% MS-DOS.  Fixed abstract transcription error ``currently execute'' ->
%%% ``concurrently execute'' (the article is about concurrent threads, per
%%% the abstract's own closing sentence and the keywords).
@Article{Gibson:1994:CMC,
  author =       "Ken Gibson",
  title =        "A {C++} Multitasking Class Library",
  journal =      j-DDJ,
  volume =       "19",
  number =       "5",
  pages =        "28, 30, 32, 34, 96--98",
  month =        may,
  year =         "1994",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Sep 03 09:15:49 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "Multithreaded applications that concurrently execute more
                 than one section of code aren't directly supported by
                 languages such as C++. Ken presents a C++ multitasking
                 class library for MS-DOS that lets you implement a
                 program as a set of concurrent threads.",
  acknowledgement = ack-nhfb,
  classification = "C6110J (Object-oriented programming); C6110P
                 (Parallel programming)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "C++ multitasking class library; Concurrent execution;
                 DOS; Embedded processors; Interthread communications;
                 Locator program; Microsoft C++ 7.0; Multithreaded
                 applications; Portability; Processor initialization;
                 Queue class; Real-time device control; Real-time
                 executive; ROMable image; Scheduler object; Semaphore
                 class; Simulation; Thread class; Thread
                 synchronization",
  thesaurus =    "C listings; Multiprogramming; Object-oriented
                 programming; Public domain software; Scheduling;
                 Subroutines",
}

%%% Parallel Computing 20(10--11) survey of parallel supercomputer
%%% architectures and their programming models.
@Article{Giloi:1994:PSA,
  author =       "Wolfgang K. Giloi",
  title =        "Parallel supercomputer architectures and their
                 programming models",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "20",
  number =       "10--11",
  pages =        "1443--1470",
  day =          "3",
  month =        nov,
  year =         "1994",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Fri Aug 6 10:13:51 MDT 1999",
  bibsource =    "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1994&volume=20&issue=10-11;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1994&volume=20&issue=10-11&aid=907",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessing systems); C5470 (Performance
                 evaluation and testing)",
  corpsource =   "FIRST, GMD Res. Inst. for Comput. Arch. and Software
                 Eng., Berlin, Germany",
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
  keywords =     "*T; abstract machine; architectures; DASH; distributed
                 memory; distributed memory systems; distributed shared;
                 hardware architecture; latency hiding; latency
                 minimization; MANNA; memory architectures; message
                 passing; message passing architectures; multi-threaded
                 architectures; parallel; parallel supercomputer
                 architectures; performance; performance evaluation;
                 physically shared memory systems; programming models;
                 scalability; shared memory architectures; shared memory
                 systems; systems; taxonomy; virtual",
  treatment =    "P Practical",
}

%%% NASA contractor report 194903 (ICASE) on the Chant talking-threads
%%% package.  Title corrected: removed spurious ``of'' in ``a talking
%%% threads of package'' and braced the system name {Chant}, following this
%%% file's convention for proper nouns in titles.
@Manual{Haines:1994:DCT,
  author =       "Matthew Haines and David Cronk and Piyush Mehrotra",
  title =        "On the design of {Chant}: a talking threads package:
                 final report",
  number =       "194903",
  publisher =    pub-NTIS,
  address =      pub-NTIS:adr,
  pages =        "??",
  year =         "1994",
  LCCN =         "NAS 1.26:194903 Govt Pubs",
  bibdate =      "Fri May 10 12:18:17 MDT 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Shipping list number 94-0861-M.",
  series =       "NASA contractor report",
  acknowledgement = ack-nhfb,
  keywords =     "message processing; messages",
}

%%% SIGARCH Computer Architecture News 22(1): panel session II on software
%%% issues (programming, compilation, resource management) for multithreading.
%%% NOTE(review): ``Burt Halstead'' is usually spelled ``Bert Halstead''
%%% (Robert H. Halstead, Jr.) --- verify against the published panel record.
@Article{Halstead:1994:PCR,
  author =       "Burt Halstead and David Callahan and Jack Dennis and
                 R. S. Nikhil and Vivek Sarkar",
  title =        "Programming, compilation, and resource management
                 issues for multithreading (panel session {II})",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "1",
  pages =        "19--33",
  month =        mar,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:34 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

%%% Conference paper, pp. 518--525; proceedings details are inherited via
%%% crossref from entry IEEE:1994:PSH (defined elsewhere in this file).
@InProceedings{Holm:1994:CSP,
  author =       "J. Holm and A. Lain and P. Banerjee",
  title =        "Compilation of Scientific Programs into Multithreaded
                 and Message Driven Computation",
  crossref =     "IEEE:1994:PSH",
  pages =        "518--525",
  year =         "1994",
  bibdate =      "Mon Aug 26 10:38:41 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

%%% SIGARCH Computer Architecture News 22(1): panel session I on
%%% architectural and implementation issues for multithreading; companion to
%%% the session II panel published in the same issue.
@Article{Iannucci:1994:AII,
  author =       "Robert Iannucci and Anant Agarwal and Bill Dally and
                 Anoop Gupta and Greg Papadopoulos and Burton Smith",
  title =        "Architectural and implementation issues for
                 multithreading (panel session {I})",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "1",
  pages =        "3--18",
  month =        mar,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:34 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

%%% Edited book (Kluwer SECS 0281, 1994) surveying the state of the art in
%%% multithreaded computer architecture.
@Book{Iannucci:1994:MCA,
  editor =       "Robert A. Iannucci and others",
  title =        "Multithreaded computer architecture: a summary of the
                 state of the art",
  volume =       "SECS 0281",
  publisher =    pub-KLUWER,
  address =      pub-KLUWER:adr,
  pages =        "xvi + 400",
  year =         "1994",
  ISBN =         "0-7923-9477-1",
  ISBN-13 =      "978-0-7923-9477-8",
  LCCN =         "QA76.9.A73 M85 1994",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "The Kluwer international series in engineering and
                 computer science",
  acknowledgement = ack-nhfb,
  keywords =     "computer architecture; Computer architecture;
                 Computers -- Design",
}

%%% Workshop paper on latency management in time-shared operating systems;
%%% proceedings details inherited via crossref from IEEE:1994:PIW.
@InProceedings{Jeffay:1994:LMT,
  author =       "K. Jeffay",
  title =        "On latency management in time-shared operating
                 systems",
  crossref =     "IEEE:1994:PIW",
  pages =        "86--90",
  year =         "1994",
  bibdate =      "Sat Sep 28 18:52:45 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/mach.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., North Carolina Univ., Chapel
                 Hill, NC, USA",
  classification = "C6150J (Operating systems); C6150N (Distributed
                 systems)",
  keywords =     "End-to-end latency; Inter-process communication
                 interconnections; Latency management; Multi-threaded
                 applications; Real-Time Mach kernel; Time-shared
                 operating systems; YARTOS kernel",
  thesaurus =    "Message passing; Operating systems [computers];
                 Real-time systems; Scheduling; Time-sharing programs",
}

%%% Dr. Dobb's Journal 19(1) article examining the OS/2 2.1 thread
%%% architecture and scheduling.
@Article{Kanalakis:1994:ET,
  author =       "John M. {Kanalakis, Jr.}",
  title =        "Examining {OS/2} 2.1 threads",
  journal =      j-DDJ,
  volume =       "19",
  number =       "1",
  pages =        "74, 76, 78--79, 96",
  month =        jan,
  year =         "1994",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Sep 10 08:52:50 MDT 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "The OS/2 2.1 multitasking model is based on the
                 execution of threads, making it possible for many
                 sections of a single process to execute simultaneously.
                 John examines OS/2's thread architecture, specifically,
                 the scheduling process.",
  acknowledgement = ack-nhfb,
  classification = "C6150J (Operating systems)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "Bias implementation; OS/2 2.1 multitasking model;
                 Round robin scheduling; Scheduling process; Thread
                 architecture; Threads",
  thesaurus =    "Multiprogramming; Operating systems [computers];
                 Scheduling",
}

%%% C/C++ Users Journal 12(8) article on multithreading with OS/2 and
%%% Borland C++.
%%% NOTE(review): entry Kelly:1994:MOB below is a field-for-field duplicate
%%% of this entry under a different key.
@Article{Kelly:1994:MBC,
  author =       "Michael Kelly",
  title =        "Multithreading with {OS/2} and {Borland C++}",
  journal =      j-CCCUJ,
  volume =       "12",
  number =       "8",
  pages =        "67--??",
  month =        aug,
  year =         "1994",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Fri Aug 30 16:52:23 MDT 1996",
  bibsource =    "http://www.cuj.com/cbklist.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

%%% NOTE(review): this entry is an exact field-for-field duplicate of
%%% Kelly:1994:MBC (same author, title, journal, volume 12(8), pages 67--??).
%%% Candidate for removal once no document cites this key; kept here because
%%% deleting a key could break existing \cite references.
@Article{Kelly:1994:MOB,
  author =       "Michael Kelly",
  title =        "Multithreading with {OS/2} and {Borland C++}",
  journal =      j-CCCUJ,
  volume =       "12",
  number =       "8",
  pages =        "67--??",
  month =        aug,
  year =         "1994",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Fri Aug 30 16:52:23 MDT 1996",
  bibsource =    "http://www.cuj.com/cbklist.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

%%% Ph.D. thesis (USC, 1994) on functional programming and fine-grain
%%% multithreading for high-performance parallel computing.
@PhdThesis{Kim:1994:FPF,
  author =       "Chinhyun Kim",
  title =        "Functional programming and fine-grain multithreading
                 for high-performance parallel computing",
  type =         "Thesis ({Ph.D.})",
  school =       "University of Southern California",
  address =      "Los Angeles, CA, USA",
  pages =        "xv + 150",
  year =         "1994",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

%%% LNCS 817 paper on hierarchical activation management for fine-grain
%%% multithreaded execution; end page not captured (577--??).
@Article{Kim:1994:HAM,
  author =       "C. Kim and J.-L. Gaudiot",
  title =        "A Hierarchical Activation Management Technique for
                 Fine-Grain Multithreaded Execution",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "817",
  pages =        "577--??",
  year =         "1994",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon May 13 11:52:14 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

%%% IEEE Computer 27(3) article on the Alloc Stream Facility, an
%%% application-level stream I/O library supporting multithreaded and
%%% parallel applications.
@Article{Krieger:1994:ASF,
  author =       "Orran Krieger and Michael Stumm and Ron Unrau",
  title =        "The {Alloc Stream Facility}: a Redesign of
                 Application-Level Stream {I/O}",
  journal =      j-COMPUTER,
  volume =       "27",
  number =       "3",
  pages =        "75--82",
  month =        mar,
  year =         "1994",
  CODEN =        "CPTRB4",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Mon Feb 3 07:28:57 MST 1997",
  bibsource =    "Compendex database;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Many stdio and even Unix I/O applications run faster
                 when linked to the ASF application-level library. Using
                 the Alloc Stream Interface improves performance even
                 more.",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Electr. and Comput. Eng., Toronto Univ.,
                 Ont., Canada",
  affiliationaddress = "Toronto, Can",
  classification = "723; C6110J (Object-oriented programming); C6110P
                 (Parallel programming); C6150J (Operating systems)",
  fjournal =     "Computer",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
  journalabr =   "Computer",
  keywords =     "Alloc Stream Facility; Alloc stream interface;
                 Application-level I/O facility; Application-level
                 library; Application-level stream I/O; ASF; C stdio
                 library; C++ stream I/O; Computer operating systems;
                 Concurrency; I/O-intensive applications; Input output
                 programs; Mapped files; Multithreaded applications;
                 Object-oriented structure; Parallel applications;
                 Parallel systems; Performance improvements; Popular I/O
                 interfaces; Sequential byte stream; Standard Unix
                 systems; Stdio; System behavior; UNIX",
  thesaurus =    "Input-output programs; Object-oriented methods;
                 Parallel programming; Unix",
}

%%% ASPLOS-VI paper (SIGPLAN Notices 29(11), co-published in Operating
%%% Systems Review 28(5)) on interleaving, a multiple-context multithreading
%%% technique targeting both multiprocessors and workstations.
@Article{Laudon:1994:IMT,
  author =       "James Laudon and Anoop Gupta and Mark Horowitz",
  title =        "Interleaving: a multithreading technique targeting
                 multiprocessors and workstations",
  journal =      j-SIGPLAN,
  volume =       "29",
  number =       "11",
  pages =        "308--318",
  month =        nov,
  year =         "1994",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:16:57 MST 2003",
  bibsource =    "http://portal.acm.org/; http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Co-published in {\em Operating Systems Review}, {\bf
                 28}(5).",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/asplos/195473/p308-laudon/",
  abstract =     "There is an increasing trend to use commodity
                 microprocessors as the compute engines in large-scale
                 multiprocessors. However, given that the majority of
                 the microprocessors are sold in the workstation market,
                 not in the multiprocessor market, it is only natural
                 that architectural features that benefit only
                 multiprocessors are less likely to be adopted in
                 commodity microprocessors. In this paper, we explore
                 multiple-context processors, an architectural technique
                 proposed to hide the large memory latency in
                 multiprocessors. We show that while current
                 multiple-context designs work reasonably well for
                 multiprocessors, they are ineffective in hiding the
                 much shorter uniprocessor latencies using the limited
                 parallelism found in workstation environments. We
                 propose an alternative design that combines the best
                 features of two existing approaches, and present
                 simulation results that show it yields better
                 performance for both multiprogrammed workloads on a
                 workstation and parallel applications on a
                 multiprocessor. By addressing the needs of the
                 workstation environment, our proposal makes multiple
                 contexts more attractive for commodity
                 microprocessors.",
  acknowledgement = ack-nhfb,
  classification = "C5430 (Microcomputers); C5440 (Multiprocessing
                 systems); C6120 (File organisation); C6150J (Operating
                 systems)",
  conflocation = "San Jose, CA, USA; 4-7 Oct. 1994",
  conftitle =    "Sixth International Conference on Architectural
                 Support for Programming Languages and Operating Systems
                 (ASPLOS-VI)",
  corpsource =   "Comput. Syst. Lab., Stanford Univ., CA, USA",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "architectural features; commodity microprocessors;
                 compute engines; design; interleaved storage;
                 interleaving; large memory latency; large-scale
                 multiprocessors; measurement; multiple-context designs;
                 multiple-context processors; multiprocessing systems;
                 multiprogrammed workloads; multiprogramming;
                 multithreading technique; parallel applications;
                 parallel uniprocessor latencies; performance; theory;
                 uniprocessor latencies; workstations",
  sponsororg =   "ACM; IEEE Comput. Soc",
  subject =      "{\bf C.5.3} Computer Systems Organization, COMPUTER
                 SYSTEM IMPLEMENTATION, Microcomputers. {\bf C.4}
                 Computer Systems Organization, PERFORMANCE OF
                 SYSTEMS.",
  treatment =    "P Practical",
}

@Article{Launchbury:1994:LFS,
  author =       "John Launchbury and Simon L. {Peyton Jones}",
  title =        "Lazy Functional State Threads",
  journal =      j-SIGPLAN,
  volume =       "29",
  number =       "6",
  pages =        "24--35",
  month =        jun,
  year =         "1994",
  CODEN =        "SINODQ",
  ISBN =         "0-89791-598-4",
  ISBN-13 =      "978-0-89791-598-4",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:16:51 MST 2003",
  bibsource =    "http://www.acm.org/pubs/contents/proceedings/pldi/178243/index.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/pldi/178243/p24-launchbury/",
  abstract =     "Some algorithms make critical internal use of
                 updatable state, even though their external
                 specification is purely functional. Based on earlier
                 work on monads, we present a way of securely
                 encapsulating stateful computations that manipulate
                 multiple, named, mutable objects, in the context of a
                 non-strict, purely-functional language. The security of
                 the encapsulation is assured by the type system, using
                 parametricity. Intriguingly, this parametricity
                 requires the provision of a (single) constant with a
                 rank-2 polymorphic type.",
  acknowledgement = ack-nhfb,
  annote =       "Published as part of the Proceedings of PLDI'94.",
  classification = "C4240 (Programming and algorithm theory); C6110
                 (Systems analysis and programming); C6140D (High level
                 languages)",
  conflocation = "Orlando, FL, USA; 20-24 June 1994",
  conftitle =    "ACM SIGPLAN '94 Conference on Programming Language
                 Design and Implementation (PLDI)",
  corpsource =   "Glasgow Univ., UK",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "algorithms; encapsulation; external specification;
                 functional language; functional programming; high level
                 languages; languages; lazy functional state threads;
                 monads; mutable objects; nonstrict purely-functional
                 language; parametricity; rank-2 polymorphic type;
                 security; specification; stateful computations; type
                 system; type theory; updatable state",
  sponsororg =   "ACM",
  subject =      "{\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language
                 Constructs and Features, Procedures, functions, and
                 subroutines. {\bf D.3.2} Software, PROGRAMMING
                 LANGUAGES, Language Classifications, Applicative
                 (functional) languages. {\bf F.3.3} Theory of
                 Computation, LOGICS AND MEANINGS OF PROGRAMS, Studies
                 of Program Constructs, Type structure. {\bf F.4.1}
                 Theory of Computation, MATHEMATICAL LOGIC AND FORMAL
                 LANGUAGES, Mathematical Logic, Lambda calculus and
                 related systems.",
  treatment =    "P Practical; T Theoretical or Mathematical",
}

@Article{Lee:1994:DAM,
  author =       "Ben Lee and A. R. Hurson",
  title =        "Dataflow Architectures and Multithreading",
  journal =      j-COMPUTER,
  volume =       "27",
  number =       "8",
  pages =        "27--39",
  month =        aug,
  year =         "1994",
  CODEN =        "CPTRB4",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Mon Feb 3 07:28:57 MST 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Contrary to initial expectations, implementing
                 dataflow computers has presented a monumental
                 challenge. Now, however, multithreading offers a
                 viable alternative for building hybrid architectures
                 that exploit parallelism.",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Electr. and Comput. Eng., Oregon State Univ.,
                 Corvallis, OR, USA",
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessing systems)",
  fjournal =     "Computer",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
  keywords =     "Compilers; Concurrency; Data dependencies; Dataflow
                 architectures; Dataflow machines; Functional semantics;
                 Hybrid architectures; Id; Imperative languages;
                 Multithreading; Parallel functional languages; Parallel
                 machines; Parallelism; Programmability; Semantics; Side
                 effects; SISAL; Source code; Streams and Iterations in
                 a Single Assignment Language; Syntax; Threaded Abstract
                 Machine",
  thesaurus =    "Parallel architectures; Parallel processing",
}

@Article{Liedtke:1994:SNIb,
  author =       "Jochen Liedtke",
  title =        "A short note on implementing thread exclusiveness and
                 address space locking",
  journal =      j-OPER-SYS-REV,
  volume =       "28",
  number =       "3",
  pages =        "38--42",
  month =        jul,
  year =         "1994",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@MastersThesis{Lu:1994:MPM,
  author =       "David Ta-Chang Lu",
  title =        "A multithreaded processor for massively parallel
                 architectures",
  type =         "Thesis ({M.S.})",
  school =       "University of California, Riverside",
  address =      "Riverside, CA, USA",
  pages =        "vii + 42",
  year =         "1994",
  LCCN =         "QA76.58 .L88 1994",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "computer algorithms; Computer algorithms; computer
                 architecture; Computer architecture; dissertations;
                 dissertations, academic -- UCR -- computer science;
                 parallel computers; Parallel computers; Parallel
                 processing (Electronic computers); parallel processing
                 (electronic computers); Science -- Dissertations;
                 University of California, Riverside. -- Dept. of
                 Computer; University of California, Riverside. Dept. of
                 Computer Science",
}

@Article{Marinescu:1994:HLC,
  author =       "Dan C. Marinescu and John R. Rice",
  title =        "On High Level Characterization of Parallelism",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "20",
  number =       "1",
  pages =        "107--113",
  month =        jan,
  year =         "1994",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1994.1011",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:18:53 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1994.1011/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1994.1011/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C4240P (Parallel programming and algorithm theory);
                 C5220P (Parallel architecture); C5470 (Performance
                 evaluation and testing)",
  corpsource =   "Dept. of Comput. Sci., Purdue Univ., West Lafayette,
                 IN, USA",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "communication complexity; load balancing; massively
                 parallel; parallel architectures; parallel execution;
                 parallelism; performance analysis; performance
                 evaluation; speedup; systems; threads of control",
  treatment =    "T Theoretical or Mathematical",
}

@Book{MixSoftware:1994:UMC,
  author =       "{Mix Software, Inc}",
  title =        "Using {Multi-C}: a portable multithreaded {C}
                 programming library",
  publisher =    pub-PHPTR,
  address =      pub-PHPTR:adr,
  pages =        "vi + 257",
  year =         "1994",
  ISBN =         "0-13-606195-8",
  ISBN-13 =      "978-0-13-606195-3",
  LCCN =         "QA76.73.C15 U85 1994",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "System requirements for computer disk: IBM-compatible
                 PC; DOS; Mix, Borland, or Microsoft-compatible C/C++
                 compilers.",
  acknowledgement = ack-nhfb,
  annote =       "System requirements for computer disk: IBM-compatible
                 PC; DOS; Mix, Borland, or Microsoft-compatible C/C++
                 compilers.",
  keywords =     "C (computer program language); C (Computer program
                 language); Microcomputers -- Programming languages",
}

@Article{Mukherjee:1994:MII,
  author =       "Bodhisattwa Mukherjee and Greg Eisenhauer and Kaushik
                 Ghosh",
  title =        "A machine independent interface for lightweight
                 threads",
  journal =      j-OPER-SYS-REV,
  volume =       "28",
  number =       "1",
  pages =        "33--47",
  month =        jan,
  year =         "1994",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:36 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Nemawarkar:1994:PIN,
  author =       "S. S. Nemawarkar and R. Govindarajan and G. R. Gao and
                 V. K. Agarwal",
  title =        "Performance of Interconnection Network in
                 Multithreaded Architectures",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "817",
  pages =        "823--??",
  year =         "1994",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon May 13 11:52:14 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Nikhil:1994:MII,
  author =       "Rishiyur S. Nikhil",
  title =        "A Multithreaded Implementation of {Id} using {P-RISC}
                 Graphs",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "768",
  pages =        "390--??",
  year =         "1994",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon May 13 11:52:14 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Norwood:1994:SMP,
  author =       "John Norwood and Shankar Vaidyanathan",
  title =        "Symmetric Multiprocessing for {PCs}",
  journal =      j-DDJ,
  volume =       "19",
  number =       "1",
  pages =        "80, 82--85, 98--99",
  month =        jan,
  year =         "1994",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Sep 03 09:15:46 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "Our authors focus on multithreaded application
                 development for single-processor and
                 symmetric-multiprocessor machines under Windows NT. In
                 doing so, they present Fortran interface statements for
                 the Win32 console API and a black-box solution for
                 calling 32-bit DLLs from 16-bit applications under
                 NT.",
  acknowledgement = ack-nhfb,
  classification = "C6150J (Operating systems); C6150N (Distributed
                 systems)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "16-Bit applications; 32-Bit DLLs; Black-box solution;
                 Fortran interface statements; Multithreaded
                 application; Single processor machines;
                 Symmetric-multiprocessor machines; Win32 console API;
                 Windows NT",
  thesaurus =    "C listings; Multiprocessing programs;
                 Multiprogramming",
}

@InProceedings{Ramsey:1994:CTB,
  author =       "Norman Ramsey",
  title =        "Correctness of trap-based breakpoint implementations",
  crossref =     "ACM:1994:CRP",
  pages =        "15--24",
  year =         "1994",
  bibdate =      "Mon May 3 12:50:22 MDT 1999",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/plan/174675/p15-ramsey/",
  abstract =     "It is common for debuggers to implement breakpoints by
                 a combination of planting traps and single stepping.
                 When the target program contains multiple threads of
                 execution, a debugger that is not carefully implemented
                 may miss breakpoints. This paper gives a formal model
                 of a breakpoint in a two-threaded program. The model
                 describes correct and incorrect breakpoint
                 implementations. Automatic search of the model's state
                 space shows that the correct implementation does miss a
                 breakpoint. The results apply even to debuggers like
                 dbx and gdb, which are apparently for single-threaded
                 programs; when the user evaluates an expression
                 containing function calls, the debugger executes the
                 call in the target address space, in effect creating a
                 new thread.",
  acknowledgement = ack-nhfb,
  keywords =     "languages; measurement; theory",
  subject =      "{\bf D.2.5} Software, SOFTWARE ENGINEERING, Testing
                 and Debugging. {\bf F.3.1} Theory of Computation,
                 LOGICS AND MEANINGS OF PROGRAMS, Specifying and
                 Verifying and Reasoning about Programs.",
}

@Article{Rodley:1994:UIC,
  author =       "John Rodley",
  title =        "{OS/2} and {UnixWare} Interprocess Communication",
  journal =      j-DDJ,
  volume =       "19",
  number =       "5",
  pages =        "78--82, 84, 107--109",
  month =        may,
  year =         "1994",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Sep 03 09:15:49 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "Interprocess communication isn't portable between
                 IBM's OS/2 2.1 and Novell's UnixWare 1.1. But even
                 though the implementation details differ greatly, the
                 two systems do share ways of thinking about IPC. John
                 looks at IPC under OS/2 and UnixWare to see what common
                 ground exists.",
  acknowledgement = ack-nhfb,
  classification = "C6150J (Operating systems); C6150N (Distributed
                 systems)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "APIs; Applications programming; Functionality; IBM
                 OS/2 2.1; Implementation details; Independent
                 processes; Interprocess communication; IPC models;
                 Multitasking operating systems; Novell UnixWare 1.1;
                 Threads",
  thesaurus =    "C listings; Multiprocessing systems; Operating systems
                 [computers]; Unix",
}

@InProceedings{Shee:1994:DMA,
  author =       "Jang Chung Shee and Chao Chin Wu and Lin Wen You and
                 Cheng Chen",
  title =        "Design of a multithread architecture and its parallel
                 simulation and evaluation environment",
  crossref =     "Anonymous:1994:ICS",
  pages =        "69--76 (vol. 1)",
  year =         "1994",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Inst. of Comput. Sci. and Inf. Eng., Nat. Chiao Tung
                 Univ., Hsinchu, Taiwan",
  classification = "C5220P (Parallel architecture); C6115 (Programming
                 support); C6185 (Simulation techniques)",
  keywords =     "Context switch; Integrated multiprocessing simulation
                 environment; Multithread architecture; Parallel
                 simulation; Parallel simulation and evaluation
                 environment; Parallel Virtual Machine; SUN SPARC
                 workstations; Thread-related instructions",
  thesaurus =    "Digital simulation; Parallel architectures;
                 Programming environments",
}

@InProceedings{Spero:1994:MMD,
  author =       "Simon E. Spero",
  title =        "{MDMA} --- Multithreaded Daemon for Multimedia
                 Access",
  crossref =     "Anonymous:1994:PIW",
  pages =        "??--??",
  year =         "1994",
  bibdate =      "Mon Oct 23 09:15:37 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@TechReport{Squillante:1994:AMP,
  author =       "Mark S. Squillante",
  title =        "Analytic modeling of processor utilization in
                 multithreaded processor architectures",
  type =         "Research report",
  number =       "RC 19543 (84999)",
  institution =  "IBM T. J. Watson Research Center",
  address =      "Yorktown Heights, NY, USA",
  pages =        "9",
  month =        apr,
  year =         "1994",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In this paper, we develop an analytic model of
                 processor utilization in multithreaded processor
                 architectures that supports both serial and parallel
                 processing of memory requests. The system is modeled as
                 a finite, continuous-time Markov chain whose solution
                 can be obtained efficiently. Although it applies more
                 generally, our modeling approach supports an important
                 class of probability distributions that can be used to
                 approximate the distributions of interest with
                 sufficient accuracy in most practical cases. This
                 results in an efficient and accurate model across a
                 wide variety of system environments.",
  acknowledgement = ack-nhfb,
  keywords =     "Multiprocessors",
}

@Article{Tetewsky:1994:GDR,
  author =       "Avram K. Tetewsky",
  title =        "{GUI} Development for Real-Time Applications",
  journal =      j-DDJ,
  volume =       "19",
  number =       "6",
  pages =        "28, 30, 32, 36, 38, 40--41",
  month =        jun,
  year =         "1994",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Sep 03 09:15:49 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "Although they take radically different approaches,
                 both ControlCalc and LabView are designed for building
                 GUI-based, real-time control applications.",
  acknowledgement = ack-nhfb,
  affiliation =  "Draper (C.S.) Lab., Cambridge, MA, USA",
  classification = "C6115 (Programming support); C6130B (Graphics
                 techniques); C6180G (Graphical user interfaces); C7420
                 (Control engineering)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "386/OS-9000; 680X0/OS9; ControlCalc Version 1.78;
                 G-Windows 2.3 windowing package; GUI development;
                 LabView 3.0; Multipage-spreadsheet paradigm;
                 Multithreaded program; National Instruments; OS-9000
                 1.3; PC-based tools; Rapid prototyping; Real-time
                 control application; RTWare; Windows data-flow driven
                 software",
  thesaurus =    "Computerised control; Graphical user interfaces;
                 Real-time systems; Software tools",
}

@Article{Thekkath:1994:EMH,
  author =       "Radhika Thekkath and Susan J. Eggers",
  title =        "The effectiveness of multiple hardware contexts",
  journal =      j-SIGPLAN,
  volume =       "29",
  number =       "11",
  pages =        "328--337",
  month =        nov,
  year =         "1994",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:16:57 MST 2003",
  bibsource =    "http://portal.acm.org/; http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/asplos/195473/p328-thekkath/",
  abstract =     "Multithreaded processors are used to tolerate long
                 memory latencies. By executing threads loaded in
                 multiple hardware contexts, an otherwise idle processor
                 can keep busy, thus increasing its utilization.
                 However, the larger size of a multi-thread working set
                 can have a negative effect on cache conflict misses. In
                 this paper we evaluate the two phenomena together,
                 examining their combined effect on execution time. The
                 usefulness of multiple hardware contexts depends on:
                 program data locality, cache organization and degree of
                 multiprocessing. Multiple hardware contexts are most
                 effective on programs that have been optimized for data
                 locality. For these programs, execution time dropped
                 with increasing contexts, over widely varying
                 architectures. With unoptimized applications, multiple
                 contexts had limited value. The best performance was
                 seen with only two contexts, and only on uniprocessors
                 and small multiprocessors. The behavior of the
                 unoptimized applications changed more noticeably with
                 variations in cache associativity and cache hierarchy,
                 unlike the optimized programs. As a mechanism for
                 exploiting program parallelism, an additional processor
                 is clearly better than another context. However, there
                 were many configurations for which the addition of a
                 few hardware contexts brought as much or greater
                 performance than a larger multiprocessor with fewer
                 than the optimal number of contexts.",
  acknowledgement = ack-nhfb,
  classification = "C5320G (Semiconductor storage); C5440
                 (Multiprocessing systems); C6110P (Parallel
                 programming); C6120 (File organisation); C6150N
                 (Distributed systems software)",
  conflocation = "San Jose, CA, USA; 4-7 Oct. 1994",
  conftitle =    "Sixth International Conference on Architectural
                 Support for Programming Languages and Operating Systems
                 (ASPLOS-VI)",
  corpsource =   "Dept. of Comput. Sci. and Eng., Washington Univ.,
                 Seattle, WA, USA",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "cache associativity; cache conflict misses; cache
                 hierarchy; cache organization; cache storage; data
                 locality; design; long; long memory latencies;
                 measurement; multi-thread working set; multiple
                 hardware contexts; multiprocessing; multiprocessing
                 systems; multithreaded processors; parallel
                 programming; performance; program data locality;
                 program parallelism; storage management; theory;
                 unoptimized applications",
  sponsororg =   "ACM; IEEE Comput. Soc",
  subject =      "{\bf C.5.3} Computer Systems Organization, COMPUTER
                 SYSTEM IMPLEMENTATION, Microcomputers. {\bf C.4}
                 Computer Systems Organization, PERFORMANCE OF
                 SYSTEMS.",
  treatment =    "P Practical",
}

@Article{Thekkath:1994:ISB,
  author =       "R. Thekkath and S. J. Eggers",
  title =        "Impact of sharing-based thread placement on
                 multithreaded architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "176--186",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@MastersThesis{Wang:1994:MAD,
  author =       "Xiaobao Wang",
  title =        "Multithreaded architecture: design and performance
                 analysis",
  volume =       "3016",
  type =         "Thesis ({M. S.})",
  school =       "Department of Electrical Engineering, University of
                 Hawaii at Manoa",
  address =      "Manoa, HI, USA",
  pages =        "59",
  year =         "1994",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "Theses for the degree of Master of Science (University
                 of Hawaii at Manoa)",
  acknowledgement = ack-nhfb,
  keywords =     "Computer architecture; Multiprocessors",
}

@Article{Williams:1994:NST,
  author =       "Al Williams",
  title =        "{NT-Style} Threads For {MS-DOS}",
  journal =      j-DDJ,
  volume =       "19",
  number =       "2",
  pages =        "74, 76--77",
  month =        feb,
  year =         "1994",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Sep 03 09:15:47 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "Al uses Phar Lap's TNT 386/DOS-Extender to implement
                 NT-style threads in a DOS program that removes a
                 directory tree. Instead of recursing down the tree, the
                 program (which works with NT and TNT) processes
                 directories in parallel.",
  acknowledgement = ack-nhfb,
  classification = "C6110 (Systems analysis and programming); C6150C
                 (Compilers, interpreters and other processors); C6150J
                 (Operating systems)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "BIOS interrupts; C library functions; Compiling; DOS;
                 Memory allocation; MS-DOS; Multiple threads;
                 Multithreading; Phar Lap; Specification; TNT
                 386/DOS-Extender; Win32 programming API; Win32-base
                 API; Windows; Windows NT",
  thesaurus =    "Interrupts; Multiprogramming; Operating systems
                 [computers]; Program compilers",
}

@Article{Williams:1994:NTM,
  author =       "Al Williams",
  title =        "{NT-Style} Threads For {MS-DOS}",
  journal =      j-DDJ,
  volume =       "19",
  number =       "2",
  pages =        "74, 76--77",
  month =        feb,
  year =         "1994",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Sep 03 09:15:47 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "Al uses Phar Lap's TNT 386/DOS-Extender to implement
                 NT-style threads in a DOS program that removes a
                 directory tree. Instead of recursing down the tree, the
                 program (which works with NT and TNT) processes
                 directories in parallel.",
  acknowledgement = ack-nhfb,
  classification = "C6110 (Systems analysis and programming); C6150C
                 (Compilers, interpreters and other processors); C6150J
                 (Operating systems)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "BIOS interrupts; C library functions; Compiling; DOS;
                 Memory allocation; MS-DOS; Multiple threads;
                 Multithreading; Phar Lap; Specification; TNT
                 386/DOS-Extender; Win32 programming API; Win32-base
                 API; Windows; Windows NT",
  thesaurus =    "Interrupts; Multiprogramming; Operating systems
                 [computers]; Program compilers",
}

@Article{Wong:1994:SSI,
  author =       "W. F. Wong and E. Goto",
  title =        "A Simulation Study on the Interactions Between
                 Multithreaded Architectures and the Cache",
  journal =      j-INT-J-HIGH-SPEED-COMPUTING,
  volume =       "6",
  number =       "2",
  pages =        "343--??",
  year =         "1994",
  CODEN =        "IHSCEZ",
  ISSN =         "0129-0533",
  bibdate =      "Mon Feb 25 11:19:24 MST 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 OCLC Article1st database",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Speed Computing
                 (IJHSC)",
}

@Article{Anonymous:1995:HUW,
  author =       "Anonymous",
  title =        "{HP-UX 10.0 will be unveiled this week, with newly
                 tuned kernel and I\slash {O} paths, plus a
                 multithreaded NFS implementation}",
  journal =      j-OPEN-SYSTEMS-TODAY,
  volume =       "168",
  pages =        "34--??",
  month =        feb,
  year =         "1995",
  ISSN =         "1061-0839",
  bibdate =      "Fri Jan 26 17:24:01 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Open Systems Today",
}

@Article{Anonymous:1995:HWB,
  author =       "Anonymous",
  title =        "{HP-UX 10.0 will be unveiled this week, with newly
                 tuned kernel and I\slash {O} paths, plus a
                 multithreaded NFS implementation}",
  journal =      j-OPEN-SYSTEMS-TODAY,
  volume =       "168",
  pages =        "34--??",
  month =        feb,
  year =         "1995",
  ISSN =         "1061-0839",
  bibdate =      "Fri Jan 26 17:24:01 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Open Systems Today",
}

@Article{Baker:1995:GTP,
  author =       "Mary Baker",
  title =        "Going threadbare (panel session): sense or sedition? a
                 debate on the threads abstraction",
  journal =      j-OPER-SYS-REV,
  volume =       "29",
  number =       "5",
  pages =        "227--227",
  month =        dec,
  year =         "1995",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:55 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Baker:1995:UOV,
  author =       "Henry G. Baker",
  title =        "``Use-once'' variables and linear objects: storage
                 management, reflection and multi-threading",
  journal =      j-SIGPLAN,
  volume =       "30",
  number =       "1",
  pages =        "45--52",
  month =        jan,
  year =         "1995",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:16:59 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan1990.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Banerjee:1995:PCD,
  author =       "Prithviraj Banerjee and John A. Chandy and Manish
                 Gupta and Eugene W. {Hodges IV} and John G. Holm and
                 Antonio Lain and Daniel J. Palermo and Shankar
                 Ramaswamy and Ernesto Su",
  title =        "The {Paradigm} compiler for distributed-memory
                 multicomputers",
  journal =      j-COMPUTER,
  volume =       "28",
  number =       "10",
  pages =        "37--47",
  month =        oct,
  year =         "1995",
  CODEN =        "CPTRB4",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Mon Feb 3 07:21:26 MST 1997",
  bibsource =    "Compendex database;
                 https://www.math.utah.edu/pub/tex/bib/computer1990.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Illinois Univ., Urbana, IL, USA",
  affiliationaddress = "Urbana-Champaign, IL, USA",
  classification = "722.3; 722.4; 723.1; 723.2; C6110P (Parallel
                 programming); C6150C (Compilers, interpreters and other
                 processors); C6150N (Distributed systems software)",
  fjournal =     "Computer",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
  journalabr =   "Computer",
  keywords =     "Address space; Automatic parallelization; Codes
                 (symbols); Computational methods; Computer hardware;
                 Computer programming; Data communication systems; Data
                 parallelism; Data partitioning; Data processing;
                 Distributed memory multicomputer; Distributed-memory
                 multicomputers; Efficient software; Explicitly managed
                 communication; Functional parallelism; Irregular
                 computations; Manually distribution; Massively parallel
                 computers; Multithreading; Paradigm compiler; Parallel
                 algorithms; Parallel processing systems; Parallel
                 programming; Program compilers; Regular computations;
                 Sequential programs; Supercomputers",
  thesaurus =    "Distributed memory systems; Parallel machines;
                 Parallel programming; Parallelising compilers; Program
                 compilers",
}

@Book{Bic:1995:ATD,
  author =       "Lubomir Bic and Guang R. Gao and Jean-Luc Gaudiot",
  title =        "Advanced topics in dataflow computing and
                 multithreading",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "x + 450",
  year =         "1995",
  ISBN =         "0-8186-6541-6, 0-8186-6540-8 (paperback)",
  ISBN-13 =      "978-0-8186-6541-7, 978-0-8186-6540-0 (paperback)",
  LCCN =         "QA76.9.A73A356 1994",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Computer architecture; Data structures (Computer
                 science); Parallel processing (Electronic computers)",
}

@Article{Blumofe:1995:CEM,
  author =       "Robert D. Blumofe and Christopher F. Joerg and Bradley
                 C. Kuszmaul and Charles E. Leiserson and Keith H.
                 Randall and Yuli Zhou",
  title =        "{Cilk}: an efficient multithreaded runtime system",
  journal =      j-SIGPLAN,
  volume =       "30",
  number =       "8",
  pages =        "207--216",
  month =        aug,
  year =         "1995",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:17:08 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Cilk (pronounced `silk') is a C-based runtime system
                 for multithreaded parallel programming. In this paper,
                 we document the efficiency of the Cilk work-stealing
                 scheduler, both empirically and analytically. We show
                 that on real and synthetic applications, the `work' and
                 `critical path' of a Cilk computation can be used to
                 accurately model performance. Consequently, a Cilk
                 programmer can focus on reducing the work and critical
                 path of his computation, insulated from load balancing
                 and other runtime scheduling issues. We also prove that
                 for the class of `fully strict' (well-structured)
                 programs, the Cilk scheduler achieves space, time, and
                 communication bounds all within a constant factor of
                 optimal. The Cilk runtime system currently runs on the
                 Connection Machine CM5 massively parallel processor
                 (MPP), the Intel Paragon MPP, the Silicon Graphics
                 Power Challenge symmetric multiprocessor (SMP), and the
                 MIT Phish network of workstations. Applications written
                 in Cilk include protein folding, graphic rendering,
                 backtrack searching, and the *Socrates chess program,
                 which won third prize in the 1994 ACM International
                 Computer Chess Championship.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
  classification = "C6110P (Parallel programming); C6150C (Compilers,
                 interpreters and other processors); C6150N (Distributed
                 systems software)",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "*Socrates chess program; Accurate performance
                 modelling; Backtrack searching; C-based multithreaded
                 runtime system; Cilk; Communication bounds; Connection
                 Machine CM5; Critical path; Efficiency; Fully strict
                 programs; Graphic rendering; Intel Paragon; Load
                 balancing; MIT Phish workstation network; Parallel
                 programming; Protein folding; Runtime scheduling
                 issues; Silicon Graphics Power Challenge; Space bounds;
                 Time bounds; Well-structured programs; Work-stealing
                 scheduler",
  thesaurus =    "Backtracking; Biology computing; Molecular
                 configurations; Parallel programming; Processor
                 scheduling; Program interpreters; Proteins; Rendering
                 [computer graphics]",
}

@PhdThesis{Blumofe:1995:EMP,
  author =       "Robert D. (Robert David) Blumofe",
  title =        "Executing multithreaded programs efficiently",
  type =         "Thesis ({Ph.D.})",
  school =       "Massachusetts Institute of Technology, Department of
                 Electrical Engineering and Computer Science",
  address =      "Cambridge, MA, USA",
  pages =        "145",
  year =         "1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Bubeck:1995:DSC,
  author =       "T. Bubeck and M. Hiller and W. K{\"u}chlin and W.
                 Rosenstiel",
  title =        "Distributed symbolic computation with {DTS}",
  crossref =     "Ferreira:1995:PAI",
  pages =        "231--248",
  year =         "1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography1990.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Wilhelm-Schickard-Inst. fur Inf., Tubingen Univ.,
                 Germany",
  classification = "C4130 (Interpolation and function approximation);
                 C4240P (Parallel programming and algorithm theory);
                 C6110P (Parallel programming); C6115 (Programming
                 support); C6130S (Data security); C6150N (Distributed
                 systems software)",
  keywords =     "Anonymous compute servers; Asynchronous RPC
                 abstraction; C threads interface; Cryptosystem;
                 Distributed symbolic computation; Distributed threads
                 system; DTS; Fork/join parallel programming; Highly
                 data-dependent algorithm parallelisation; Irregular
                 algorithm parallelisation; Multiprocessor workstation;
                 Multithreading; Parallel long integer multiplication;
                 Parallel multi-variate polynomial resultant
                 computation; Performance results; Programming
                 environment; PVM; Shared memory threads",
  thesaurus =    "Arithmetic; Cryptography; Distributed memory systems;
                 Multiprocessing programs; Multiprocessing systems;
                 Parallel algorithms; Parallel programming; Polynomials;
                 Programming environments; Remote procedure calls;
                 Shared memory systems; Software performance evaluation;
                 Symbol manipulation; Workstations",
}

@Article{Byrd:1995:MPA,
  author =       "G. T. Byrd and M. A. Holliday",
  title =        "Multithreaded processor architectures",
  journal =      j-IEEE-SPECTRUM,
  volume =       "32",
  number =       "8",
  pages =        "38--46",
  month =        aug,
  year =         "1995",
  CODEN =        "IEESAM",
  DOI =          "https://doi.org/10.1109/6.402166",
  ISSN =         "0018-9235 (print), 1939-9340 (electronic)",
  ISSN-L =       "0018-9235",
  bibdate =      "Thu Jan 16 07:37:23 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeespectrum1990.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Spectrum",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=6",
  keywords =     "Application software; Computer architecture; computer
                 architecture; Delay; Hardware; High performance
                 computing; idle cycles; instruction streams; Job shop
                 scheduling; Large-scale systems; latency;
                 microprocessor chips; multiple concurrent execution
                 streams; multiprogramming; multithreaded processor
                 architectures; performance; Registers; single
                 processor; Supercomputers; time-consuming operation",
}

@Article{Caudal:1995:DEM,
  author =       "F. Caudal and B. Lecussan",
  title =        "Design and Evaluation of a Multi-Threaded Architecture
                 for Parallel Graph Reduction",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "964",
  pages =        "411--??",
  year =         "1995",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat May 11 13:45:32 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Cejtin:1995:HOD,
  author =       "Henry Cejtin and Suresh Jagannathan and Richard
                 Kelsey",
  title =        "Higher-Order Distributed Objects",
  journal =      j-TOPLAS,
  volume =       "17",
  number =       "5",
  pages =        "704--739",
  month =        sep,
  year =         "1995",
  CODEN =        "ATPSDT",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Fri Jan 5 07:58:42 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/toc/Abstracts/0164-0925/213986.html",
  abstract =     "We describe a distributed implementation of Scheme
                 that permits efficient transmission of higher-order
                 objects such as closures and continuations. The
                 integration of distributed communication facilities
                 within a higher-order programming language engenders a
                 number of new abstractions and paradigms for
                 distributed computing. Among these are user-specified
                 load-balancing and migration policies for threads,
                 incrementally linked distributed computations, and
                 parameterized client-server applications. To our
                 knowledge, this is the first distributed dialect of
                 Scheme (or a related language) that addresses
                 lightweight communication abstractions for higher-order
                 objects.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
  keywords =     "experimentation; languages",
  subject =      "{\bf D.1.3}: Software, PROGRAMMING TECHNIQUES,
                 Concurrent Programming, Distributed programming. {\bf
                 D.3.2}: Software, PROGRAMMING LANGUAGES, Language
                 Classifications, Applicative languages. {\bf D.3.2}:
                 Software, PROGRAMMING LANGUAGES, Language
                 Classifications, Extensible languages. {\bf D.3.3}:
                 Software, PROGRAMMING LANGUAGES, Language Constructs
                 and Features, Concurrent programming structures. {\bf
                 D.3.2}: Software, PROGRAMMING LANGUAGES, Language
                 Classifications, SCHEME.",
}

@Article{Chang:1995:CSM,
  author =       "C.-Y. Chang and J.-P. Sheu",
  title =        "Compile-time scheduling of multithread with data
                 localities on multiple vector processors",
  journal =      j-CPE,
  volume =       "7",
  number =       "5",
  pages =        "349--369",
  month =        aug,
  year =         "1995",
  CODEN =        "CPEXEI",
  ISSN =         "1040-3108",
  ISSN-L =       "1040-3108",
  bibdate =      "Tue Sep 7 05:40:19 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency, practice and experience",
}

%%% NOTE(review): this entry duplicates Chang:1995:CSM above (identical
%%% fields except for one bibsource line); one of the two keys should
%%% eventually be retired (kept for now in case either key is cited).
@Article{Chang:1995:CTS,
  author =       "C.-Y. Chang and J.-P. Sheu",
  title =        "Compile-time scheduling of multithread with data
                 localities on multiple vector processors",
  journal =      j-CPE,
  volume =       "7",
  number =       "5",
  pages =        "349--369",
  month =        aug,
  year =         "1995",
  CODEN =        "CPEXEI",
  ISSN =         "1040-3108",
  ISSN-L =       "1040-3108",
  bibdate =      "Tue Sep 7 05:40:19 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency, practice and experience",
}

@Article{Chong:1995:PAF,
  author =       "Yong-Kim Chong and Kai Hwang",
  title =        "Performance Analysis of Four Memory Consistency Models
                 for Multithreaded Multiprocessors",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "6",
  number =       "10",
  pages =        "1085--1099",
  month =        oct,
  year =         "1995",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/71.473517",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Fri Nov 6 12:31:15 MST 1998",
  bibsource =    "Compendex database;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.computer.org/tpds/td1995/l1085abs.htm",
  acknowledgement = ack-nhfb,
  affiliation =  "Nanyang Technological Univ",
  affiliationaddress = "Singapore, Singapore",
  classification = "716.1; 722.1; 722.3; 722.4; 921.4; 922.1; C1160
                 (Combinatorial mathematics); C5440 (Multiprocessing
                 systems); C5470 (Performance evaluation and testing)",
  corpsource =   "Sch. of Electr. and Electron. Eng., Nanyang Technol.
                 Univ., Singapore",
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
  journalabr =   "IEEE Trans Parallel Distrib Syst",
  keywords =     "attributes; Bandwidth; Buffer storage; cache
                 interferences; Computer networks; Computer selection
                 and evaluation; Computer simulation; Context switching;
                 Data communication systems; Data storage equipment;
                 Distributed shared memory; distributed shared memory
                 models; embedded Markov chains; evaluation; Latency
                 hiding techniques; Markov processes; memory consistency
                 models; Memory consistency models; memory event
                 reordering; multiprocessing systems; Multiprocessing
                 systems; multithreaded multiprocessors; Multithreaded
                 multiprocessors; performance; Performance; performance
                 analysis; Performance evaluation; Petri net models;
                 Petri nets; Processors; rate; scalable multiprocessors;
                 Scalable multiprocessors; stochastic timed Petri nets;
                 Stochastic timed Petri nets; synchronisation;
                 synchronization; Synchronization; Telecommunication
                 traffic; write buffers",
  treatment =    "A Application; P Practical",
}

@TechReport{Chrisochoides:1995:MMDa,
  author =       "Nikos Chrisochoides",
  title =        "Multithreaded model for dynamic load balancing
                 parallel adaptive {PDE} computations",
  type =         "Technical report",
  number =       "CTC95, TR221",
  institution =  "Cornell Theory Center, Cornell University",
  address =      "Ithaca, NY, USA",
  pages =        "23",
  year =         "1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  alttitle =     "Multi-threaded model for dynamic load balancing
                 parallel adaptive PDE computations",
}

@TechReport{Chrisochoides:1995:MMDb,
  author =       "Nikos Chrisochoides",
  title =        "Multithreaded model for dynamic load balancing
                 parallel adaptive {PDE} computations",
  type =         "{NASA} contractor report 198244; {ICASE} report
                 95-83.",
  institution =  "Institute for Computer Applications in Science and
                 Engineering NASA Langley Research Center",
  address =      "Hampton, VA, USA",
  pages =        "i + 23 + i",
  month =        nov,
  year =         "1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "To appear in Applied Numerical Mathematics Journal.",
  abstract =     "We present a multithreaded model for the dynamic
                 load-balancing of numerical, adaptive computations
                 required for the solution of Partial Differential
                 Equations (PDEs) on multiprocessors. Multithreading is
                 used as a means of exploring concurrency at the
                 processor level in order to tolerate synchronization
                 costs inherent to traditional (non-threaded) parallel
                 adaptive PDE solvers. Our preliminary analysis for
                 parallel, adaptive PDE solvers indicates that
                 multithreading can be used as a mechanism to mask
                 overheads required for the dynamic balancing of
                 processor workloads with computations required for the
                 actual numerical solution of the PDEs. Also,
                 multithreading can simplify the implementation of
                 dynamic load-balancing algorithms, a task that is very
                 difficult for traditional data parallel adaptive PDE
                 computations. Unfortunately, multithreading does not
                 always simplify program complexity, often makes code
                 re-usability difficult, and increases software
                 complexity.",
  acknowledgement = ack-nhfb,
  annote =       "Supported in part by an Alex Nason Prize Award
                 Supported in part by the NSF, supplemented by ARPA.
                 Supported in part by the National Aeronautics and Space
                 Administration.",
  keywords =     "Differential equations, Partial; Parallel programming
                 (Computer science); Synchronization; Threads (Computer
                 programs)",
}

@Article{Coorg:1995:PNS,
  author =       "S. R. Coorg",
  title =        "Partitioning Non-Strict Functional Languages for
                 Multi-Threaded Code Generation",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "983",
  pages =        "82--??",
  year =         "1995",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat May 11 13:45:32 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@MastersThesis{Divekar:1995:IMP,
  author =       "Ravindra Divekar",
  title =        "The impact of multithreading on the performance of
                 superscalar processors",
  type =         "Thesis ({M.A.})",
  number =       "2117",
  school =       "State University of New York at Binghamton, Thomas J.
                 Watson School of Engineering and Applied Science",
  address =      "Binghamton, NY, USA",
  pages =        "vi + 73",
  year =         "1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "Master's theses / State University of New York at
                 Binghamton",
  acknowledgement = ack-nhfb,
  keywords =     "Operating systems (Computers)",
}

@Article{Dorojevets:1995:MDA,
  author =       "M. N. Dorojevets and V. G. Oklobdzija",
  title =        "Multithreaded Decoupled Architecture",
  journal =      j-INT-J-HIGH-SPEED-COMPUTING,
  volume =       "7",
  number =       "3",
  pages =        "465--??",
  year =         "1995",
  CODEN =        "IHSCEZ",
  ISSN =         "0129-0533",
  bibdate =      "Mon Feb 25 11:19:23 MST 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 OCLC Article1st database",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Speed Computing
                 (IJHSC)",
}

@Article{Drusinsky:1995:VDE,
  author =       "Doron Drusinsky",
  title =        "Visually Designing Embedded-Systems Applications",
  journal =      j-DDJ,
  volume =       "20",
  number =       "6",
  pages =        "62, 64, 66, 68, 104--106",
  month =        jun,
  year =         "1995",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Jan 9 09:35:43 MST 1997",
  bibsource =    "Compendex database;
                 http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "Doron describes how design tools that incorporate
                 object-oriented inheritance and extended state diagrams
                 (the visual counterpart of finite state machines) can
                 be used to build control systems.",
  acknowledgement = ack-nhfb,
  affiliation =  "R-Active Concepts and Co-Active Concepts, Ltd",
  classification = "721.1; 722.4; 723.1; 723.1.1; 723.2; 723.5; C5140
                 (Firmware); C6110J (Object-oriented programming);
                 C6110P (Parallel programming); C6140D (High level
                 languages)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  journalabr =   "Dr Dobb's J Software Tools Prof Program",
  keywords =     "C; C (programming language); C++ listing; Codes
                 (SYMBOLS); Computer aided software engineering;
                 Computer software; Computer systems; Concurrency;
                 Digital answering machine; Embedded systems;
                 Embedded-systems application; ESD; Extended state
                 diagram; Extended state diagrams; Finite automata;
                 Finite state diagram; Firmware; Hierarchy; Inheritance;
                 Interactive computer systems; Microcode;
                 Multithreading; Object oriented programming;
                 Operating-system-like routine; Reactive system; Real
                 time system; State diagram; Synchronization; Systems
                 analysis; Visual synchronization; Visually designing",
  pagecount =    "4",
  thesaurus =    "C language; C listings; Firmware; Object-oriented
                 programming; Real-time systems",
}

@TechReport{Dubey:1995:SSM,
  author =       "Pradeep Dubey",
  title =        "Single-program speculative multithreading ({SPSM})
                 architecture: compiler-assisted fine-grained
                 multithreading",
  type =         "Research report",
  number =       "RC 19928 (88233)",
  institution =  "IBM T. J. Watson Research Center",
  address =      "Yorktown Heights, NY, USA",
  pages =        "25",
  day =          "6",
  month =        feb,
  year =         "1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Recent limit studies on instruction-level parallel
                 processing, based on non-numeric applications, have
                 reported significant performance gains from speculative
                 execution of multiple control flows. This paper
                 describes a new single-program speculative
                 multithreading (SPSM) architecture, which can be viewed
                 as an extension of any existing single-thread
                 architecture. It enables speculative fetch, decode, and
                 execution from multiple program locations
                 simultaneously. Instruction threads are generated at
                 compile-time using control dependence analysis.
                 Inter-thread data dependences are also analyzed at
                 compile-time. However, resource binding of instructions
                 is performed only at run time, to offer binary
                 compatibility across different implementations. New
                 thread generation algorithms, being prototyped in a
                 version of the TOBEY compiler, are also described. The
                 SPSM architecture includes novel fork/suspend
                 instructions which are used to identify independent
                 instruction threads, and also to specify compile-time
                 control flow speculations associated with inter-thread
                 dependences.",
  acknowledgement = ack-nhfb,
  keywords =     "Computer architecture",
}

@Article{Dugger:1995:MC,
  author =       "Jim Dugger",
  title =        "Multithreading in {C++}",
  journal =      j-CCCUJ,
  volume =       "13",
  number =       "11",
  pages =        "23--??",
  month =        nov,
  year =         "1995",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Fri Aug 30 16:52:23 MDT 1996",
  bibsource =    "http://www.cuj.com/cbklist.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@InProceedings{Elmasri:1995:TCL,
  author =       "N. Elmasri and H. H. J. Hum and G. R. Gao",
  title =        "The Threaded Communication Library: Preliminary
                 Experiences on a Multiprocessor with Dual-Processor
                 Nodes",
  crossref =     "ACM:1995:CPI",
  pages =        "195--199",
  year =         "1995",
  bibdate =      "Mon Aug 26 10:38:41 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{English:1995:MC,
  author =       "John English",
  title =        "Multithreading in {C++}",
  journal =      j-SIGPLAN,
  volume =       "30",
  number =       "4",
  pages =        "21--28",
  month =        apr,
  year =         "1995",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:17:03 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Manual{Fahringer:1995:UTDa,
  author =       "Thomas Fahringer and Matthew Haines and Piyush
                 Mehrotra",
  title =        "On the utility of threads for data parallel
                 programming",
  number =       "198155",
  publisher =    pub-NTIS,
  address =      pub-NTIS:adr,
  pages =        "??",
  year =         "1995",
  LCCN =         "NAS 1.26:198155 Govt Pubs",
  bibdate =      "Fri May 10 12:18:17 MDT 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Shipping list number 96-0037-M",
  series =       "NASA contractor report",
  acknowledgement = ack-nhfb,
  keywords =     "computation; interprocessor communication; parallel
                 programming; particle in cell technique; relaxation
                 method (mathematics)",
}

@InProceedings{Fahringer:1995:UTDb,
  author =       "T. Fahringer and M. Haines and P. Mehrotra",
  title =        "On the Utility of Threads for Data Parallel
                 Programming",
  crossref =     "ACM:1995:CPI",
  pages =        "51--59",
  year =         "1995",
  bibdate =      "Mon Aug 26 10:38:41 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@InProceedings{Field:1995:PPS,
  author =       "John Field and G. Ramalingam and Frank Tip",
  title =        "Parametric program slicing",
  crossref =     "ACM:1995:CRP",
  pages =        "379--392",
  year =         "1995",
  bibdate =      "Mon May 3 12:52:30 MDT 1999",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/plan/199448/p379-field/",
  abstract =     "Program slicing is a technique for isolating
                 computational threads in programs. In this paper, we
                 show how to mechanically extract a family of practical
                 algorithms for computing slices directly from semantic
                 specifications. These algorithms are based on combining
                 the notion of {\em dynamic dependence tracking\/} in
                 term rewriting systems with a program representation
                 whose behavior is defined via an equational logic. Our
                 approach is distinguished by the fact that changes to
                 the behavior of the slicing algorithm can be
                 accomplished through simple changes in rewriting rules
                 that define the semantics of the program
                 representation. Thus, e.g., different notions of
                 dependence may be specified, properties of
                 language-specific datatypes can be exploited, and
                 various time, space, and precision tradeoffs may be
                 made. This flexibility enables us to generalize the
                 traditional notions of static and dynamic slices to
                 that of a {\em constrained\/} slice, where any subset
                 of the inputs of a program may be supplied.",
  acknowledgement = ack-nhfb,
  keywords =     "algorithms; languages",
  subject =      "{\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS
                 OF PROGRAMS, Studies of Program Constructs, Program and
                 recursion schemes. {\bf F.3.3} Theory of Computation,
                 LOGICS AND MEANINGS OF PROGRAMS, Studies of Program
                 Constructs, Functional constructs. {\bf F.3.2} Theory
                 of Computation, LOGICS AND MEANINGS OF PROGRAMS,
                 Semantics of Programming Languages. {\bf F.3.1} Theory
                 of Computation, LOGICS AND MEANINGS OF PROGRAMS,
                 Specifying and Verifying and Reasoning about Programs,
                 Specification techniques. {\bf F.4.2} Theory of
                 Computation, MATHEMATICAL LOGIC AND FORMAL LANGUAGES,
                 Grammars and Other Rewriting Systems. {\bf D.3.2}
                 Software, PROGRAMMING LANGUAGES, Language
                 Classifications, C.",
}

@Article{Finger:1995:LTC,
  author =       "Jonathan Finger",
  title =        "Lightweight Tasks in {C}",
  journal =      j-DDJ,
  volume =       "20",
  number =       "5",
  pages =        "48, 50, 102",
  month =        may,
  year =         "1995",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Sep 03 09:16:50 1996",
  bibsource =    "Compendex database;
                 http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "While most modern operating systems allow multiple
                 threads within a process, earlier-generation systems do
                 not. Jonathan presents a multithreading package that
                 allows for cooperatively multitasked threads within a
                 single process for operating systems that do not
                 explicitly support threads.",
  acknowledgement = ack-nhfb,
  classification = "722.4; 723.1; 723.1.1; C6110B (Software engineering
                 techniques); C6150J (Operating systems)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  journalabr =   "Dr Dobb's J Software Tools Prof Program",
  keywords =     "C; C (programming language); Codes (SYMBOLS); Computer
                 operating systems; Context switch; Cooperative task
                 switching; Cooperatively multitasked threads; DOS; High
                 level language; Lightweight tasker; Lightweight tasks;
                 Microsoft compiler; Minicomputer platform; MIX
                 Software; Modern operating systems; Multi-C package;
                 Multiple processes; Multiprocessing systems;
                 Multiprogramming; Multitasking system; Multithreading
                 code; Multithreading package; Multiuser application;
                 Multiuser mailing list management system; PC/DOS
                 system; Preemptive task switching; Program compilers;
                 Software engineering; Tenberry Software; Threads;
                 Watcom compiler",
  pagecount =    "2",
  thesaurus =    "C listings; Multiprogramming; Software portability",
}

@Article{Fiske:1995:TPT,
  author =       "Stuart Fiske and William J. Dally",
  title =        "Thread prioritization: a thread scheduling mechanism
                 for multiple-context parallel processors",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "11",
  number =       "6",
  pages =        "503--518",
  month =        oct,
  year =         "1995",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Sat Jan 10 12:00:22 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
  remark =       "High-Performance Computer Architecture.",
}

@Article{Ford:1995:EDT,
  author =       "Dan Ford",
  title =        "Event-Driven Threads In {C++}",
  journal =      j-DDJ,
  volume =       "20",
  number =       "6",
  pages =        "48--50, 52, 54, 98, 100, 102",
  month =        jun,
  year =         "1995",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Jan 9 09:35:43 MST 1997",
  bibsource =    "Compendex database;
                 http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "Dan presents a powerful, multithreaded architecture
                 that can be used by almost any application. Implemented
                 in C++, this class library lets you quickly create and
                 control threads.",
  acknowledgement = ack-nhfb,
  affiliation =  "Hewlett--Packard",
  classification = "721.1; 722.4; 723.1; 723.1.1; 723.2; 723.5; C6110J
                 (Object-oriented programming); C6110P (Parallel
                 programming); C6140D (High level languages)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  journalabr =   "Dr Dobb's J Software Tools Prof Program",
  keywords =     "C; C (programming language); C++; Computer aided
                 software engineering; Computer architecture; Computer
                 simulation; Data structures; Equivalence classes; Event
                 driven threads; Hierarchical systems; Interthread
                 communication; Message driven thread; Multithreaded;
                 Multithreaded applications; Multithreading; Object
                 oriented programming; Object oriented programming
                 application; Object-oriented infrastructure; Parallel
                 processing; Parallelism; Synchronization;
                 Synchronization strategies",
  pagecount =    "5",
  thesaurus =    "C language; C listings; Object-oriented programming;
                 Parallel programming",
}

@Article{Ford:1995:ETC,
  author =       "Dan Ford",
  title =        "Event-Driven Threads In {C++}",
  journal =      j-DDJ,
  volume =       "20",
  number =       "6",
  pages =        "48--50, 52, 54, 98, 100, 102",
  month =        jun,
  year =         "1995",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Jan 9 09:35:43 MST 1997",
  bibsource =    "Compendex database;
                 http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "Dan presents a powerful, multithreaded architecture
                 that can be used by almost any application. Implemented
                 in C++, this class library lets you quickly create and
                 control threads.",
  acknowledgement = ack-nhfb,
  affiliation =  "Hewlett--Packard",
  classification = "721.1; 722.4; 723.1; 723.1.1; 723.2; 723.5; C6110J
                 (Object-oriented programming); C6110P (Parallel
                 programming); C6140D (High level languages)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  journalabr =   "Dr Dobb's J Software Tools Prof Program",
  keywords =     "C; C (programming language); C++; Computer aided
                 software engineering; Computer architecture; Computer
                 simulation; Data structures; Equivalence classes; Event
                 driven threads; Hierarchical systems; Interthread
                 communication; Message driven thread; Multithreaded;
                 Multithreaded applications; Multithreading; Object
                 oriented programming; Object oriented programming
                 application; Object-oriented infrastructure; Parallel
                 processing; Parallelism; Synchronization;
                 Synchronization strategies",
  pagecount =    "5",
  thesaurus =    "C language; C listings; Object-oriented programming;
                 Parallel programming",
  xxnote =       "This entry appears to be an exact duplicate of entry
                 Ford:1995:EDT under a second citation key; retained
                 because the key may already be cited, but one of the
                 two should eventually be removed.",
}

@Book{Gao:1995:ATD,
  author =       "Guang R. Gao and Lubomir Bic and Jean-Luc Gaudiot",
  title =        "Advanced topics in dataflow computing and
                 multithreading",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "x + 450",
  year =         "1995",
  ISBN =         "0-8186-6541-6 (hardcover), 0-8186-6540-8 (paperback),
                 0-8186-6542-4",
  ISBN-13 =      "978-0-8186-6541-7 (hardcover), 978-0-8186-6540-0
                 (paperback), 978-0-8186-6542-4",
  LCCN =         "QA76.9.A73 A356 1995",
  bibdate =      "Sat Apr 20 11:22:41 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "computer architecture; data structures (computer
                 science); parallel processing (electronic computers)",
}

@Article{Gerber:1995:IOX,
  author =       "Bob Gerber",
  title =        "{Informix} Online {XPS}",
  journal =      j-SIGMOD,
  volume =       "24",
  number =       "2",
  pages =        "463--463",
  month =        may,
  year =         "1995",
  CODEN =        "SRECD8",
  ISSN =         "0163-5808 (print), 1943-5835 (electronic)",
  ISSN-L =       "0163-5808",
  bibdate =      "Mon Jan 12 08:45:52 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6150N (Distributed
                 systems software); C6160B (Distributed databases)",
  fjournal =     "ACM SIGMOD Record",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J689",
  keywords =     "Informix Dynamic Scalable Architecture; Informix
                 Extended Parallel Server; Informix Online XPS; Large
                 SMP systems; Light access methods; Linear performance
                 speedups; Loosely coupled environments; Massively
                 parallel clusters; Online database servers; Online/DSA
                 servers; Open systems spectrum; Parallel database
                 systems; Parallel resource management; Pipelined hash
                 partitioned operators; SMP based high performance
                 parallel data query; Table partitioning; Uniprocessor
                 systems; XPS; XPS multithreaded process groups",
  thesaurus =    "Distributed databases; File servers; Parallel
                 programming; Query processing",
  xxcrossref =   "Anonymous:1995:ASI",
}

@Article{Girkar:1995:ETL,
  author =       "Milind Girkar and Constantine D. Polychronopoulos",
  title =        "Extracting Task-Level Parallelism",
  journal =      j-TOPLAS,
  volume =       "17",
  number =       "4",
  pages =        "600--634",
  month =        jul,
  year =         "1995",
  CODEN =        "ATPSDT",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Fri Jan 5 07:58:42 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/toc/Abstracts/0164-0925/210189.html",
  abstract =     "Automatic detection of {\em task-level parallelism\/}
                 (also referred to as functional, DAG, unstructured, or
                 thread parallelism) at various levels of program
                 granularity is becoming increasingly important for
                 parallelizing and back-end compilers. Parallelizing
                 compilers detect iteration-level or coarser granularity
                 parallelism which is suitable for parallel computers;
                 detection of parallelism at the statement-or
                 operation-level is essential for most modern
                 microprocessors, including superscalar and VLIW
                 architectures. In this article we study the problem of
                 detecting, expressing, and optimizing task-level
                 parallelism, where ``task'' refers to a program
                 statement of arbitrary granularity. Optimizing the
                 amount of functional parallelism (by allowing
                 synchronization between arbitrary nodes) in sequential
                 programs requires the notion of {\em precedence\/} in
                 terms of paths in graphs which incorporate control and
                 data dependences. Precedences have been defined before
                 in a different context; however, the definition was
                 dependent on the ideas of parallel execution and time.
                 We show that the problem of determining precedences
                 statically is NP-complete. Determining precedence
                 relationships is useful in finding the essential data
                 dependences. We show that there exists a unique minimum
                 set of essential data dependences; finding this minimum
                 set is NP-hard and NP-easy. We also propose a heuristic
                 algorithm for finding the set of essential data
                 dependences. Static analysis of a program in the
                 Perfect Benchmarks was done, and we present some
                 experimental results.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
  keywords =     "algorithms; experimentation; languages; theory",
  subject =      "{\bf D.3.4}: Software, PROGRAMMING LANGUAGES,
                 Processors, Optimization. {\bf D.3.4}: Software,
                 PROGRAMMING LANGUAGES, Processors, Compilers. {\bf
                 F.1.3}: Theory of Computation, COMPUTATION BY ABSTRACT
                 DEVICES, Complexity Classes, Reducibility and
                 completeness. {\bf D.3.4}: Software, PROGRAMMING
                 LANGUAGES, Processors, Code generation.",
}

@Article{Goossens:1995:FPM,
  author =       "B. Goossens and D. T. Vu",
  title =        "Further Pipelining and Multithreading to Improve
                 {RISC} Processor Speed. {A} Proposed Architecture and
                 Simulation Results",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "964",
  pages =        "326--??",
  year =         "1995",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat May 11 13:45:32 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@MastersThesis{Gulati:1995:MSM,
  author =       "Manu Gulati",
  title =        "Multithreading on a superscalar microprocessor",
  type =         "Thesis ({M.S., Engineering})",
  school =       "University of California, Irvine",
  address =      "Irvine, CA, USA",
  pages =        "x + 102",
  year =         "1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Manual{Haines:1995:RSC,
  author =       "Matthew Haines and Piyush Mehrotra and David Cronk",
  title =        "Ropes, support for collective operations among
                 distributed threads",
  number =       "198157",
  publisher =    pub-NTIS,
  address =      pub-NTIS:adr,
  pages =        "??",
  year =         "1995",
  LCCN =         "NAS 1.26:198157 Govt Pubs",
  bibdate =      "Fri May 10 12:18:17 MDT 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Shipping list number 96-0037-M",
  series =       "NASA contractor report",
  acknowledgement = ack-nhfb,
  keywords =     "computer system design; distributed processing;
                 interprocessor communication; memory (computers);
                 numerical control; parallel programming; threads",
}

@Article{Jensen:1995:DRT,
  author =       "E. Douglas Jensen",
  title =        "Distributed real-time operating systems",
  journal =      j-DDJ,
  volume =       "20",
  number =       "2",
  pages =        "32--34, 36, 38",
  month =        feb,
  year =         "1995",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Sep 10 08:45:36 MDT 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6150N (Distributed systems software)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "Distributed objects; Distributed operating systems;
                 Operating systems; Real-time computing; Real-time
                 operating systems; Real-time paradigm; Threads",
  thesaurus =    "Network operating systems; Real-time systems",
}

@Article{Kavi:1995:DCM,
  author =       "Krishna M. Kavi and A. R. Hurson and Phenil Patadia
                 and Elizabeth Abraham and Ponnarasu Shanmugam",
  title =        "Design of cache memories for multi-threaded dataflow
                 architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "253--264",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Kawamoto:1995:MTP,
  author =       "S.-I. Kawamoto and T. Ito",
  title =        "Multi-threaded {PaiLisp} with Granularity Adaptive
                 Parallel Execution",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "907",
  pages =        "94--??",
  year =         "1995",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat May 11 13:45:32 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Kleiman:1995:IT,
  author =       "Steve Kleiman and Joe Eykholt",
  title =        "Interrupts as threads",
  journal =      j-OPER-SYS-REV,
  volume =       "29",
  number =       "2",
  pages =        "21--26",
  month =        apr,
  year =         "1995",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Book{Kleiman:1995:PT,
  author =       "Steve Kleiman and Devang Shah and Bart Smaalders",
  title =        "Programming With Threads",
  publisher =    pub-SUNSOFT,
  address =      pub-SUNSOFT:adr,
  pages =        "xxviii + 534",
  year =         "1995",
  ISBN =         "0-13-172389-8",
  ISBN-13 =      "978-0-13-172389-4",
  LCCN =         "QA76.58.K59 1996",
  bibdate =      "Wed Dec 09 12:51:22 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  price =        "US\$48.00",
  URL =          "http://www.amazon.com/exec/obidos/ISBN=0131723898/sunworldonlineA/002-4892305-5599452",
  acknowledgement = ack-nhfb,
}

@Article{Lam:1995:CPC,
  author =       "Richard B. Lam",
  title =        "Cross-platform communication classes",
  journal =      j-DDJ,
  volume =       "20",
  number =       "3",
  pages =        "20, 22, 24, 26",
  month =        mar,
  year =         "1995",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Sep 10 08:45:36 MDT 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "Richard summarizes common techniques for interprocess
                 communication, presenting a library that implements
                 semaphores in a platform-independent manner to allow
                 signaling or controlling of shared resources between
                 processes and threads.",
  acknowledgement = ack-nhfb,
  classification = "C5620L (Local area networks); C6110J
                 (Object-oriented programming); C6140D (High level
                 languages); C6150N (Distributed systems software)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "AIX; C++ libraries; Client/server computing; Cross
                 platform C++ libraries; Cross-platform communication
                 classes; Example library; Graphical user interfaces;
                 Interprocess communications; OS/2; Semaphores; Shared
                 resources; Windows NT",
  thesaurus =    "C language; Client-server systems; Object-oriented
                 languages; Object-oriented programming; Resource
                 allocation; Software libraries",
}

@Article{Larcheveque:1995:OIP,
  author =       "J.-M. Larchev{\^e}que",
  title =        "Optimal Incremental Parsing",
  journal =      j-TOPLAS,
  volume =       "17",
  number =       "1",
  pages =        "1--15",
  month =        jan,
  year =         "1995",
  CODEN =        "ATPSDT",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Fri Jan 5 07:58:42 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/toc/Abstracts/0164-0925/200996.html",
  abstract =     "This communication sets the problem of incremental
                 parsing in the context of a complete incremental
                 compiling system. It turns out that, according to the
                 incrementally paradigm of the attribute evaluator and
                 data-flow analyzer to be used, two definitions of
                 optimal incrementality in a parser are possible.
                 Algorithms for achieving both forms of optimality are
                 given, both of them based on ordinary LALR(1) parse
                 tables. Optimality and correctness proofs, which are
                 merely outlined in this communication, are made
                 intuitive thanks to the concept of a {\em well-formed
                 list of threaded trees}, a natural extension of the
                 concept of {\em threaded tree\/} found in earlier works
                 on incremental parsing.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
  keywords =     "algorithms; performance; theory",
  subject =      "{\bf D.3.4}: Software, PROGRAMMING LANGUAGES,
                 Processors, Parsing. {\bf D.2.6}: Software, SOFTWARE
                 ENGINEERING, Programming Environments, Interactive.
                 {\bf D.3.4}: Software, PROGRAMMING LANGUAGES,
                 Processors, Compilers. {\bf E.1}: Data, DATA
                 STRUCTURES, Trees.",
}

@Article{Lenatti:1995:RPM,
  author =       "C. Lenatti",
  title =        "{Rethinking in Parallel: Multiprocessing is on the
                 rise, despite a dearth of tools to help create
                 multithreaded applications}",
  journal =      j-UNIXWORLD-OPEN-COMP,
  volume =       "12",
  number =       "8",
  pages =        "57--??",
  year =         "1995",
  CODEN =        "OPCOEB",
  ISSN =         "1072-4044",
  bibdate =      "Fri Jan 26 17:24:01 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "UnixWorld's Open Computing",
}

@Article{Leppanen:1995:PWO,
  author =       "Ville Lepp{\"a}nen",
  title =        "Performance of work-optimal {PRAM} simulation
                 algorithms on coated meshes",
  journal =      j-COMP-J,
  volume =       "38",
  number =       "10",
  pages =        "801--810",
  month =        "????",
  year =         "1995",
  CODEN =        "CMPJA6",
  ISSN =         "0010-4620 (print), 1460-2067 (electronic)",
  ISSN-L =       "0010-4620",
  bibdate =      "Wed Jul 21 09:54:40 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.oup.co.uk/computer_journal/Volume_38/Issue_10/Vol38_10.index.html",
  URL =          "http://www3.oup.co.uk/computer_journal/Volume_38/Issue_10/Vol38_10.body.html#AbstractLeppanen",
  acknowledgement = ack-nhfb,
  author-1-adr = "Department of Computer Science, University of Turku,
                 Lemmink{\"a}isenkatu 14-18, Datacity, FIN-20520 Turku,
                 Finland",
  classcodes =   "C5220P (Parallel architecture); C7430 (Computer
                 engineering); C5320G (Semiconductor storage); C6110P
                 (Parallel programming); C4240C (Computational
                 complexity)",
  corpsource =   "Dept. of Comput. Sci., Turku Univ., Finland",
  email-1 =      "Ville.Leppanen@cs.utu.fi",
  fjournal =     "The Computer Journal",
  journal-URL =  "http://comjnl.oxfordjournals.org/",
  keywords =     "architectures; coated meshes; combining queues method;
                 computational complexity; cost; greedy routing; mesh
                 connected routing machinery; multithreading level;
                 parallel; parallel algorithms; random-access storage;
                 routing steps; simulated PRAM processors; simulation;
                 sorting; synchronization wave; virtual leveled network
                 technique; virtual machines; work optimal PRAM
                 simulation algorithms",
  treatment =    "P Practical",
}

@TechReport{Lim:1995:LPB,
  author =       "Beng-Hong Lim and Ricardo Bianchini",
  title =        "Limits on the performance benefits of multithreading
                 and prefetching",
  type =         "Research report",
  number =       "RC 20238 (89547)",
  institution =  "IBM T. J. Watson Research Center",
  address =      "Yorktown Heights, NY, USA",
  pages =        "23",
  day =          "20",
  month =        oct,
  year =         "1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  annote =       "Supported in part by ARPA. Supported in part by NSF
                 Experimental Systems. Supported in part by a NSF
                 Presidential Young Investigator Award",
  keywords =     "Cache memory; Fault-tolerant computing;
                 Multiprocessors",
}

@MastersThesis{Loikkanen:1995:FMS,
  author =       "Matias Loikkanen",
  title =        "A fine-grain multithreading superscalar architecture",
  type =         "Thesis ({M.S., Engineering})",
  school =       "University of California, Irvine",
  address =      "Irvine, CA, USA",
  pages =        "xi + 103",
  year =         "1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@MastersThesis{Lu:1995:HMC,
  author =       "Howard J. (Howard Jason) Lu",
  title =        "Heterogeneous multithreaded computing",
  type =         "Thesis ({M. Eng.})",
  school =       "Massachusetts Institute of Technology, Department of
                 Electrical Engineering and Computer Science",
  address =      "Cambridge, MA, USA",
  pages =        "21",
  year =         "1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Maquelin:1995:CBM,
  author =       "O. C. Maquelin and H. H. J. Hum and G. R. Gao",
  title =        "Costs and Benefits of Multithreading with
                 Off-the-Shelf {RISC} Processors",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "966",
  pages =        "117--??",
  year =         "1995",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat May 11 13:45:32 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@TechReport{Marsland:1995:SSM,
  author =       "T. A. Marsland and Yaoqing Gao and Francis Chi-Moon
                 Lau",
  title =        "A study of software multithreading in distributed
                 systems",
  type =         "Technical report",
  number =       "TR 95-23",
  institution =  "Dept. of Computing Science, University of Alberta",
  address =      "Edmonton, AB, Canada",
  pages =        "25",
  year =         "1995",
  ISSN =         "0316-4683",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Mayes:1995:ULT,
  author =       "K. R. Mayes and S. Quick and B. C. Warboys",
  title =        "User-level threads on a general hardware interface",
  journal =      j-OPER-SYS-REV,
  volume =       "29",
  number =       "4",
  pages =        "57--62",
  month =        oct,
  year =         "1995",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@MastersThesis{Metz:1995:IDS,
  author =       "David Metz",
  title =        "Interface design and system impact analysis of a
                 message-handling processor for fine-grain
                 multithreading",
  type =         "Thesis ({M.S.})",
  school =       "Oregon State University",
  address =      "Corvallis, OR, USA",
  pages =        "63",
  year =         "1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Multiprocessors; Parallel processing (Electronic
                 computers)",
}

@MastersThesis{Miller:1995:TPC,
  author =       "Robert C. (Robert Chisolm) Miller",
  title =        "A type-checking preprocessor for {Cilk 2}, a
                 multithreaded {C} language",
  type =         "Thesis ({M. Eng.})",
  school =       "Massachusetts Institute of Technology, Department of
                 Electrical Engineering and Computer Science",
  address =      "Cambridge, MA, USA",
  pages =        "38",
  year =         "1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@PhdThesis{Moore:1995:MPD,
  author =       "Simon W. Moore",
  title =        "Multithreaded processor design",
  type =         "Thesis ({Ph.D.})",
  school =       "University of Cambridge, Computer Laboratory",
  address =      "Cambridge, Cambridgeshire, UK",
  pages =        "xvi + 125",
  month =        feb,
  year =         "1995",
  LCCN =         "QA76.9.A73 M66 1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Available as Technical Report 358.",
  abstract =     "Multithreaded processors aim to improve upon both
                 control-flow and data-flow processor models by forming
                 some amalgam of the two. They combine sequential
                 behaviour from the control-flow model with concurrent
                 aspects from data-flow design. Some multithreaded
                 processor designs have added just a little concurrency
                 to control-flow or limited sequential execution to
                 data-flow. This thesis demonstrates that more
                 significant benefits may be obtained by a more radical
                 amalgamation of the two models. A data-driven
                 microthread model is proposed, where a microthread is a
                 short control-flow code sequence. To demonstrate the
                 efficiency of this model, a suitable multithreaded
                 processor, called Anaconda, is designed and evaluated.
                 Anaconda incorporates a scalable temporally predictable
                 memory tree structure with distributed virtual address
                 translation and memory protection. A temporally
                 predictable cached direct-mapped matching store is
                 provided to synchronise data to microthreads. Code is
                 prefetched into an instruction cache before execution
                 commences. Earliest-deadline-first or fixed-priority
                 scheduling is supported via a novel hardware priority
                 queue. Control-flow execution is performed by a
                 modified Alpha 21064 styled pipeline which assists
                 comparison with commercial processors.",
  acknowledgement = ack-nhfb,
  annote =       "Supported in part by a studentship from the UK Science
                 and Engineering Research Council",
  keywords =     "Computer architecture",
}

@Article{Oikawa:1995:RDU,
  author =       "Shuichi Oikawa and Hideyuki Tokuda",
  title =        "Reflection of developing user-level real-time thread
                 packages",
  journal =      j-OPER-SYS-REV,
  volume =       "29",
  number =       "4",
  pages =        "63--76",
  month =        oct,
  year =         "1995",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Prabhakar:1995:IDO,
  author =       "Ernest N. Prabhakar",
  title =        "Implementing Distributed Objects",
  journal =      j-DDJ,
  volume =       "20",
  number =       "8",
  pages =        "80, 82, 84--85, 105--106",
  month =        aug,
  year =         "1995",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Jan 9 09:35:43 MST 1997",
  bibsource =    "Compendex database;
                 http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "Ernest uses NeXT's PDO and Objective-C to implement a
                 simple client-server application that packages a legacy
                 application into an interoperable object and its
                 client.",
  acknowledgement = ack-nhfb,
  affiliation =  "NextStep\slash OpenStep User Groups Int",
  classification = "722.1; 722.2; 722.3; 722.4; 723.1; C5620L (Local
                 area networks); C6110J (Object-oriented programming);
                 C6110P (Parallel programming); C6140D (High level
                 languages)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  journalabr =   "Dr Dobb's J Software Tools Prof Program",
  keywords =     "Codes (symbols); Computer networks; Distributed
                 applications; Distributed computer systems; Distributed
                 objects; Interfaces (COMPUTER); Interoperable object;
                 Interoperable objects; Legacy application;
                 Multithreaded object; Network protocols; NeXT; Object
                 oriented programming; Objective-C; PDO; Portable
                 distributed objects; Program compilers; Simple client
                 server application; Software prototyping; Storage
                 allocation (computer); Table lookup",
  pagecount =    "4",
  thesaurus =    "C language; C listings; Client-server systems;
                 Object-oriented programming; Parallel programming",
}

@Article{Prasad:1995:WNT,
  author =       "Shashi Prasad",
  title =        "{Windows NT} Threads --- a multithreaded application
                 may actually run slower on an {SMP} machine than on its
                 single-threaded equivalent. {Here}'s how to avoid that",
  journal =      j-BYTE,
  volume =       "20",
  number =       "11",
  pages =        "253--??",
  month =        nov,
  year =         "1995",
  CODEN =        "BYTEDJ",
  ISSN =         "0360-5280 (print), 1082-7838 (electronic)",
  ISSN-L =       "0360-5280",
  bibdate =      "Mon Aug 19 08:30:25 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "BYTE Magazine",
}

@Article{Prasad:1995:WTS,
  author =       "Shashi Prasad",
  title =        "Weaving a Thread --- {Solaris} and {Windows NT} bring
                 the power, speed, and efficiency of multithreading and
                 symmetric multiprocessing to the desktop",
  journal =      j-BYTE,
  volume =       "20",
  number =       "10",
  pages =        "173--??",
  month =        oct,
  year =         "1995",
  CODEN =        "BYTEDJ",
  ISSN =         "0360-5280 (print), 1082-7838 (electronic)",
  ISSN-L =       "0360-5280",
  bibdate =      "Mon Aug 19 08:30:21 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "BYTE Magazine",
}

@Book{Reich:1995:DHP,
  author =       "David E. Reich",
  title =        "Designing high-powered {OS/2 Warp} applications: the
                 anatomy of multithreaded programs",
  publisher =    pub-WILEY,
  address =      pub-WILEY:adr,
  pages =        "xxxi + 336",
  year =         "1995",
  ISBN =         "0-471-11586-X (paperback)",
  ISBN-13 =      "978-0-471-11586-1 (paperback)",
  LCCN =         "QA76.76.O63 R437 1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Application software; Microcomputers -- Operating
                 systems; Operating systems (Computers); OS/2 Warp",
}

@Article{Rodens:1995:ESC,
  author =       "Ira Rodens",
  title =        "Examining {Symantec C++} 7.0",
  journal =      j-DDJ,
  volume =       "20",
  number =       "8",
  pages =        "86--89, 106--107",
  month =        aug,
  year =         "1995",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Jan 9 09:35:43 MST 1997",
  bibsource =    "Compendex database;
                 http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "Among other features, this recent incarnation of
                 Symantec C++ sports a visual programming environment,
                 class and hierarchy editors, distributed build tools,
                 and support for templates, exceptions, and run-time
                 type identification. Compiler author Walter Bright adds
                 tips and techniques for optimizing C++ code.",
  acknowledgement = ack-nhfb,
  affiliation =  "CompuServe",
  classification = "722.2; 723.1; 723.1.1; 723.5; C6110J
                 (Object-oriented programming); C6110V (Visual
                 programming); C6115 (Programming support); C6130B
                 (Graphics techniques); C6150G (Diagnostic, testing,
                 debugging and evaluating systems); C6180G (Graphical
                 user interfaces)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  journalabr =   "Dr Dobb's J Software Tools Prof Program",
  keywords =     "32-Bit multithreaded linker; Benchmarking; Browsers;
                 Build tasks; C (programming language); C++ language;
                 Codes (SYMBOLS); Computer programming; Distributed
                 build tools; DOS; Exceptions; Express Agents; File
                 editors; Graphical user interfaces; Hierarchy editors;
                 LAN; Linker; Multiscope debugger; Program compilers;
                 Program debugging; Run time type identification; Run
                 time type identification programming environment;
                 Software engineering; Symantec C++ 7; Templates;
                 Upgraded Microsoft Foundation Classes; Visual
                 programming; Visual programming environment; Visual
                 tools; Windows 95 resources",
  thesaurus =    "Graphical user interfaces; Object-oriented
                 programming; Program debugging; Software reviews;
                 Software tools; Visual programming",
}

@Article{Rodley:1995:TPU,
  author =       "John Rodley",
  title =        "Thread Programming In {UnixWare} 2.0",
  journal =      j-DDJ,
  volume =       "20",
  number =       "6",
  pages =        "56, 58--61, 102, 104",
  month =        jun,
  year =         "1995",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Jan 9 09:35:43 MST 1997",
  bibsource =    "Compendex database;
                 http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "With the advent of UnixWare 2.0, threads have made
                 their way to the UNIX desktop. John describes how
                 threads are implemented and how you can take advantage
                 of them.",
  acknowledgement = ack-nhfb,
  classification = "722.2; 722.4; 723.1; 723.2; 723.5; C6110P (Parallel
                 programming); C6150J (Operating systems); C6150N
                 (Distributed systems software)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  journalabr =   "Dr Dobb's J Software Tools Prof Program",
  keywords =     "Computer aided software engineering; Computer
                 programming; Computer simulation; Concurrency
                 programming; Fork; Lightweight processes;
                 Multiprocessing; Multiprocessing systems;
                 Multithreading; Object oriented programming; P1003.1c;
                 Parallel programming; POSIX Portable Operating Systems
                 Standard; Real time systems; Signal processing; Thread
                 programming; Thread specification; UNIX; UnixWare 2.0;
                 User interfaces",
  pagecount =    "5",
  thesaurus =    "Multiprocessing programs; Parallel programming; Unix",
}

@Article{Rogers:1995:SDD,
  author =       "Anne Rogers and Martin C. Carlisle and John H. Reppy
                 and L. J. Hendren",
  title =        "Supporting Dynamic Data Structures on
                 Distributed-Memory Machines",
  journal =      j-TOPLAS,
  volume =       "17",
  number =       "2",
  pages =        "233--263",
  month =        mar,
  year =         "1995",
  CODEN =        "ATPSDT",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Fri Jan 5 07:58:42 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/toc/Abstracts/0164-0925/201065.html",
  abstract =     "Compiling for distributed-memory machines has been a
                 very active research area in recent years. Much of this
                 work has concentrated on programs that use arrays as
                 their primary data structures. To date, little work has
                 been done to address the problem of supporting programs
                 that use pointer-based dynamic data structures. The
                 techniques developed for supporting SPMD execution of
                 array-based programs rely on the fact that arrays are
                 statically defined and directly addressable. Recursive
                 data structures do not have these properties, so new
                 techniques must be developed. In this article, we
                 describe an execution model for supporting programs
                 that use pointer-based dynamic data structures. This
                 model uses a simple mechanism for migrating a thread of
                 control based on the layout of heap-allocated data and
                 introduces parallelism using a technique based on
                 futures and lazy task creation. We intend to exploit
                 this execution model using compiler analyses and
                 automatic parallelization techniques. We have
                 implemented a prototype system, which we call {\em
                 Olden}, that runs on the Intel iPSC/860 and the
                 Thinking Machines CM-5. We discuss our implementation
                 and report on experiments with five benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
  keywords =     "experimentation; languages; measurement; performance",
  subject =      "{\bf D.3.4}: Software, PROGRAMMING LANGUAGES,
                 Processors, Run-time environments. {\bf D.1.3}:
                 Software, PROGRAMMING TECHNIQUES, Concurrent
                 Programming, Parallel programming. {\bf D.3.4}:
                 Software, PROGRAMMING LANGUAGES, Processors, Compilers.
                 {\bf D.3.3}: Software, PROGRAMMING LANGUAGES, Language
                 Constructs and Features, Data types and structures.
                 {\bf D.3.3}: Software, PROGRAMMING LANGUAGES, Language
                 Constructs and Features, Dynamic storage management.",
}

@PhdThesis{Roh:1995:CGE,
  author =       "Lucas J. Roh",
  title =        "Code generations, evaluations, and optimizations in
                 multithreaded executions",
  type =         "Thesis ({Ph.D.})",
  school =       inst-CSU,
  address =      inst-CSU:adr,
  pages =        "ix + 154",
  year =         "1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Code generators; Computer architecture; Parallel
                 processing (Electronic computers)",
}

@InProceedings{Schauser:1995:SCP,
  author =       "Klaus E. Schauser and David E. Culler and Seth C.
                 Goldstein",
  title =        "Separation constraint partitioning: a new algorithm
                 for partitioning non-strict programs into sequential
                 threads",
  crossref =     "ACM:1995:CRP",
  pages =        "259--271",
  year =         "1995",
  bibdate =      "Mon May 3 12:52:30 MDT 1999",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/plan/199448/p259-schauser/",
  abstract =     "In this paper we present substantially improved thread
                 partitioning algorithms for modern implicitly parallel
                 languages. We present a new block partitioning
                 algorithm, {\em separation constraint partitioning\/},
                 which is both more powerful and more flexible than
                 previous algorithms. Our algorithm is guaranteed to
                 derive maximal threads. We present a theoretical
                 framework for proving the correctness of our
                 partitioning approach, and we show how separation
                 constraint partitioning makes interprocedural
                 partitioning viable. We have implemented the
                 partitioning algorithms in an Id90 compiler for
                 workstations and parallel machines. Using this
                 experimental platform, we quantify the effectiveness of
                 different partitioning schemes on whole applications.",
  acknowledgement = ack-nhfb,
  keywords =     "algorithms; experimentation; languages; theory;
                 verification",
  subject =      "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language
                 Classifications, Parallel C. {\bf D.3.4} Software,
                 PROGRAMMING LANGUAGES, Processors, Compilers. {\bf
                 F.2.2} Theory of Computation, ANALYSIS OF ALGORITHMS
                 AND PROBLEM COMPLEXITY, Nonnumerical Algorithms and
                 Problems, Computations on discrete structures. {\bf
                 F.3.3} Theory of Computation, LOGICS AND MEANINGS OF
                 PROGRAMS, Studies of Program Constructs.",
}

@MastersThesis{Shahnaz:1995:DMD,
  author =       "Munira Shahnaz",
  title =        "Design of a multithreaded data cache for a hyperscalar
                 processor",
  type =         "Thesis ({M.S.})",
  school =       "Department of Electrical Engineering, Texas A\&M
                 University",
  address =      "College Station, TX, USA",
  pages =        "xi + 80",
  year =         "1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Major electrical engineering",
}

@PhdThesis{Shankar:1995:STI,
  author =       "Bhanu Shankar",
  title =        "The spectrum of thread implementations on hybrid
                 multithreaded architectures",
  type =         "Thesis ({Ph.D.})",
  school =       inst-CSU,
  address =      inst-CSU:adr,
  pages =        "xi + 176",
  year =         "1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Computer architecture; Parallel processing (Electronic
                 computers)",
}

@TechReport{Small:1995:SAB,
  author =       "Christopher Small and Margo Seltzer",
  title =        "Scheduler activations on {BSD}: sharing thread
                 management between kernel and application",
  type =         "Technical Report",
  number =       "31-95",
  institution =  "Center for Research in Computing Technology, Harvard
                 University",
  address =      "Cambridge, MA, USA",
  pages =        "12",
  year =         "1995",
  bibdate =      "Tue Sep 17 07:11:15 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Spertus:1995:ELB,
  author =       "Ellen Spertus and William J. Dally",
  title =        "Evaluating the locality benefits of active messages",
  journal =      j-SIGPLAN,
  volume =       "30",
  number =       "8",
  pages =        "189--198",
  month =        aug,
  year =         "1995",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:17:08 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "A major challenge in fine-grained computing is
                 achieving locality without excessive scheduling
                 overhead. We built two J-Machine implementations of a
                 fine-grained programming model, the Berkeley Threaded
                 Abstract Machine. One implementation takes an active
                 messages approach, maintaining a scheduling hierarchy
                 in software in order to improve data cache performance.
                 Another approach relies on the J-Machine's message
                 queues and fast task switch, lowering the control costs
                 at the expense of data locality. Our analysis measures
                 the costs and benefits of each approach, for a variety
                 of programs and cache configurations. The active
                 messages implementation is strongest when miss
                 penalties are high and for the finest-grained programs.
                 The hardware-buffered implementation is strongest in
                 direct-mapped caches, where it achieves substantially
                 better instruction cache performance.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
  classification = "C6110P (Parallel programming); C6120 (File
                 organisation); C6150C (Compilers, interpreters and
                 other processors); C6150N (Distributed systems
                 software)",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "Active messages; Benefits; Berkeley Threaded Abstract
                 Machine; Cache configuration; Costs; Data cache
                 performance; Data locality; Direct-mapped caches; Fast
                 task switch; Fine-grained computing; Fine-grained
                 programming model; Hardware-buffered; Instruction cache
                 performance; J-Machine; Locality benefits; Message
                 queues; Miss penalties; Scheduling hierarchy;
                 Scheduling overhead",
  thesaurus =    "Cache storage; Cost-benefit analysis; Parallel
                 programming; Program compilers; Scheduling; Software
                 performance evaluation",
}

@Article{Srinivasan:1995:MMX,
  author =       "Murali V. Srinivasan",
  title =        "A Methodology for Multithreaded {X} Client
                 Development",
  journal =      j-X-RESOURCE,
  volume =       "13",
  number =       "1",
  pages =        "181--181",
  month =        jan,
  year =         "1995",
  CODEN =        "XRESEA",
  ISBN =         "1-56592-121-6",
  ISBN-13 =      "978-1-56592-121-4",
  ISSN =         "1058-5591",
  bibdate =      "Fri Mar 31 06:55:49 1995",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The X Resource",
}

@Article{Steensgaard:1995:ONC,
  author =       "B. Steensgaard and E. Jul",
  title =        "Object and native code thread mobility among
                 heterogeneous computers (includes sources)",
  journal =      j-OPER-SYS-REV,
  volume =       "29",
  number =       "5",
  pages =        "68--77",
  month =        dec,
  year =         "1995",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:55 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Stuckey:1995:FCI,
  author =       "Richard Stuckey",
  title =        "A fully conformant implementation of {ECMA-162}",
  journal =      j-ADA-USER,
  volume =       "16",
  number =       "2",
  pages =        "83--94",
  month =        jun,
  year =         "1995",
  CODEN =        "AUJOET",
  ISSN =         "0268-652X",
  bibdate =      "Mon Sep 8 18:43:50 MDT 1997",
  bibsource =    "Compendex database;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "ICL has developed a portable implementation of the Ada
                 interfaces to PCTE as specified by ECMA-162. The
                 interfaces map the functionality required onto that
                 provided by the C interfaces to PCTE as specified by
                 ECMA-158. The process of implementing the interfaces
                 revealed a number of errors in the ECMA PCTE standards,
                 such as errors in ECMA-162 concerning the mapping of
                 ECMA-149 onto Ada, errors in ECMA-158 such as missing
                 operations or functions with incorrect parameter modes,
                 discrepancies between the Ada and C bindings and errors
                 in ECMA-149. The architecture of the interfaces and
                 their test harness has been designed to allow easy
                 porting from one PCTE implementation to another, and
                 also from one Ada compilation system to another; some
                 major constraints were imposed by the use of the C
                 interfaces as the underlying platform, particularly
                 regarding Ada's multi-threading abilities. The
                 advantages of using the interfaces include the benefits
                 of being able to implement tools in Ada instead of C;
                 insulation from the underlying PCTE implementation; and
                 the provision of facilities (e.g. call tracing) between
                 tools and PCTE.",
  acknowledgement = ack-nhfb,
  affiliation =  "ICL Enterprises",
  affiliationaddress = "Reading, Engl",
  classification = "722.2; 723.1; 723.1.1; 723.5; 902.2; C6115
                 (Programming support); C6140D (High level languages)",
  corpsource =   "ICL Enterprises, Reading, UK",
  fjournal =     "Ada User",
  journalabr =   "Ada User J",
  keywords =     "Ada; Ada (programming language); Ada compilation
                 system; Ada interfaces; application program interfaces;
                 bindings; C (programming language); C interfaces; call
                 tracing; Codes (symbols); Computer aided software
                 engineering; ECMA PCTE standards; ECMA-149; ECMA-158;
                 ECMA-162; Errors; errors; fully conformant
                 implementation; incorrect parameter modes; missing
                 operations; multi-threading abilities; Portable Common
                 Tools Environment; portable implementation; programming
                 environments; software portability; software standards;
                 software tools; Standards; test harness; User
                 interfaces",
  pubcountry =   "Netherlands",
  treatment =    "P Practical",
}

@Book{SunSoft:1995:SMP,
  author =       "{SunSoft}",
  title =        "{Solaris} multithreaded programming guide",
  publisher =    pub-SUNSOFT,
  address =      pub-SUNSOFT:adr,
  pages =        "xviii + 158",
  year =         "1995",
  ISBN =         "0-13-160896-7",
  ISBN-13 =      "978-0-13-160896-2",
  LCCN =         "QA76.76.O63 S635 1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Multiprocessors; Operating systems (Computers);
                 Solaris (Computer file); UNIX (Computer file)",
}

@Article{Tamasanis:1995:MMW,
  author =       "Doug Tamasanis",
  title =        "{Mathematica} meets {Warp}",
  journal =      j-BYTE,
  volume =       "20",
  number =       "5",
  month =        may,
  year =         "1995",
  CODEN =        "BYTEDJ",
  ISSN =         "0360-5280 (print), 1082-7838 (electronic)",
  ISSN-L =       "0360-5280",
  bibdate =      "Fri May 24 09:57:14 MDT 1996",
  bibsource =    "Compendex database;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Wolfram Research has ported Mathematica, the software
                 tool for quantitative analysis, from its Macintosh
                 origins to a wide range of platforms, including PCs,
                 Unix workstations, and several larger systems. The
                 latest port of Mathematica 2.2 is to OS/2 Warp. Now
                 OS/2 users do not have to rely on the Windows version
                 of the Mathematica kernel, which only simulates
                 multithreading. The new release takes full advantage of
                 the OS/2 preemptive scheduler, threading, and 32-bit
                 flat memory structure to both improve performance and
                 to greatly increase the size of the problems
                 Mathematica can handle. The OS/2 version is found
                 faster and more stable than the Windows version.",
  acknowledgement = ack-nhfb,
  affiliation =  "BYTE",
  classification = "722.2; 723.1; 723.1.1; 723.2; 723.5",
  fjournal =     "BYTE Magazine",
  journalabr =   "Byte",
  keywords =     "C (programming language); Command line interface;
                 Computer aided software engineering; Computer
                 architecture; Computer operating systems; Computer
                 simulation; Computer software; File editors; FORTRAN
                 (programming language); Graphical user interfaces;
                 Network protocols; Performance; Software Package
                 Mathematica; Word processing",
  pagecount =    "3",
}

@Article{Taylor:1995:CSA,
  author =       "Richard N. Taylor and Kari A. Nies and Gregory Alan
                 Bolcer and Craig A. MacFarlane and Kenneth M. Anderson
                 and Gregory F. Johnson",
  title =        "Chiron-1: a software architecture for user interface
                 development, maintenance, and run-time support",
  journal =      j-TOCHI,
  volume =       "2",
  number =       "2",
  pages =        "105--144",
  month =        jun,
  year =         "1995",
  CODEN =        "ATCIF4",
  ISSN =         "1073-0516 (print), 1557-7325 (electronic)",
  ISSN-L =       "1073-0516",
  bibdate =      "Tue Jan 19 05:49:17 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tochi/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tochi/1995-2-2/p105-taylor/",
  abstract =     "The Chiron-1 user interface system demonstrates key
                 techniques that enable a strict separation of an
                 application from its user interface. These techniques
                 include separating the control-flow aspects of the
                 application and user interface: they are concurrent and
                 may contain many threads. Chiron also separates
                 windowing and look-and-feel issues from dialogue and
                 abstract presentation decisions via mechanisms
                 employing a client-server architecture. To separate
                 application code from user interface code, user
                 interface agents called {\em artists\/} are attached to
                 instances of application abstract data types (ADTs).
                 Operations on ADTs within the application implicitly
                 trigger user interface activities within the artists.
                 Multiple artists can be attached to ADTs, providing
                 multiple views and alternative forms of access and
                 manipulation by either a single user or by multiple
                 users. Each artist and the application run in separate
                 threads of control. Artists maintain the user interface
                 by making remote calls to an abstract depiction
                  hierarchy in the Chiron server, insulating the user
                 interface code from the specifics of particular
                 windowing systems and toolkits. The Chiron server and
                 clients execute in separate processes. The
                 client-server architecture also supports multilingual
                 systems: mechanisms are demonstrated that support
                 clients written in programming languages other than
                 that of the server while nevertheless supporting
                 object-oriented server concepts. The system has been
                 used in several universities and research and
                 development projects. It is available by anonymous
                 ftp.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer-Human Interaction",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J756",
  keywords =     "design; languages",
  subject =      "{\bf H.5.2} Information Systems, INFORMATION
                 INTERFACES AND PRESENTATION, User Interfaces, User
                 interface management systems (UIMS). {\bf D.2.2}
                 Software, SOFTWARE ENGINEERING, Design Tools and
                 Techniques, User interfaces. {\bf D.2.m} Software,
                 SOFTWARE ENGINEERING, Miscellaneous, Reusable
                 software**.",
}

@PhdThesis{Thekkath:1995:DPM,
  author =       "Radhika Thekkath",
  title =        "Design and performance of multithreaded
                 architectures",
  type =         "Thesis ({Ph.D.})",
  school =       "University of Washington",
  address =      "Seattle, WA, USA",
  pages =        "x + 100",
  year =         "1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Computer architecture; Multiprocessors",
}

@MastersThesis{Todiwala:1995:DRT,
  author =       "Khushroo Rustom Todiwala",
  title =        "A distributed ray tracing implementation using
                 multithreaded {RPC}",
  type =         "Thesis ({M.S.})",
  number =       "4691",
  school =       "University of Texas at El Paso",
  address =      "El Paso, TX, USA",
  pages =        "xi + 140",
  year =         "1995",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "Master's thesis / University of Texas at El Paso",
  acknowledgement = ack-nhfb,
  keywords =     "Electronic data processing -- Distributed processing",
}

@TechReport{Toulouse:1995:CID,
  author =       "Michel Toulouse and Teodor Gabriel Crainic and Michel
                 Gendreau",
  title =        "Communication issues in designing cooperative
                 multi-thread parallel searches",
  type =         "Report",
  number =       "CRT-95-47",
  institution =  "Centre de recherche sur les transports, Universit{\'e}
                 de Montr{\'e}al",
  address =      "Montr{\'e}al, Qu{\'e}bec, Canada",
  year =         "1995",
  bibdate =      "Sat Apr 20 11:20:32 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Tullsen:1995:SMM,
  author =       "Dean M. Tullsen and Susan J. Eggers and Henry M.
                 Levy",
  title =        "Simultaneous multithreading: maximizing on-chip
                 parallelism",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "392--403",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "According to Hennessy and Patterson, Computer
                 Architecture, 6th edition, online appendix M
                 ``Historical Perspectives and References'', page M-36,
                 this paper's authors ``provided the first realistic
                 simulation assessment and coined the term {\em
                  simultaneous multithreading\/}.''",
}

@Article{vanHoff:1995:JIP,
  author =       "Arthur {van Hoff}",
  title =        "{Java} and {Internet} Programming",
  journal =      j-DDJ,
  volume =       "20",
  number =       "8",
  pages =        "56, 58, 60--61, 101--102",
  month =        aug,
  year =         "1995",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Jan 9 09:35:43 MST 1997",
  bibsource =    "Compendex database;
                 http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  URL =          "http://www.ddj.com/ddj/issues/j508a.htm",
  abstract =     "Java, a language designed for Internet development, is
                 an object-oriented, multithreaded, portable, dynamic
                 language that's similar to C, yet simpler than C++.",
  abstract2 =    "In 1990, a new language called `Java' was developed
                 which, it turns out, addresses many of the issues of
                 software distribution on the Internet. Java is a
                 simple, object-oriented, multi-threaded,
                 garbage-collected, secure, robust,
                 architecture-neutral, portable, high-performance,
                 dynamic language. The language is similar to C and C++
                 but much simpler. Java programs are compiled into a
                 binary format that can be executed on many platforms
                 without recompilation. The language contains mechanisms
                 to verify and execute binary Java programs in a
                  controlled environment, protecting the computer from
                 potential viruses and security violations.",
  acknowledgement = ack-nhfb,
  affiliation =  "Sun Microsystems",
  classification = "721.1; 722.2; 722.3; 723.1; 723.1.1; C6110J
                 (Object-oriented programming); C6140D (High level
                 languages); C6150N (Distributed systems software)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  journalabr =   "Dr Dobb's J Software Tools Prof Program",
  keywords =     "Architecture-neutral language; Binary format; Browser;
                 Bytecodes; Bytecodes, Java language; C (programming
                 language); Codes (symbols); Compilation; Computational
                 linguistics; Computer networks; Computer programming
                 languages; Computer software portability;
                 Garbage-collection; High-performance dynamic language;
                 Interactive programs; Interfaces (computer); Internet;
                 Internet programming; Java (programming language);
                 Multithreaded language; Multithreading; Object oriented
                 programming; Object-oriented language; Portable
                 language; Program compilers; Program interpreters;
                 Robust language; Secure language; Security of data;
                 Semantics; Software distribution; Software engineering;
                 Syntax; UNIX",
  pagecount =    "4",
  thesaurus =    "Complete computer programs; Internet; Object-oriented
                 languages; Object-oriented programming; Security of
                 data; Software portability",
}

@Article{Wallach:1995:OAM,
  author =       "Deborah A. Wallach and Wilson C. Hsieh and Kirk L.
                 Johnson and M. Frans Kaashoek and William E. Weihl",
  title =        "Optimistic active messages: a mechanism for scheduling
                 communication with computation",
  journal =      j-SIGPLAN,
  volume =       "30",
  number =       "8",
  pages =        "217--226",
  month =        aug,
  year =         "1995",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:17:08 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Low-overhead message passing is critical to the
                 performance of many applications. Active messages (AMs)
                 reduce the software overhead for message handling:
                 messages are run as handlers instead of as threads,
                 which avoids the overhead of thread management and the
                 unnecessary data copying of other communication models.
                 Scheduling the execution of AMs is typically done by
                 disabling and enabling interrupts or by polling the
                 network. This primitive scheduling control puts severe
                 restrictions on the code that can be run in a message
                 handler. This paper describes a new software mechanism,
                 optimistic active messages (OAM), that eliminates these
                 restrictions; OAMs allow arbitrary user code to execute
                 in handlers, and also allow handlers to block. Despite
                 this gain in expressiveness, OAMs perform as well as
                 AMs. We used OAM as the base for a remote procedure
                 calling (RPC) system, Optimistic RPC (ORPC), for the
                 CM-5 multiprocessor; it consists of an optimized thread
                 package and a stub compiler that hides communication
                 details from the programmer. ORPC is 1.5 to 5 times
                 faster than traditional RPC (TRPC) for small messages
                 and performs as well as AMs. Applications that
                 primarily communicate using large data transfers or are
                 fairly coarse-grained perform equally well. For
                 applications that send many short messages, however,
                 the ORPC and AM implementations are up to 3 times
                 faster than the TRPC implementations. Using ORPC,
                 programmers obtain the benefits of well-proven
                 programming abstractions, do not have to be concerned
                 with communication details, and yet obtain nearly the
                 performance of hand-coded AM programs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
  classification = "C6150N (Distributed systems software)",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "Application performance; Arbitrary user code;
                 Blocking; CM-5 multiprocessor; Coarse-grained
                 applications; Communication detail hiding;
                 Communication scheduling; Computation scheduling;
                 Expressiveness; Large data transfers; Low-overhead
                 message passing; Message handlers; Optimistic active
                 messages; Optimistic remote procedure calls; Optimized
                 thread package; Programming abstractions; Software
                 overhead; Stub compiler",
  thesaurus =    "Message passing; Remote procedure calls; Scheduling",
}

@Article{Walter:1995:PMS,
  author =       "Stephen Walter",
  title =        "Put Multiprocessing Systems to Work. {II}",
  journal =      j-UNIX-REVIEW,
  volume =       "13",
  number =       "1",
  pages =        "39--??",
  month =        jan,
  year =         "1995",
  CODEN =        "UNRED5",
  ISSN =         "0742-3136",
  bibdate =      "Sat May 25 07:59:58 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover library database",
  abstract =     "Programming for multiprocessors requires use of
                 unusual features such as spin locks, mutex locks,
                 barrier synchronization, and the like. Using the POSIX
                 threads API helps, but the rest you have to do
                 yourself.",
  acknowledgement = ack-nhfb,
  fjournal =     "UNIX review",
}

@Article{Wayner:1995:FAN,
  author =       "Peter Wayner",
  title =        "Free Agents: a new generation of light-weight,
                 multithreaded operating environments provide security
                 and interoperability for agent developers",
  journal =      j-BYTE,
  volume =       "20",
  number =       "3",
  pages =        "105--??",
  month =        mar,
  year =         "1995",
  CODEN =        "BYTEDJ",
  ISSN =         "0360-5280 (print), 1082-7838 (electronic)",
  ISSN-L =       "0360-5280",
  bibdate =      "Tue Jan 2 10:01:41 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "BYTE Magazine",
}

@Article{Yam:1995:CFD,
  author =       "Michael Yam",
  title =        "A {C++} Framework for {DCE} Threads",
  journal =      j-DDJ,
  volume =       "20",
  type =         "SB",
  number =       "??",
  pages =        "27--??",
  month =        jul # "\slash " # aug,
  year =         "1995",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Mon Sep 2 09:09:39 MDT 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@InProceedings{Yasrebi:1995:EDO,
  author =       "M. Yasrebi",
  title =        "Experience with Distributed Objects in a Portable and
                 Multithreaded Library for a {LAN\slash WAN} Gateway
                 Application",
  crossref =     "IEEE:1995:PCL",
  volume =       "20",
  pages =        "164--173",
  year =         "1995",
  bibdate =      "Mon Sep 27 14:16:06 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  annote =       "Also known as LCN'95. IEEE Cat no 95TB100005",
  keywords =     "computer communications; IEEE; LCN; local computer
                 networks",
}

@Article{Aitken:1996:MCJ,
  author =       "Gary Aitken",
  title =        "Moving from {C++} to {Java}",
  journal =      j-DDJ,
  volume =       "21",
  number =       "3",
  pages =        "52, 54--56",
  month =        mar,
  year =         "1996",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Jan 9 09:35:43 MST 1997",
  bibsource =    "Compendex database;
                 http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 UnCover database",
  abstract =     "Java is claimed to be much easier to learn than C++,
                  but the difficulties most people have in learning to
                  program in both C++ and Java have little to do with
                  the language itself. This paper explores some of the
                  differences between Java and C++. The aim is to make
                  the user aware of potential problems and opportunities
                  when moving from C++ to Java. Brief explanations are
                  provided for those concepts that until now were
                  unfamiliar to many users.",
  acknowledgement = ack-nhfb,
  affiliation =  "Integrated Computer Solutions",
  classification = "721.1; 722.2; 723.1; 723.1.1; 723.2",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  journalabr =   "Dr Dobb's J Software Tools Prof Program",
  keywords =     "C (programming language); Character arrays; Character
                 sets; Data structures; File organization; Garbage
                 collected language; Header files; Interfaces
                 (COMPUTER); Java; Machine code; Member function;
                 Multithreading; Object oriented programming; Pointers;
                 Program compilers; Program interpreters; Program
                 processors; Program translators; Programming theory;
                 Software engineering; Synchronization; Virtual
                 machine",
  pagecount =    "4",
}

@InProceedings{Amrhein:1996:CSM,
  author =       "Beatrice Amrhein and Oliver Gloor and Wolfgang
                 K{\"u}chlin",
  title =        "A Case Study of Multi-Threaded {Gr{\"o}bner} Basis
                 Completion",
  crossref =     "LakshmanYN:1996:IPI",
  pages =        "95--102",
  year =         "1996",
  bibdate =      "Thu Mar 12 08:43:16 MST 1998",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/issac.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/issac/236869/p95-amrhein/",
  acknowledgement = ack-nhfb,
  keywords =     "algebraic computation; algorithms; experimentation;
                 ISSAC; performance; SIGNUM; SIGSAM; symbolic
                 computation",
  subject =      "{\bf I.1.3} Computing Methodologies, SYMBOLIC AND
                 ALGEBRAIC MANIPULATION, Languages and Systems,
                 Special-purpose algebraic systems. {\bf D.1.3}
                 Software, PROGRAMMING TECHNIQUES, Concurrent
                 Programming, Parallel programming. {\bf C.1.2} Computer
                 Systems Organization, PROCESSOR ARCHITECTURES, Multiple
                 Data Stream Architectures (Multiprocessors), Parallel
                 processors**.",
}

@MastersThesis{Annavaram:1996:BVN,
  author =       "Murali Annavaram",
  title =        "Blocking versus non-blocking: issues and tradeoffs in
                 multithreaded code execution",
  type =         "Thesis ({M.S.})",
  school =       inst-CSU,
  address =      inst-CSU:adr,
  pages =        "viii + 57",
  year =         "1996",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Multiprocessors -- Design and construction; Parallel
                 processing (Electronic computers)",
}

@Article{Anonymous:1996:WWD,
  author =       "Anonymous",
  title =        "World-wide distributed system using {Java} and the
                 {Internet}",
  journal =      j-IEEE-INT-SYMP-HIGH-PERF-DIST-COMP-PROC,
  pages =        "11--18",
  year =         "1996",
  CODEN =        "PIDCFB",
  ISSN =         "1082-8907",
  bibdate =      "Thu Dec 12 06:31:53 MST 1996",
  bibsource =    "Compendex database;
                 https://www.math.utah.edu/pub/tex/bib/java.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "IEEE catalog number 96TB100069.",
  acknowledgement = ack-nhfb,
  affiliation =  "California Inst of Technology",
  affiliationaddress = "CA, USA",
  classification = "716.1; 722.4; 723; 723.1; 723.1.1",
  conference =   "Proceedings of the 1996 5th IEEE International
                 Symposium on High Performance Distributed Computing",
  fjournal =     "IEEE International Symposium on High Performance
                 Distributed Computing, Proceedings",
  keywords =     "Collaborative environments; Computer networks;
                 Computer programming languages; Computer software; Data
                 communication systems; Distributed computer systems;
                 Internet; Java; Multithreaded objects; Object oriented
                 programming; Program composition; World wide web",
  meetingaddress = "Syracuse, NY, USA",
  meetingdate =  "Aug 6--9 1996",
  meetingdate2 = "08/06--09/96",
  sponsor =      "IEEE",
}

@Article{Arnold:1996:MPJ,
  author =       "K. Arnold and J. Gosling",
  title =        "Multithreaded programming in {Java}",
  journal =      j-WEB-TECHNIQUES,
  volume =       "1",
  number =       "7",
  pages =        "34--40, 42--43",
  month =        oct,
  year =         "1996",
  CODEN =        "WETEFA",
  ISSN =         "1086-556X",
  bibdate =      "Sat Mar 15 08:49:09 MST 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes =   "C6150N (Distributed systems software); C6110J
                 (Object-oriented programming); C6140D (High level
                 languages); C6150J (Operating systems)",
  fjournal =     "Web Techniques",
  keywords =     "display; display code; dynamic behaviour; handshaking;
                 interactive program; interrupts; Java; Java object
                 oriented language; multiple; multiprogramming;
                 multithreaded programming; multithreaded system;
                 object-oriented languages; object-oriented programming;
                 operations; parallel programming; polling; problems;
                 real world software; synchronisation; threads; updates;
                 user input",
  treatment =    "P Practical",
}

@Article{Bellosa:1996:PIL,
  author =       "Frank Bellosa and Martin Steckermeier",
  title =        "The Performance Implications of Locality Information
                 Usage in Shared-Memory Multiprocessors",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "37",
  number =       "1",
  pages =        "113--121",
  day =          "25",
  month =        aug,
  year =         "1996",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1996.0112",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:00 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0112/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0112/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessing systems); C5470 (Performance
                 evaluation and testing)",
  corpsource =   "Dept. of Comput. Sci. IV, Erlangen-Nurnberg Univ.,
                 Germany",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "cache miss counters; cache storage; evaluation;
                 locality information; memory multiprocessors; parallel
                 architectures; performance; scalable shared-;
                 scheduling decisions; shared memory systems;
                 shared-memory multiprocessors; thread scheduling
                 algorithms",
  treatment =    "P Practical",
}

@InProceedings{Benson:1996:DMS,
  author =       "G. D. Benson and R. A. Olsson",
  title =        "The design of microkernel support for the {SR}
                 concurrent programming language",
  crossref =     "Szymanski:1996:LCR",
  pages =        "227--240",
  year =         "1996",
  bibdate =      "Sat Sep 28 18:12:58 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/mach.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., California Univ., Davis, CA,
                 USA",
  classification = "C6110P (Parallel programming); C6140D (High level
                 languages); C6150J (Operating systems); C6150N
                 (Distributed systems software)",
  keywords =     "Distributed environment; Distributed operating system;
                 Distributed programming; Distributed programming
                 language; Mach microkernel; Message passing;
                 Microkernel; Microkernel support; Minimal kernel;
                 Multithreaded program; Networked operating system;
                 Parallel programming; SR concurrent programming
                 language",
  thesaurus =    "Distributed processing; Message passing;
                 Multiprocessing programs; Network operating systems;
                 Operating system kernels; Parallel languages",
}

@Article{Berg:1996:HDT,
  author =       "C. Berg",
  title =        "How do threads work and how can {I} create a
                 general-purpose event?",
  journal =      j-DDJ,
  volume =       "21",
  number =       "11",
  pages =        "111--115, 126--127",
  month =        nov,
  year =         "1996",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Sat Mar 15 08:49:09 MST 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes =   "C6110J (Object-oriented programming); C6140D (High
                 level languages); C6150J (Operating systems); C6150N
                 (Distributed systems software)",
  corpsource =   "Digital Focus, USA",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "(computers); application; application program
                 interfaces; applications; event; exception handling;
                 general-purpose event; Internet; Java; Java thread
                 mechanism; languages; lightweight processes;
                 multiprocessor architecture; multithreading; object;
                 object-oriented; object-oriented programming; operating
                 systems; oriented language; programming interface;
                 scheduling; synchronisation; synchronization; thread
                 programming; threads; web",
  treatment =    "P Practical",
}

@Article{Berg:1996:JQH,
  author =       "Cliff Berg",
  title =        "{Java Q and A}: How do Threads Work and How Can {I}
                 Create a General-Purpose Event?",
  journal =      j-DDJ,
  volume =       "21",
  number =       "11",
  pages =        "111--??",
  day =          "1",
  month =        nov,
  year =         "1996",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Oct 15 08:20:29 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@InProceedings{Bhandarkar:1996:MPM,
  author =       "M. A. Bhandarkar and L. V. Kale",
  title =        "{MICE}: a prototype {MPI} implementation in {Converse}
                 environment",
  crossref =     "IEEE:1996:PSM",
  pages =        "26--31",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6115 (Programming
                 support); C6150E (General utility programs); C6150N
                 (Distributed systems software)",
  conftitle =    "Proceedings. Second MPI Developer's Conference",
  corpsource =   "Dept. of Comput. Sci., Illinois Univ., Urbana, IL,
                 USA",
  keywords =     "Abstract Device Interface; application program
                 interfaces; communication; computations; Converse
                 interoperable parallel programming environment; message
                 managers; message passing; MICE; MPI modules; MPICH;
                 multi-threaded MPI programs; open systems; parallel
                 programming; programming environments; prototype MPI
                 implementation; public-domain MPI implementation; PVM
                 interoperation; thread objects; utility programs",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  treatment =    "P Practical",
}

@Article{Bianchini:1996:EPM,
  author =       "Ricardo Bianchini and Beng-Hong Lim",
  title =        "Evaluating the Performance of Multithreading and
                 Prefetching in Multiprocessors",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "37",
  number =       "1",
  pages =        "83--97",
  day =          "25",
  month =        aug,
  year =         "1996",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1996.0109",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:00 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0109/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0109/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5440
                 (Multiprocessing systems); C5470 (Performance
                 evaluation and testing); C6110P (Parallel programming);
                 C6150N (Distributed systems software)",
  corpsource =   "COPPE Syst. Eng., Federal Univ. of Rio de Janeiro,
                 Brazil",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "cache; memory latency; MIT Alewife multiprocessor;
                 multiprocessing systems; multiprocessors;
                 multithreading; parallel; parallel architectures;
                 performance evaluation; programming; software
                 prefetching; storage management",
  treatment =    "P Practical",
}

%%% NOTE(review): idealibrary.com URLs are likely stale (IDEAL service
%%% discontinued); the DOI field remains resolvable.
@Article{Blumofe:1996:CEM,
  author =       "Robert D. Blumofe and Christopher F. Joerg and Bradley
                  C. Kuszmaul and Charles E. Leiserson and Keith H.
                  Randall and Yuli Zhou",
  title =        "{Cilk}: An Efficient Multithreaded Runtime System",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "37",
  number =       "1",
  pages =        "55--69",
  day =          "25",
  month =        aug,
  year =         "1996",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1996.0107",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:00 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0107/production;
                  http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0107/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C4240P (Parallel programming and algorithm theory);
                  C6110P (Parallel programming)",
  corpsource =   "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "Cilk; critical path analysis; critical-path length;
                  directed acyclic graph; load balancing; multithreaded
                  runtime system; parallel; parallel algorithms; parallel
                  programming; processor scheduling; programming; runtime
                  scheduling; synchronisation",
  treatment =    "P Practical; T Theoretical or Mathematical",
}

%%% NOTE(review): the page range is recorded as uncertain
%%% ("475--506 (or 475--505??)"); TODO confirm the final page against
%%% J. Symbolic Computation 21(4--6) before resolving the placeholder.
@Article{Bundgen:1996:SCM,
  author =       "Reinhard B{\"u}ndgen and Manfred G{\"o}bel and
                  Wolfgang K{\"u}chlin",
  title =        "Strategy Compliant Multi-Threaded Term Completion",
  journal =      j-J-SYMBOLIC-COMP,
  volume =       "21",
  number =       "4/5/6",
  pages =        "475--506 (or 475--505??)",
  month =        apr # ", " # may # " \& " # jun,
  year =         "1996",
  CODEN =        "JSYCEH",
  ISSN =         "0747-7171 (print), 1095-855X (electronic)",
  ISSN-L =       "0747-7171",
  MRclass =      "68Q42 (68Q22 68Q40)",
  MRnumber =     "1 420 910",
  bibdate =      "Sat May 10 15:54:09 MDT 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Parallel symbolic computation.",
  acknowledgement = ack-nhfb,
  classcodes =   "C7310 (Mathematics computing); C5440 (Multiprocessing
                  systems); C4210L (Formal languages and computational
                  linguistics); C6130 (Data handling techniques)",
  corpsource =   "Wilhelm-Schickard-Inst. fur Inf., Tubingen Univ.,
                  Germany",
  fjournal =     "Journal of Symbolic Computation",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07477171",
  keywords =     "completion module AC; Knuth--Bendix completion;
                  parallel; parallel architectures; rewriting systems;
                  shared memory; strategy compliant multi-threaded term
                  completion; symbol manipulation; systems;
                  term-rewriting system PaReDuX; unfailing completion",
  treatment =    "A Application; P Practical",
}

%%% NOTE(review): the elsevier.com cgi-bin browse URLs are session-style
%%% links and likely stale; no DOI is recorded --- consider adding one
%%% if it can be verified for Appl. Num. Math. 20(4):349--365.
@Article{Chrisochoides:1996:MMD,
  author =       "Nikos Chrisochoides",
  title =        "Multithreaded model for the dynamic load-balancing of
                  parallel adaptive {PDE} computations",
  journal =      j-APPL-NUM-MATH,
  volume =       "20",
  number =       "4",
  pages =        "349--365",
  day =          "3",
  month =        jun,
  year =         "1996",
  CODEN =        "ANMAEL",
  ISSN =         "0168-9274 (print), 1873-5460 (electronic)",
  ISSN-L =       "0168-9274",
  bibdate =      "Wed Jul 28 14:36:24 MDT 1999",
  bibsource =    "Compendex database;
                  http://www.elsevier.com/cgi-bin/cas/tree/store/apnum/cas_free/browse/browse.cgi?year=1996&volume=20&issue=4;
                  https://www.math.utah.edu/pub/tex/bib/applnummath.bib;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.elsevier.com/cgi-bin/cas/tree/store/apnum/cas_sub/browse/browse.cgi?year=1996&volume=20&issue=4&aid=652",
  acknowledgement = ack-nhfb,
  affiliation =  "Cornell Univ",
  affiliationaddress = "Ithaca, NY, USA",
  classification = "722.4; 723.1; 723.5; 731.1; 921.2; 921.6",
  fjournal =     "Applied Numerical Mathematics: Transactions of IMACS",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01689274",
  journalabr =   "Appl Numer Math",
  keywords =     "Calculations; Codes (symbols); Computational
                  complexity; Computer software; Dynamic load balancing;
                  Load balancing algorithms; Mathematical models;
                  Multicomputers; Multithreaded model; Numerical methods;
                  Parallel processing systems; Partial differential
                  equations; Processor workloads; Program complexity;
                  Program processors; Synchronization",
}

%%% NOTE(review): "??--??" pages and "????" CODEN are this file's
%%% placeholders for unknown data; JavaWorld was an online-only
%%% magazine, so page numbers may simply not exist for this article.
@Article{Drake:1996:IJT,
  author =       "Donald G. Drake",
  title =        "Introduction to {Java} threads",
  journal =      j-JAVAWORLD,
  volume =       "1",
  number =       "2",
  pages =        "??--??",
  month =        apr,
  year =         "1996",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 08:48:26 MDT 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-04-1996/jw-04-threads.htm",
  acknowledgement = ack-nhfb,
}

%%% NOTE(review): Computer Architecture News 24(2), May 1996 is
%%% presumably the ISCA '96 proceedings issue --- verify before adding
%%% any conference annotation to this record.
@Article{Eickemeyer:1996:EMU,
  author =       "Richard J. Eickemeyer and Ross E. Johnson and Steven
                  R. Kunkel and Mark S. Squillante and Shiafun Liu",
  title =        "Evaluation of multithreaded uniprocessors for
                  commercial application environments",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "203--212",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

%%% NOTE(review): the case-duplicated keywords (e.g. "Data structures" /
%%% "data structures", "Multi threading") appear to come from merging
%%% Compendex and INSPEC records (see bibsource); left as recorded.
@Article{Engelhardt:1996:PIP,
  author =       "Dean Engelhardt and Andrew Wendelborn",
  title =        "A Partitioning-Independent Paradigm for Nested Data
                  Parallelism",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "24",
  number =       "4",
  pages =        "291--317",
  month =        aug,
  year =         "1996",
  CODEN =        "IJPPE5",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Sat Apr 26 11:36:49 MDT 1997",
  bibsource =    "Compendex database;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Univ of Adelaide",
  affiliationaddress = "Aust",
  classification = "721.1; 722.4; 723.1.1; 723.2; 723.5; C6110P
                  (Parallel programming); C6120 (File organisation);
                  C6150C (Compilers, interpreters and other processors);
                  C6150N (Distributed systems software)",
  corpsource =   "Dept. of Comput. Sci., Adelaide Univ., SA, Australia",
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  journalabr =   "Int J Parallel Program",
  keywords =     "abstract machine; Computational methods; Computer
                  simulation; costs; data parallel model; data
                  partitioning; Data structures; data structures; High
                  level languages; irregular data structures; Multi
                  threading; multinode execution model; Multiprocessing
                  systems; multiprocessing systems; multiprocessor
                  machines; nested data parallelism; Nested data
                  parallelism; nested data structures; nodal
                  multi-threading; one-dimensional data parallel
                  operator; parallel computation; Parallel execution
                  models; Parallel processing systems; parallel
                  programming; partitioning-independent paradigm;
                  Performance; performance statistics; program compilers;
                  software performance evaluation; Thinking machines;
                  Thinking Machines CM-5",
  treatment =    "P Practical",
}

%%% NOTE(review): end page unknown ("46--??") and no DOI/URL recorded;
%%% placeholders follow this file's convention for missing data.
@Article{Esposito:1996:MVB,
  author =       "Dino Esposito",
  title =        "Multithreading and {Visual Basic}",
  journal =      j-DDJ,
  volume =       "21",
  number =       "12",
  pages =        "46--??",
  month =        dec,
  year =         "1996",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Sat Mar 07 08:22:15 1998",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Although Visual Basic does not support native
                  multithreading, it does support the Windows API. This
                  means you can write VB applications composed of two or
                  more threads. Dino shows you how to create
                  multithreaded applications using both the SDK and
                  Visual Basic",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

%%% NOTE(review): reclassified from @MastersThesis to @PhdThesis: the
%%% entry's own "type" field reads "Thesis ({doctoral})", which
%%% contradicted the @MastersThesis entry type.  Author surname set to
%%% F{\"a}rber (ETH TIK-Schriftenreihe author) --- TODO confirm the
%%% umlaut against the title page.  Citation key kept for stability.
@PhdThesis{Farber:1996:EAM,
  author =       "Philipp F{\"a}rber",
  title =        "Execution architecture of the multithreaded {ADAM}
                  prototype",
  type =         "Thesis ({doctoral})",
  number =       "13",
  school =       "Swiss Federal Institute of Technology",
  address =      "Zurich, Switzerland",
  pages =        "iv + 127",
  year =         "1996",
  ISBN =         "3-7281-2384-6",
  ISBN-13 =      "978-3-7281-2384-8",
  LCCN =         "????",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "TIK-Schriftenreihe",
  acknowledgement = ack-nhfb,
  keywords =     "Computer architecture; Parallel processing (Electronic
                  computers); Parallel programming (Computer science)",
}

%%% NOTE(review): this entry relies on crossref ACM:1996:FCP for the
%%% proceedings data (booktitle, publisher, ...); per classic BibTeX
%%% crossref rules the parent entry must appear later in the file.
@InProceedings{Farcy:1996:ISP,
  author =       "A. Farcy and O. Temam",
  title =        "Improving Single-Process Performance with
                  Multithreaded Processors",
  crossref =     "ACM:1996:FCP",
  pages =        "350--357",
  year =         "1996",
  bibdate =      "Wed Mar 18 12:33:18 MST 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  annote =       "Also known as ICS'96. Held as part of the Federated
                  computing research conference (FCRC'96)",
  keywords =     "ACM; architecture; computer; FCRC; ICS; SIGARCH;
                  supercomputing",
}

%%% NOTE(review): author surname corrected from "Fatouron" to
%%% "Fatourou" (Panagiota Fatourou); the citation key retains the
%%% historical misspelling for stability of existing citations.
%%% End page 416 inferred from Feuerstein:1996:MTP, which starts at
%%% p. 417 in the same LNCS volume 1178 --- verify against the
%%% proceedings table of contents.
@Article{Fatouron:1996:SAS,
  author =       "P. Fatourou and P. Spirakis",
  title =        "Scheduling Algorithms for Strict Multithreaded
                  Computations",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1178",
  pages =        "407--416",
  year =         "1996",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Aug 22 11:59:49 MDT 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

%%% NOTE(review): end page unknown ("417--??"); same LNCS volume 1178
%%% as Fatouron:1996:SAS.  TODO resolve the end page from the
%%% proceedings table of contents.
@Article{Feuerstein:1996:MTP,
  author =       "E. Feuerstein and A. S. {De Loma}",
  title =        "On Multi-threaded Paging",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1178",
  pages =        "417--??",
  year =         "1996",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Aug 22 11:59:49 MDT 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

%%% NOTE(review): repaired a line-break hyphenation artifact in the
%%% keywords field: "I-WAY distributed- computing experiment" had a
%%% spurious space after the hyphen; now "distributed-computing".
@InProceedings{Foster:1996:MIW,
  author =       "I. Foster and J. Geisler and S. Tuecke",
  title =        "{MPI} on the {I-WAY}: a wide-area, multimethod
                  implementation of the {Message Passing Interface}",
  crossref =     "IEEE:1996:PSM",
  pages =        "10--17",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C5620W (Other computer networks); C6110B (Software
                  engineering techniques); C6115 (Programming support);
                  C6130S (Data security); C6150E (General utility
                  programs); C6150N (Distributed systems software)",
  conftitle =    "Proceedings. Second MPI Developer's Conference",
  corpsource =   "Argonne Nat. Lab., IL, USA",
  keywords =     "application program interfaces; authentication;
                  automatic configuration mechanisms; communication
                  mechanisms; geographically distributed computing
                  resources; geographically distributed database
                  resources; geographically distributed graphics
                  resources; geographically distributed networking;
                  heterogeneous systems; high-speed wide-area networks;
                  I-WAY distributed-computing experiment; message
                  authentication; message passing; Message Passing
                  Interface; MPICH; Nexus multithreaded runtime system;
                  parallel programming; portable high-performance
                  programming model; process creation; programming
                  environments; software environment; software libraries;
                  utility programs; wide area networks",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                  Process",
  treatment =    "P Practical",
}

%%% NOTE(review): idealibrary.com URLs are likely stale; the DOI field
%%% is the durable locator.  Several keyword fragments ("distributed;",
%%% "dynamic;", "mechanism;") look like phrases split during database
%%% merging --- left as recorded pending source check.
@Article{Foster:1996:NAI,
  author =       "Ian Foster and Carl Kesselman and Steven Tuecke",
  title =        "The {Nexus} Approach to Integrating Multithreading and
                  Communication",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "37",
  number =       "1",
  pages =        "70--82",
  day =          "25",
  month =        aug,
  year =         "1996",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1996.0108",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:00 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0108/production;
                  http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0108/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6150C (Compilers,
                  interpreters and other processors); C6150N (Distributed
                  systems software)",
  corpsource =   "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
                  USA",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "asynchronous messaging; client-server systems;
                  compiler target; data communication; distributed;
                  distributed-memory systems; dynamic; dynamic
                  communication; global memory model; global pointer;
                  mechanism; memory systems; message passing;
                  multithreading; Nexus runtime system; parallel
                  languages; parallel programming; program compilers;
                  remote service request; synchronisation; thread
                  creation",
  treatment =    "P Practical",
}

%%% NOTE(review): dropped two idealibrary links that pointed at DOI
%%% 10.1006/jpdc.1996.0103 --- a different article in the same issue;
%%% this entry's DOI field is 10.1006/jpdc.1996.0104 (pages 5--20).
@Article{Goldstein:1996:LTI,
  author =       "Seth Copen Goldstein and Klaus Erik Schauser and David
                  E. Culler",
  title =        "Lazy Threads: Implementing a Fast Parallel Call",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "37",
  number =       "1",
  pages =        "5--20",
  day =          "25",
  month =        aug,
  year =         "1996",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1996.0104",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:00 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0104/production;
                  http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0104/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C4240P (Parallel programming and algorithm theory);
                  C6120 (File organisation)",
  corpsource =   "Comput. Sci. Div., California Univ., Berkeley, CA,
                  USA",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "code generation strategy; lazy threads; multithreaded
                  execution models; parallel call; parallel programming;
                  parallel-ready sequential call; storage management",
  treatment =    "T Theoretical or Mathematical",
}

%%% NOTE(review): the note's "technical report 96-13" is presumably the
%%% SUNY Buffalo CS technical-report series --- verify; no URL recorded.
@MastersThesis{Gollapudi:1996:MCA,
  author =       "Sreenivas Gollapudi",
  title =        "A multithreaded client-server architecture for
                  distributed multimedia systems",
  type =         "Thesis ({M.S.})",
  school =       "Dept. of Computer Science, State University of New
                  York at Buffalo",
  address =      "Buffalo, NY, USA",
  pages =        "viii + 72",
  year =         "1996",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Also available as technical report 96-13.",
  acknowledgement = ack-nhfb,
  keywords =     "Electronic data processing -- Distributed processing;
                  Multimedia systems -- Design and construction;
                  Multitasking (Computer science)",
}

%%% NOTE(review): the ISBN on this @Article belongs to the co-published
%%% conference proceedings described in the note field; retained per
%%% this file's convention for proceedings republished as journal issues.
@Article{Grunwald:1996:WPO,
  author =       "Dirk Grunwald and Richard Neves",
  title =        "Whole-Program Optimization for Time and Space
                  Efficient Threads",
  journal =      j-SIGPLAN,
  volume =       "31",
  number =       "9",
  pages =        "50--59",
  month =        sep,
  year =         "1996",
  CODEN =        "SINODQ",
  ISBN =         "0-89791-767-7",
  ISBN-13 =      "978-0-89791-767-4",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                  (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat May 1 15:50:57 MDT 1999",
  bibsource =    "http://www.acm.org/pubs/toc/;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Co-published as SIGOPS Operating Systems Review {\bf
                  30}(5), December 1996, and as SIGARCH Computer
                  Architecture News, {\bf 24}(special issue), October
                  1996.",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/asplos/237090/p50-grunwald/",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "algorithms; design; languages; performance",
  subject =      "{\bf D.3.4} Software, PROGRAMMING LANGUAGES,
                  Processors, Optimization. {\bf C.1.2} Computer Systems
                  Organization, PROCESSOR ARCHITECTURES, Multiple Data
                  Stream Architectures (Multiprocessors), Parallel
                  processors**. {\bf D.1.3} Software, PROGRAMMING
                  TECHNIQUES, Concurrent Programming, Parallel
                  programming.",
}

%%% NOTE(review): the heavily duplicated keywords (e.g. repeated
%%% "multithreading; Multithreading") appear to result from merging
%%% Compendex, INSPEC, and UnCover records (see bibsource/classcodes/
%%% classification); left as recorded.
@Article{Hamilton:1996:JSN,
  author =       "Marc A. Hamilton",
  title =        "{Java} and the Shift to Net-Centric Computing",
  journal =      j-COMPUTER,
  volume =       "29",
  number =       "8",
  pages =        "31--39",
  month =        aug,
  year =         "1996",
  CODEN =        "CPTRB4",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Sat Mar 15 08:49:09 MST 1997",
  bibsource =    "Compendex database;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                  UnCover library database",
  note =         "Mentions Java's use of Unicode characters.",
  abstract =     "Java, with its write once, run anywhere model, changes
                  the basic techniques by which software is designed,
                  developed, and deployed.",
  acknowledgement = ack-nhfb,
  affiliation =  "Sun Microsystems",
  affiliationaddress = "El Segundo, CA, USA",
  classcodes =   "C6140D (High level languages); C6110J (Object-oriented
                  programming); C7210 (Information services and centres);
                  C6120 (File organisation)",
  classification = "722.1; 722.3; 723; 723.1; 723.1.1; 723.2; 723.3;
                  723.5; C6110J (Object-oriented programming); C6120
                  (File organisation); C6140D (High level languages);
                  C7210 (Information services and centres)",
  corpsource =   "Sun Microsyst., El Segundo, CA, USA",
  fjournal =     "Computer",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
  journalabr =   "Computer",
  keywords =     "application program interfaces; application
                  programming; C; C (programming language); C++; computer
                  aided software; Computer architecture; Computer
                  hardware; Computer networks; Computer operating
                  systems; Computer programming languages; Computer
                  simulation; Computer software; Computer software
                  portability; Distributed database systems; Dynamic
                  linking; engineering; environments; garbage collection;
                  interfaces; Internet; Internet, Object oriented
                  programming; interpreted language; Java; Java
                  programming language; language; management; Memory
                  management; Middleware; Middleware, Computer
                  programming languages; multithreading; Multithreading;
                  multithreading; Multithreading; multithreading; Net
                  centric computing; net-centric computing; Network
                  centric computing; Numeric data types; Object oriented
                  programming; object-; object-oriented languages;
                  object-oriented programming; oriented programming;
                  program compiler; Program compilers; program debugging;
                  Program interpreters; program testing; programming
                  environments; Security of data; software development;
                  Software engineering; software-development life cycle;
                  storage; Storage allocation (computer); Virtual
                  machines; Web browser; Web browsers; World Wide Web",
  treatment =    "P Practical",
}

%%% NOTE(review): idealibrary.com URLs are likely stale; the DOI field
%%% is the durable locator.
@Article{Helmbold:1996:TRC,
  author =       "D. P. Helmbold and C. E. McDowell",
  title =        "A Taxonomy of Race Conditions",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "33",
  number =       "2",
  pages =        "159--164",
  day =          "15",
  month =        mar,
  year =         "1996",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1996.0034",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:18:59 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0034/production;
                  http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0034/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C4230 (Switching theory); C4240P (Parallel
                  programming and algorithm theory); C6110P (Parallel
                  programming)",
  corpsource =   "Dept. of Comput. and Inf. Sci., California Univ.,
                  Santa Cruz, CA, USA",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "access; anomalies; hazards and race conditions;
                  multiple threads; nondeterministic behavior; parallel
                  programming; race conditions taxonomy; timing",
  treatment =    "P Practical; T Theoretical or Mathematical",
}

%%% NOTE(review): the abstract text appears garbled, probably from OCR
%%% or database import: "The subjects observed were 87 computing new to
%%% them." is missing words, "have to by vary in situations" is not
%%% grammatical, and "nearly all subject prefer" should likely read
%%% "subjects".  Verify and repair against the published TOCHI 3(2)
%%% abstract rather than guessing at the lost wording.
@Article{Hertzum:1996:BQO,
  author =       "Morten Hertzum and Erik Fr{\o}kj{\ae}r",
  title =        "Browsing and querying in online documentation: a study
                  of user interfaces and the interaction process",
  journal =      j-TOCHI,
  volume =       "3",
  number =       "2",
  pages =        "136--161",
  month =        jun,
  year =         "1996",
  CODEN =        "ATCIF4",
  ISSN =         "1073-0516 (print), 1557-7325 (electronic)",
  ISSN-L =       "1073-0516",
  bibdate =      "Tue Jan 19 05:49:17 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tochi/;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tochi/1996-3-2/p136-hertzum/",
  abstract =     "A user interface study concerning the usage
                  effectiveness of selected retrieval modes was conducted
                  using an experimental text retrieval system, TeSS,
                  giving access to online documentation of certain
                  programming tools. Four modes of TeSS were compared:
                  (1) browsing, (2) conventional boolean retrieval, (3)
                  boolean retrieval based on Venn diagrams, and (4) these
                  three combined. Further, the modes of TeSS were
                  compared to the use of printed manuals. The subjects
                  observed were 87 computing new to them. In the
                  experiment the use of printed manuals is faster and
                  provides answers of higher quality than any of the
                  electronic modes. Therefore, claims about the
                  effectiveness of computer-based text retrieval have to
                  by vary in situations where printed manuals are
                  manageable to the user. Among the modes of TeSS,
                  browsing is the fastest and the one causing the fewest
                  operational errors. On the same two variables, time and
                  operational errors, the Venn diagram mode performs
                  better than conventional boolean retrieval. The
                  combined mode scores worst on the objective performance
                  measures; nonetheless nearly all subject prefer this
                  mode. Concerning the interaction process, the subjects
                  tend to manage the complexities of the information
                  retrieval tasks by issuing series of simple commands
                  and exploiting the interactive capabilities of TeSS. To
                  characterize the dynamics of the interaction process
                  two concepts are introduced; threads and sequences of
                  tactics. Threads in a query sequence describes the
                  continuity during retrieval. Sequences of tactics
                  concern the combined mode and describe how different
                  retrieval modes succeed each other as the retrieval
                  process evolves.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer-Human Interaction",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J756",
  keywords =     "experimentation; human factors; performance",
  subject =      "{\bf H.5.2} Information Systems, INFORMATION
                  INTERFACES AND PRESENTATION, User Interfaces,
                  Evaluation/methodology. {\bf H.3.3} Information
                  Systems, INFORMATION STORAGE AND RETRIEVAL, Information
                  Search and Retrieval, Query formulation. {\bf H.3.3}
                  Information Systems, INFORMATION STORAGE AND RETRIEVAL,
                  Information Search and Retrieval, Retrieval models.
                  {\bf H.3.4} Information Systems, INFORMATION STORAGE
                  AND RETRIEVAL, Systems and Software. {\bf H.5.2}
                  Information Systems, INFORMATION INTERFACES AND
                  PRESENTATION, User Interfaces, Training, help, and
                  documentation.",
}

%%% NOTE(review): minimal record --- no ISBN/LCCN/URL; MIT theses are
%%% typically archived online (DSpace@MIT) --- consider adding a URL
%%% once verified.
@MastersThesis{Hudson:1996:MDA,
  author =       "Greg Hudson",
  title =        "Multithreaded design in the {Athena} environment",
  type =         "Thesis ({M. Eng.})",
  school =       "Massachusetts Institute of Technology, Department of
                  Electrical Engineering and Computer Science",
  address =      "Cambridge, MA, USA",
  pages =        "240",
  year =         "1996",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

%%% NOTE(review): case-duplicated keywords (e.g. "performance;
%%% Performance", "scheduling; Scheduling") appear to come from merged
%%% Compendex and INSPEC records (see bibsource); left as recorded.
@Article{Hum:1996:SEM,
  author =       "Herbert H. J. Hum and Olivier Maquelin and Kevin B.
                  Theobald and Xinmin Tian and Guang R. Gao and Laurie J.
                  Hendren",
  title =        "A Study of the {EARTH-MANNA} Multithreaded System",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "24",
  number =       "4",
  pages =        "319--348",
  month =        aug,
  year =         "1996",
  CODEN =        "IJPPE5",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Sat Apr 26 11:36:49 MDT 1997",
  bibsource =    "Compendex database;
                  https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Intel Corp",
  affiliationaddress = "OR, USA",
  classification = "722.3; 722.4; 723.5; 731.1; C5220P (Parallel
                  architecture); C5440 (Multiprocessing systems); C5470
                  (Performance evaluation and testing); C6150N
                  (Distributed systems software)",
  corpsource =   "Dept. of Meas., Archit. and Planning, Intel Corp.,
                  Hillsboro, OR, USA",
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  journalabr =   "Int J Parallel Program",
  keywords =     "ASIC synchronization unit; benchmarks; Communication
                  latency; communication latency; Computer architecture;
                  Computer hardware; Computer simulation; Data
                  communication systems; data flow computing;
                  dataflow-like thread synchronizations; earth manna
                  system; EARTH-MANNA multithreaded system; Execution
                  unit; multiprocessing systems; Multiprocessing systems;
                  multiprocessor systems; multithreaded architecture;
                  Multithreaded system; off-the-shelf execution unit;
                  parallel architectures; Parallel processing systems;
                  performance; Performance; performance evaluation;
                  processor scheduling; Program processors; remote
                  requests; Scheduling; scheduling; sequentially-executed
                  code; synchronisation; Synchronization;
                  synchronization; Synchronization unit; uniprocessor
                  performance",
  treatment =    "P Practical",
}

@Article{Hurson:1996:CMD,
  author =       "A. R. Hurson and Krishna M. Kavi and Behrooz Shirazi
                 and Ben Lee",
  title =        "Cache Memories for Dataflow Systems",
  journal =      j-IEEE-PAR-DIST-TECH,
  volume =       "4",
  number =       "4",
  pages =        "50--64",
  month =        "Winter",
  year =         "1996",
  CODEN =        "IPDTEX",
  DOI =          "https://doi.org/10.1109/88.544436",
  ISSN =         "1063-6552 (print), 1558-1861 (electronic)",
  ISSN-L =       "1063-6552",
  bibdate =      "Mon Jun 7 07:52:29 MDT 1999",
  bibsource =    "Compendex database;
                 https://www.math.utah.edu/pub/tex/bib/ieeepardisttech.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://dlib.computer.org/pd/books/pd1996/pdf/p4050.pdf;
                 http://www.computer.org/concurrency/pd1996/p4050abs.htm",
  acknowledgement = ack-nhfb,
  affiliation =  "Pennsylvania State Univ",
  affiliationaddress = "PA, USA",
  classification = "721.1; 722.1; 722.2; 723; 723.1; 731.1; C5220P
                 (Parallel architecture); C5320G (Semiconductor
                 storage); C5440 (Multiprocessing systems); C6110P
                 (Parallel programming); C6120 (File organisation)",
  corpsource =   "Dept. of Comput. Sci. and Eng., Pennsylvania State
                 Univ., University Park, PA, USA",
  fjournal =     "IEEE Parallel and Distributed Technology: Systems and
                 Applications",
  journalabr =   "IEEE Parallel Distrib Technol",
  keywords =     "Algorithms; architectural model; Buffer storage; cache
                 memories; Cache misses; cache storage; Computer
                 architecture; computer architectures; Computer systems
                 programming; Context switching; control flow
                 architecture; control flow processing; dataflow
                 architectures; dataflow computation; dataflow
                 environment; dataflow processing; dataflow program;
                 dataflow programming environments; Dataflow systems;
                 dataflow systems; localities; Memory latencies;
                 Multithreading; parallel architectures; parallel
                 machines; Parallel processing systems; parallel
                 programming; Process control; Program compilers;
                 Program processors; Sequential switching; Storage
                 allocation (computer); temporal; Throughput; Virtual
                 storage",
  treatment =    "P Practical",
}

@PhdThesis{Joerg:1996:CSP,
  author =       "Christopher F. (Christopher Frank) Joerg",
  title =        "The {Cilk} system for parallel multithreaded
                 computing",
  type =         "Thesis ({Ph.D.})",
  school =       "Massachusetts Institute of Technology, Department of
                 Electrical Engineering and Computer Science",
  address =      "Cambridge, MA, USA",
  pages =        "199",
  year =         "1996",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Karamcheti:1996:RME,
  author =       "Vijay Karamcheti and John Plevyak and Andrew A.
                 Chien",
  title =        "Runtime Mechanisms for Efficient Dynamic
                 Multithreading",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "37",
  number =       "1",
  pages =        "21--40",
  day =          "25",
  month =        aug,
  year =         "1996",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1996.0105",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:00 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0105/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0105/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C4240P (Parallel programming and algorithm theory);
                 C5220P (Parallel architecture); C6150C (Compilers,
                 interpreters and other processors)",
  corpsource =   "Dept. of Comput. Sci., Illinois Univ., Urbana, IL,
                 USA",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "compiler; distributed memory machines; distributed
                 memory systems; dynamic multithreading; hybrid;
                 Illinois Concert runtime system; parallel; parallel
                 architectures; program compilers; programming; pull
                 messaging; stack-heap; threads",
  treatment =    "P Practical",
}

@Book{Kleiman:1996:PT,
  author =       "Steve Kleiman and Devang Shah and Bart Smaalders",
  title =        "Programming with threads",
  publisher =    pub-PH,
  address =      pub-PH:adr,
  pages =        "xxviii + 534",
  year =         "1996",
  ISBN =         "0-13-172389-8",
  ISBN-13 =      "978-0-13-172389-4",
  LCCN =         "QA76.58 .K53 1996",
  bibdate =      "Fri May 10 12:18:17 MDT 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "multitasking (computer science); parallel programming
                 (computer science); synchronization",
}

@Article{Leary:1996:CEH,
  author =       "S. Leary",
  title =        "{C++} exception handling in multithreaded programs",
  journal =      j-C-PLUS-PLUS-REPORT,
  volume =       "8",
  number =       "2",
  pages =        "20--31",
  month =        feb,
  year =         "1996",
  CODEN =        "CRPTE7",
  ISSN =         "1040-6042",
  bibdate =      "Tue Mar 25 13:34:48 MST 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes =   "C6110J (Object-oriented programming); C6140D (High
                 level languages); C6150J (Operating systems); C6130
                 (Data handling techniques)",
  corpsource =   "Dresser-Wayne Ind., USA",
  fjournal =     "C++ Report",
  keywords =     "C language; C++; exception handling; exception-aware
                 thread class; exception-safe programming; lightweight
                 threads; multiprogramming; multitasking; multithreaded
                 programs; object oriented programming; object-;
                 object-oriented programming; operating; oriented
                 languages; OS/2; reusable C++ classes; software
                 reusability; Solaris; systems; systems (computers);
                 thread manager class; thread-safe reference counting
                 class; Windows 95; Windows NT",
  treatment =    "P Practical",
}

@Book{Lewis:1996:TPG,
  author =       "Bil Lewis and Daniel J. Berg",
  title =        "Threads Primer: a Guide to Multithreaded
                 Programming",
  publisher =    pub-SUNSOFT,
  address =      pub-SUNSOFT:adr,
  pages =        "xxvi + 319",
  year =         "1996",
  ISBN =         "0-13-443698-9",
  ISBN-13 =      "978-0-13-443698-2",
  LCCN =         "QA76.642 .L478 1996",
  bibdate =      "Fri Apr 11 17:06:46 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "Sun BluePrints Program",
  acknowledgement = ack-nhfb,
  keywords =     "POSIX (Computer software standard); Threads (Computer
                 programs); UNIX (Computer file)",
}

@Article{Lim:1996:LPB,
  author =       "Beng-Hong Lim and Ricardo Bianchini",
  title =        "Limits on the performance benefits of multithreading
                 and prefetching",
  journal =      j-SIGMETRICS,
  volume =       "24",
  number =       "1",
  pages =        "37--46",
  month =        may,
  year =         "1996",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/233008.233021",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Thu Jun 26 11:21:30 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper presents new analytical models of the
                 performance benefits of multithreading and prefetching,
                 and experimental measurements of parallel applications
                 on the MIT Alewife multiprocessor. For the first time,
                 both techniques are evaluated on a real machine as
                 opposed to simulations. The models determine the region
                 in the parameter space where the techniques are most
                 effective, while the measurements determine the region
                 where the applications lie. We find that these regions
                 do not always overlap significantly. The multithreading
                 model shows that only 2-4 contexts are necessary to
                 maximize this technique's potential benefit in current
                 multiprocessors. Multithreading improves execution time
                 by less than 10\% for most of the applications that we
                 examined. The model also shows that multithreading can
                 significantly improve the performance of the same
                 applications in multiprocessors with longer latencies.
                 Reducing context-switch overhead is not crucial. The
                 software prefetching model shows that allowing 4
                 outstanding prefetches is sufficient to achieve most of
                 this technique's potential benefit on current
                 multiprocessors. Prefetching improves performance over
                 a wide range of parameters, and improves execution time
                 by as much as 20-50\% even on current multiprocessors.
                 The two models show that prefetching has a significant
                 advantage over multithreading for machines with low
                 memory latencies and/or applications with high cache
                 miss rates because a prefetch instruction consumes less
                 time than a context-switch.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
}

@Article{Lowenthal:1996:UFG,
  author =       "David K. Lowenthal and Vincent W. Freeh and Gregory R.
                 Andrews",
  title =        "Using Fine-Grain Threads and Run-Time Decision Making
                 in Parallel Computing",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "37",
  number =       "1",
  pages =        "41--54",
  day =          "25",
  month =        aug,
  year =         "1996",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1996.0106",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:00 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0106/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0106/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C6110P (Parallel
                 programming); C4240P (Parallel programming and algorithm
                 theory)",
  corpsource =   "Dept. of Comput. Sci., Arizona Univ., Tucson, AZ,
                 USA",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "computing; distributed shared memory;
                 distributed-memory multiprocessors; fine-grain;
                 fine-grain threads; parallel; parallel architectures;
                 parallel programming; parallelism; run-time decision
                 making",
  treatment =    "P Practical",
}

@Article{Mane:1996:SJP,
  author =       "I. Mane",
  title =        "Survey of the {Java} programming language",
  journal =      j-ELECTRONIK,
  volume =       "45",
  number =       "17",
  pages =        "84--87",
  day =          "20",
  month =        "????",
  year =         "1996",
  CODEN =        "EKRKAR",
  ISSN =         "0013-5658",
  bibdate =      "Sat Mar 15 08:49:09 MST 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes =   "C6140D (High level languages); C6150C (Compilers,
                 interpreters and other processors)",
  countrypub =   "Germany",
  fjournal =     "Elektronik",
  keywords =     "fixed; high level languages; Java programming
                 language; memory partitions; multi-threading; program
                 compilers; source code compiler",
  language =     "German",
  treatment =    "G General Review",
}

@PhdThesis{Mao:1996:PMS,
  author =       "Weihua Mao",
  title =        "Performance modeling of data prefetching and
                 multithreading in scalable multiprocessors",
  type =         "Thesis ({Ph.D.})",
  school =       "University of Southern California",
  address =      "Los Angeles, CA, USA",
  pages =        "xi + 130",
  year =         "1996",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  alttitle =     "Performance modeling of data prefetching and
                 multithreading in scalable multiprocessors",
}

@Article{McManis:1996:JDSa,
  author =       "Chuck McManis",
  title =        "{Java} In Depth: Synchronizing threads in {Java}",
  journal =      j-JAVAWORLD,
  volume =       "1",
  number =       "2",
  pages =        "??--??",
  month =        apr,
  year =         "1996",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 08:48:26 MDT 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-04-1996/jw-04-synch.htm",
  acknowledgement = ack-nhfb,
}

@Article{McManis:1996:JDSb,
  author =       "Chuck McManis",
  title =        "{Java} In Depth: Synchronizing threads in {Java},
                 {Part II}",
  journal =      j-JAVAWORLD,
  volume =       "1",
  number =       "3",
  pages =        "??--??",
  month =        may,
  year =         "1996",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 08:48:26 MDT 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-05-1996/jw-05-mcmanis.htm",
  acknowledgement = ack-nhfb,
}

@Article{McManis:1996:JDT,
  author =       "Chuck McManis",
  title =        "{Java} In Depth: Threads and applets and visual
                 controls",
  journal =      j-JAVAWORLD,
  volume =       "1",
  number =       "5",
  pages =        "??--??",
  month =        jul,
  year =         "1996",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 08:48:26 MDT 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-07-1996/jw-07-mcmanis.htm",
  acknowledgement = ack-nhfb,
}

@Article{Mikschl:1996:MMS,
  author =       "A. Mikschl and W. Damm",
  title =        "{MSparc}: a Multithreaded {Sparc}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1124",
  pages =        "461--??",
  year =         "1996",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Oct 29 14:12:39 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@MastersThesis{Mishra:1996:TIS,
  author =       "Amitabh Mishra",
  title =        "Task and instruction scheduling in parallel
                 multithreaded processors",
  type =         "Thesis ({M.S.})",
  school =       "Department of Computer Science, Texas A\&M
                 University",
  address =      "College Station, TX, USA",
  pages =        "ix + 60",
  year =         "1996",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Major computer science",
}

@Article{Mitchell:1996:JTM,
  author =       "John D. Mitchell",
  title =        "{Java} Tips: More about threads and the resize
                 problem",
  journal =      j-JAVAWORLD,
  volume =       "1",
  number =       "4",
  pages =        "??--??",
  month =        jun,
  year =         "1996",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 08:48:26 MDT 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/javatips/jw-javatip9.htm",
  acknowledgement = ack-nhfb,
}

@Book{Moore:1996:MPD,
  author =       "Simon W. (Simon William) Moore",
  title =        "Multithreaded processor design",
  volume =       "SECS 358",
  publisher =    pub-KLUWER,
  address =      pub-KLUWER:adr,
  pages =        "xvi + 142",
  year =         "1996",
  ISBN =         "0-7923-9718-5",
  ISBN-13 =      "978-0-7923-9718-2",
  LCCN =         "QA76.5 .M574 1996",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "The Kluwer international series in engineering and
                 computer science",
  acknowledgement = ack-nhfb,
  keywords =     "Computer architecture; computer architecture;
                 Computers -- Design; multiprocessors -- design and
                 construction; Multiprocessors -- Design and
                 construction; Parallel computers; parallel computers",
}

@Book{Nichols:1996:PP,
  author =       "Bradford Nichols and Bick Buttlar and Jackie Proulx
                 Farrell",
  title =        "{Pthreads} Programming",
  publisher =    pub-ORA,
  address =      pub-ORA:adr,
  pages =        "xvi + 267",
  year =         "1996",
  ISBN =         "1-56592-115-1",
  ISBN-13 =      "978-1-56592-115-3",
  LCCN =         "QA76.642.N53 1996",
  bibdate =      "Mon May 11 11:04:53 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  price =        "US\$29.95",
  URL =          "http://www.amazon.com/exec/obidos/ASIN/1565921151/ref=sim_books/002-4892305-5599452;
                 http://www.oreilly.com/catalog/pthread",
  acknowledgement = ack-nhfb,
}

@Book{Northrup:1996:PUT,
  author =       "Charles J. Northrup",
  title =        "Programming with {UNIX} Threads",
  publisher =    pub-WILEY,
  address =      pub-WILEY:adr,
  pages =        "xv + 399",
  year =         "1996",
  ISBN =         "0-471-13751-0 (paperback)",
  ISBN-13 =      "978-0-471-13751-1 (paperback)",
  LCCN =         "QA76.76.O63 N674 1996",
  bibdate =      "Tue May 25 07:14:38 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "operating systems (computers); UNIX (computer file)",
}

@Book{Norton:1996:TTM,
  author =       "Scott J. Norton and Mark D. DiPasquale",
  title =        "Thread time: the multithreaded programming guide",
  publisher =    pub-PH,
  address =      pub-PH:adr,
  pages =        "xx + 538",
  year =         "1996",
  ISBN =         "0-13-190067-6 (paperback)",
  ISBN-13 =      "978-0-13-190067-7 (paperback)",
  LCCN =         "QA76.642.N67 1996",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "Hewlett--Packard professional books",
  URL =          "http://www.amazon.com/exec/obidos/ASIN/0131900676/ref=sim_books/002-4892305-5599452",
  acknowledgement = ack-nhfb,
  annote =       "System requirements: IBM compatible PC; CD-ROM
                 drive.",
  keywords =     "Parallel programming (Computer science)",
}

@Book{Pham:1996:MPW,
  author =       "Thuan Q. Pham and Pankaj K. Garg",
  title =        "Multithreaded programming with {Windows NT}",
  publisher =    pub-PHPTR,
  address =      pub-PHPTR:adr,
  pages =        "xviii + 227",
  year =         "1996",
  ISBN =         "0-13-120643-5",
  ISBN-13 =      "978-0-13-120643-4",
  LCCN =         "QA76.642 .P52 1996",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  annote =       "One 3 1/2 in. diskette in pocket inside back cover.",
  keywords =     "Microsoft Windows NT; multiprocessors;
                 Multiprocessors; Parallel programming; parallel
                 programming (computer science); Parallel programming
                 (Computer science)",
}

@Article{Philbin:1996:TSC,
  author =       "James Philbin and Jan Edler and Otto J. Anshus and
                 Craig C. Douglas and Kai Li",
  title =        "Thread Scheduling for Cache Locality",
  journal =      j-SIGPLAN,
  volume =       "31",
  number =       "9",
  pages =        "60--71",
  month =        sep,
  year =         "1996",
  CODEN =        "SINODQ",
  ISBN =         "0-89791-767-7",
  ISBN-13 =      "978-0-89791-767-4",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:17:23 MST 2003",
  bibsource =    "http://portal.acm.org/; http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Co-published as SIGOPS Operating Systems Review {\bf
                 30}(5), December 1996, and as SIGARCH Computer
                 Architecture News, {\bf 24}(special issue), October
                 1996.",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/asplos/237090/p60-philbin/",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "algorithms; experimentation; performance",
  subject =      "{\bf D.3.4} Software, PROGRAMMING LANGUAGES,
                 Processors, Optimization. {\bf I.1.2} Computing
                 Methodologies, SYMBOLIC AND ALGEBRAIC MANIPULATION,
                 Algorithms, Algebraic algorithms. {\bf F.2.2} Theory of
                 Computation, ANALYSIS OF ALGORITHMS AND PROBLEM
                 COMPLEXITY, Nonnumerical Algorithms and Problems,
                 Sequencing and scheduling. {\bf F.2.1} Theory of
                 Computation, ANALYSIS OF ALGORITHMS AND PROBLEM
                 COMPLEXITY, Numerical Algorithms and Problems,
                 Computations on matrices. {\bf D.2.2} Software,
                 SOFTWARE ENGINEERING, Design Tools and Techniques, User
                 interfaces.",
}

@Book{Robbins:1996:PUP,
  author =       "Kay A. Robbins and Steven Robbins",
  title =        "Practical {UNIX} programming: a guide to concurrency,
                 communication, and multithreading",
  publisher =    pub-PHPTR,
  address =      pub-PHPTR:adr,
  pages =        "xiv + 658",
  year =         "1996",
  ISBN =         "0-13-443706-3",
  ISBN-13 =      "978-0-13-443706-4",
  LCCN =         "QA76.76.O63 R615 1996",
  bibdate =      "Tue May 25 07:14:38 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Microcomputers -- Programming; Operating systems; UNIX
                 (Computer file)",
}

@Article{Roh:1996:GOE,
  author =       "Lucas Roh and Walid A. Najjar and Bhanu Shankar and A.
                 P. Wim B{\"o}hm",
  title =        "Generation, Optimization, and Evaluation of
                 Multithreaded Code",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "32",
  number =       "2",
  pages =        "188--204",
  day =          "1",
  month =        feb,
  year =         "1996",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1996.0013",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:18:59 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0013/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0013/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C1180 (Optimisation techniques); C4230M
                 (Multiprocessor interconnection); C5220P (Parallel
                 architecture); C6110P (Parallel programming); C6150C
                 (Compilers, interpreters and other processors); C6150N
                 (Distributed systems software)",
  corpsource =   "Dept. of Comput. Sci., Colorado State Univ., Fort
                 Collins, CO, USA",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "architectures; code generation scheme; compiler
                 intermediate; form; global bottom-up optimization
                 technique; inputs; instruction level; intrathread
                 locality; latency tolerance; multiprocessor
                 interconnection networks; multithreaded; multithreaded
                 code; multithreaded code evaluation; multithreaded code
                 generation; multithreaded computation model;
                 multithreaded synchronization; optimisation; optimising
                 compilers; parallel; parallel architectures;
                 parallelising compilers; parallelism; Pebbles;
                 processor scheduling; processor utilization; program
                 level; programming; reduced instruction set computing;
                 scalability; synchronisation; synchronization costs;
                 top-down code generation",
  treatment =    "T Theoretical or Mathematical",
}

@Article{Ruddock:1996:MPG,
  author =       "David E. Ruddock and Balakrishnan Dasarathy",
  title =        "Multithreading Programs: Guidelines for {DCE}
                 Applications",
  journal =      j-IEEE-SOFTWARE,
  volume =       "13",
  number =       "1",
  pages =        "80--90",
  month =        jan,
  year =         "1996",
  CODEN =        "IESOEG",
  ISSN =         "0740-7459 (print), 1937-4194 (electronic)",
  ISSN-L =       "0740-7459",
  bibdate =      "Sat Jan 25 07:35:26 MST 1997",
  bibsource =    "Compendex database;
                 https://www.math.utah.edu/pub/tex/bib/ieeesoft.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Bellcore",
  affiliationaddress = "Piscataway, NJ, USA",
  classification = "722.2; 722.4; 723.1; 723.2; 723.3",
  fjournal =     "IEEE Software",
  journal-URL =  "http://www.computer.org/portal/web/csdl/magazines/software",
  journalabr =   "IEEE Software",
  keywords =     "Application programming interfaces; Client server
                 architecture; Computer aided software engineering;
                 Computer operating systems; Computer programming
                 languages; Concurrency control; Data communication
                 systems; Data structures; Distributed computer systems;
                 Distributed computing environment; Multithreading;
                 Network services; Remote procedure call; Security of
                 data; Synchronization; Telecommunication services; User
                 interfaces",
}

@InProceedings{Sah:1996:PIS,
  author =       "A. Sah and K. Brown and E. Brewer",
  title =        "Programming the {Internet} from the server-side with
                 {Tcl} and {Audience1}",
  crossref =     "USENIX:1996:ATT",
  pages =        "235--??, 183--188",
  year =         "1996",
  bibdate =      "Sat Mar 15 08:49:09 MST 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes =   "C6150N (Distributed systems software); C6115
                 (Programming support); C6110 (Systems analysis and
                 programming); C6140D (High level languages); C7230
                 (Publishing and reproduction); C7250N (Front end
                 systems for online searching)",
  conflocation = "Monterey, CA, USA; 10--13 July 1996",
  conftitle =    "Proceedings of 4th Annual Tcl/Tk Workshop '96",
  corpsource =   "Inktomi Corp., Berkeley, CA, USA",
  keywords =     "applications; Audience1; authoring languages;
                 client-server; client-server systems; client-side
                 languages; electronic; end-; extension library; HotBot
                 search engine; HotWired; Inktomi; Internet; mass
                 customization features; MTtcl; multi-threaded Tcl;
                 online front-ends; programming; publishing; server
                 languages; server-side Internet programming; software
                 libraries; to-end publishing tool; World Wide Web",
  treatment =    "P Practical",
}

@Article{Schmidt:1996:CAPa,
  author =       "D. C. Schmidt and S. Vinoski",
  title =        "Comparing alternative programming techniques for
                 multithreaded servers",
  journal =      j-C-PLUS-PLUS-REPORT,
  volume =       "8",
  number =       "2",
  pages =        "50--59",
  month =        feb,
  year =         "1996",
  CODEN =        "CRPTE7",
  ISSN =         "1040-6042",
  bibdate =      "Tue Mar 25 13:34:48 MST 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes =   "C6150N (Distributed systems software); C6110J
                 (Object-oriented programming); C6160 (Database
                 management systems (DBMS)); C6140D (High level
                 languages)",
  corpsource =   "Washington Univ., St. Louis, MO, USA",
  fjournal =     "C++ Report",
  keywords =     "applications; C; C language; C++; client-server
                 systems; CORBA; database management; desktop client;
                 financial data processing; investment brokers;
                 languages; multithreaded servers; multithreaded
                 systems; object-oriented; object-oriented programming;
                 programming; query processing; stock prices; stock
                 quote database; synchronization; systems; wrappers",
  treatment =    "P Practical",
}

@Article{Schmidt:1996:CAPb,
  author =       "D. C. Schmidt and S. Vinoski",
  title =        "Comparing alternative programming techniques for
                 multithreaded {CORBA} servers",
  journal =      j-C-PLUS-PLUS-REPORT,
  volume =       "8",
  number =       "4",
  pages =        "56--66",
  month =        apr,
  year =         "1996",
  CODEN =        "CRPTE7",
  ISSN =         "1040-6042",
  bibdate =      "Tue Mar 25 13:34:48 MST 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes =   "C6110J (Object-oriented programming); C6110P (Parallel
                 programming); C6140D (High level languages)",
  corpsource =   "Washington Univ., St. Louis, MO, USA",
  fjournal =     "C++ Report",
  keywords =     "C language; complexity; distributed multithreaded
                 applications; multithreaded CORBA servers;
                 object-oriented programming; parallel; programming;
                 programming techniques",
  treatment =    "P Practical",
}

@Article{Schmidt:1996:CAPc,
  author =       "D. C. Schmidt and S. Vinoski",
  title =        "Comparing alternative programming techniques for
                 multithreaded {CORBA} servers",
  journal =      j-C-PLUS-PLUS-REPORT,
  volume =       "8",
  number =       "7",
  pages =        "47--56",
  month =        jul,
  year =         "1996",
  CODEN =        "CRPTE7",
  ISSN =         "1040-6042",
  bibdate =      "Tue Mar 25 13:34:48 MST 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes =   "C6110J (Object-oriented programming); C6150N
                 (Distributed systems software); C5690 (Other data
                 communication equipment and techniques); C6110P
                 (Parallel programming)",
  corpsource =   "Washington Univ., St. Louis, MO, USA",
  fjournal =     "C++ Report",
  keywords =     "alternative programming techniques; C; C++ wrappers;
                 concurrency model; CORBA; multithreaded CORBA;
                 multithreaded stock quote servers; network servers;
                 object-oriented programming; parallel; programming;
                 servers; thread per request; thread per session model;
                 thread pool",
  treatment =    "P Practical",
}

@Article{Severance:1996:MOB,
  author =       "Charles Severance and Richard Enbody and Paul
                 Petersen",
  title =        "Managing the Overall Balance of Operating System
                 Threads on a Multiprocessor Using Automatic
                 Self-Allocating Threads ({ASAT})",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "37",
  number =       "1",
  pages =        "106--112",
  day =          "25",
  month =        aug,
  year =         "1996",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1996.0111",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:00 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0111/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0111/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessing systems); C6110P (Parallel
                 programming); C6150J (Operating systems); C6150N
                 (Distributed systems software)",
  corpsource =   "Dept. of Comput. Sci., Michigan State Univ., East
                 Lansing, MI, USA",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "allocation; automatic self-allocating threads;
                  multiprocessing system; multiprocessing systems;
                  operating system; operating systems (computers);
                  parallel programming; processor scheduling; run-time
                  environment; self-scheduling; thread; thread
                  scheduling",
  treatment =    "P Practical; X Experimental",
}

@Article{Sigmund:1996:IBM,
  author =       "U. Sigmund and T. Ungerer",
  title =        "Identifying Bottlenecks in a Multithreaded Superscalar
                 Microprocessor",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1124",
  pages =        "797--??",
  year =         "1996",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Oct 29 14:12:39 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Skjellum:1996:TTM,
  author =       "A. Skjellum and B. Protopopov and S. Hebert",
  title =        "A thread taxonomy for {MPI}",
  crossref =     "IEEE:1996:PSM",
  pages =        "50--57",
  year =         "1996",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110B (Software engineering techniques); C6110F
                 (Formal methods); C6150E (General utility programs);
                 C6150J (Operating systems); C6150N (Distributed systems
                 software)",
  conftitle =    "Proceedings. Second MPI Developer's Conference",
  corpsource =   "Dept. of Comput. Sci., Mississippi State Univ., MS,
                 USA",
  keywords =     "API extensions; application program interfaces;
                 Channel Device; computational unit; fine-grain
                 concurrency; formal specification; message passing;
                 minimal portable thread management; MPI; MPICH;
                 multi-threaded thread-safe ADI; non-thread-safe MPI
                 call semantics; resource container; software
                 portability; synchronisation; synchronization
                 mechanisms; thread models; thread safety; thread
                 taxonomy; user-level mechanism; utility programs;
                 Windows NT version",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  treatment =    "P Practical",
}

@Article{Sundaresan:1996:COO,
  author =       "Neelakantan Sundaresan and Dennis Gannon",
  title =        "{Coir}: An Object-Oriented System for Control and
                 Dynamic Data Parallelism",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "37",
  number =       "1",
  pages =        "98--105",
  day =          "25",
  month =        aug,
  year =         "1996",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1996.0110",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:00 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0110/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0110/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C4240P (Parallel programming and algorithm theory);
                 C5220P (Parallel architecture); C6110J (Object-oriented
                 programming); C6110P (Parallel programming); C6150N
                 (Distributed systems software)",
  corpsource =   "Applic. Dev. Technol. Inst., IBM Corp., San Jose, CA,
                 USA",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "C++ library; Coir; distributed memory machines;
                 distributed memory systems; dynamic data parallelism;
                 message passing; message-passing; multithreading;
                 object-oriented; object-oriented system; operating
                 system; parallel; parallel architectures; parallel
                 programming; programming; shared memory systems;
                 symmetric multiprocessors; synchronisation",
  treatment =    "P Practical; T Theoretical or Mathematical",
}

@Article{Tullsen:1996:ECI,
  author =       "Dean M. Tullsen and Susan J. Eggers and Joel S. Emer
                 and Henry M. Levy and Jack L. Lo and Rebecca L. Stamm",
  title =        "Exploiting choice: instruction fetch and issue on an
                 implementable simultaneous multithreading processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "191--202",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@PhdThesis{Tullsen:1996:SM,
  author =       "Dean Michael Tullsen",
  title =        "Simultaneous multithreading",
  type =         "Thesis ({Ph.D.})",
  school =       "University of Washington",
  address =      "Seattle, WA, USA",
  pages =        "vi + 99",
  year =         "1996",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Computer architecture; Parallel processing (Electronic
                 computers)",
}

@MastersThesis{Verriello:1996:MSM,
  author =       "Anthony Verriello",
  title =        "Memory sharing in multithreaded transaction
                 environments",
  type =         "Thesis ({M.S.})",
  school =       "Hofstra University",
  address =      "Hempstead, NY, USA",
  pages =        "180",
  year =         "1996",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Memory, Virtual (Computer science); Transaction
                 systems (Computer systems)",
}

@Article{Vinoski:1996:DCD,
  author =       "S. Vinoski and D. C. Schmidt",
  title =        "Distributed callbacks and decoupled communication in
                 {CORBA}",
  journal =      j-C-PLUS-PLUS-REPORT,
  volume =       "8",
  number =       "9",
  pages =        "48--56, 77",
  month =        oct,
  year =         "1996",
  CODEN =        "CRPTE7",
  ISSN =         "1040-6042",
  bibdate =      "Tue Mar 25 13:34:48 MST 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes =   "C6150N (Distributed systems software); C6110J
                  (Object-oriented programming)",
  corpsource =   "Hewlett--Packard's Distributed Comput. Program,
                 Chelmsford, MA, USA",
  fjournal =     "C++ Report",
  keywords =     "client-server systems; client/server; concurrency
                 control; concurrency models; consumers; CORBA;
                 decoupled communication; decoupled peer-to-peer;
                 distributed callbacks; distributed object computing
                 systems; distributed stock quoting; multithreaded;
                 object-oriented; OMG Events object service;
                 programming; relationships; request communication;
                 response communication; server applications; suppliers;
                 systems",
  treatment =    "P Practical",
}

@Article{Vlassov:1996:AMM,
  author =       "V. Vlassov and L.-E. Thorelli",
  title =        "Analytical Models of Multithreading with Data
                 Prefetching",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1124",
  pages =        "714--??",
  year =         "1996",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Oct 29 14:12:39 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Wise:1996:SDP,
  author =       "David S. Wise and Joshua Walgenbach",
  title =        "Static and dynamic partitioning of pointers as links
                 and threads",
  journal =      j-SIGPLAN,
  volume =       "31",
  number =       "6",
  pages =        "42--49",
  month =        jun,
  year =         "1996",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:17:20 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Dept. of Comput. Sci., Indiana Univ., Bloomington, IN,
                 USA",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Wismuller:1996:IDP,
  author =       "Roland Wism{\"u}ller and Michael Oberhuber and Johann
                 Krammer and Olav Hansen",
  title =        "Interactive debugging and performance analysis of
                 massively parallel applications",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "22",
  number =       "3",
  pages =        "415--442",
  day =          "29",
  month =        apr,
  year =         "1996",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Fri Aug 6 10:14:54 MDT 1999",
  bibsource =    "Compendex database;
                 http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1996&volume=22&issue=3;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1996&volume=22&issue=3&aid=1049",
  acknowledgement = ack-nhfb,
  affiliation =  "Inst. f{\"u}r Informatik der Technischen
                 Universit{\"a}t M{\"u}nchen",
  affiliationaddress = "M{\"u}nchen, Ger",
  classification = "722.2; 722.4; 723.1; 723.2; 723.5; C6110P (Parallel
                 programming); C6115 (Programming support); C6150G
                 (Diagnostic, testing, debugging and evaluating
                 systems)",
  corpsource =   "Inst. f{\"u}r Inf., Tech. Univ. M{\"u}nchen, Germany",
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
  journalabr =   "Parallel Comput",
  keywords =     "applications; attributed measurements; Codes
                 (symbols); Computer debugging; Computer programming;
                 Computer simulation; debugger; debugging; DETOP;
                 Distributed computer systems; distributed evaluation;
                 Distributed online monitoring system; environments;
                 Interactive computer systems; Interactive debugging;
                 intrusion; massively parallel; Massively parallel
                 applications; minimal; monitoring system; multithreaded
                 programming models; Online systems; parallel; Parallel
                 debugger; Parallel processing systems; parallel
                 programming; Parallelization; PATOP; Performance;
                 performance analysis; Performance analysis; performance
                 analyzer; performance bottlenecks; Personal computers;
                 PowerPC; program debugging; programming; scalability;
                 software; software performance evaluation;
                 Supercomputers; tools; usability; User interfaces",
  treatment =    "P Practical",
}

@Article{Yam:1996:DPV,
  author =       "Michael Yam",
  title =        "{DCE} Pthreads versus {NT} Threads. {Michael} ports
                 {PTF}, a {C++} class library for {DCE} pthreads, from
                 {HP-UX System 9} to {Windows NT}. {In} doing so, he
                 examines the differences between pthreads and {NT}
                 threads, and describes the porting experience",
  journal =      j-DDJ,
  volume =       "21",
  number =       "12",
  pages =        "16--??",
  month =        dec,
  year =         "1996",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Mon Dec 2 07:52:21 MST 1996",
  bibsource =    "http://www.ddj.com/index/author/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Yoo:1996:CAA,
  author =       "H. Chuck Yoo",
  title =        "Comparative Analysis of Asynchronous {I/O} in
                 Multithreaded {UNIX}",
  journal =      j-SPE,
  volume =       "26",
  number =       "9",
  pages =        "987--997",
  month =        sep,
  year =         "1996",
  CODEN =        "SPEXBL",
  ISSN =         "0038-0644 (print), 1097-024X (electronic)",
  ISSN-L =       "0038-0644",
  bibdate =      "Thu Jul 29 15:11:03 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/spe.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=16832",
  acknowledgement = ack-nhfb,
  fjournal =     "Software --- Practice and Experience",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
}

@PhdThesis{Yoo:1996:PCM,
  author =       "Namhoon Yoo",
  title =        "Parallelism control in multithreaded multiprocessors",
  type =         "Thesis ({Ph.D.})",
  school =       "University of Southern California",
  address =      "Los Angeles, CA, USA",
  pages =        "x + 86",
  year =         "1996",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Computer architecture; Data flow computing;
                 Multiprocessors; Parallel processing (Electronic
                 computers)",
}

@Book{Zignin:1996:TDM,
  author =       "Bernard Zignin",
  title =        "Techniques du multithread: du parall{\'e}lisme dans
                 les processus {(French) [Multithreading techniques:
                 parallelism in processes]}",
  publisher =    pub-HERMES,
  address =      pub-HERMES:adr,
  pages =        "72",
  year =         "1996",
  ISBN =         "2-86601-562-2",
  ISBN-13 =      "978-2-86601-562-6",
  LCCN =         "????",
  bibdate =      "Wed Dec 09 23:36:26 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "CNAM. Synth{\`e}ses informatiques",
  acknowledgement = ack-nhfb,
  keywords =     "Parall{\'e}lisme (informatique)",
  language =     "French",
}

@Article{Anonymous:1997:NPW,
  author =       "Anonymous",
  title =        "New Products: {WebThreads 1.0.1; QUERYFLEX Report
                 Writer; Linux Pro Desktop 1.0; NDP Fortran for Linux;
                 Numerics and Visualization for Java; Craftworks
                 Linux/AXP 2.2; InfoDock Linux Software Development
                 Toolset; Caldera Wabi 2.2 for Linux}",
  journal =      j-LINUX-J,
  volume =       "34",
  pages =        "??--??",
  month =        feb,
  year =         "1997",
  CODEN =        "LIJOFX",
  ISSN =         "1075-3583 (print), 1938-3827 (electronic)",
  ISSN-L =       "1075-3583",
  bibdate =      "Fri Oct 9 08:35:26 MDT 1998",
  bibsource =    "http://noframes.linuxjournal.com/lj-issues/issue34/index.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Linux journal",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J508",
}

@Article{Anonymous:1997:TNR,
  author =       "Anonymous",
  title =        "Technology News \& Reviews: {Chemkin} software;
                 {OpenMP Fortran Standard}; {ODE} Toolbox for {Matlab};
                 {Java} products; {Scientific WorkPlace 3.0}",
  journal =      j-IEEE-COMPUT-SCI-ENG,
  volume =       "4",
  number =       "4",
  pages =        "75--??",
  month =        oct # "\slash " # dec,
  year =         "1997",
  CODEN =        "ISCEE4",
  ISSN =         "1070-9924 (print), 1558-190X (electronic)",
  ISSN-L =       "1070-9924",
  bibdate =      "Sat Jan 9 08:57:23 MST 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java.bib;
                 https://www.math.utah.edu/pub/tex/bib/matlab.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://dlib.computer.org/cs/books/cs1997/pdf/c4075.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computational Science \& Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=99",
  remark =       "Apparent duplicate of entry Bramley:1997:TNRb, which
                  identifies the author and the complete page range.",
}

@Article{Anonymous:1997:TWP,
  author =       "Anonymous",
  title =        "Tech Watch --- Pattern-recognition system. {Piecing}
                 together history. {3D} semiconductor simulation.
                 {Multi}-threaded architecture",
  journal =      j-CG-WORLD,
  volume =       "20",
  number =       "9",
  pages =        "15--??",
  month =        sep,
  year =         "1997",
  CODEN =        "CGWODH",
  ISSN =         "0271-4159",
  bibdate =      "Sat Nov 7 10:32:27 MST 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Graphics World",
}

@Article{Arvind:1997:MSC,
  author =       "Arvind and A. Caro and J.-W. Maessen and S. Aditya",
  title =        "A Multithreaded Substrate and Compilation Model for
                 the Implicitly Parallel Language {pH}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1239",
  pages =        "519--??",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Aug 22 11:59:49 MDT 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Bednorz:1997:CDA,
  author =       "M. Bednorz and A. Gwozdowski and K. Zieli{\'n}ski",
  title =        "Contextual debugging and analysis of multithreaded
                 applications",
  journal =      j-CPE,
  volume =       "9",
  number =       "2",
  pages =        "123--139",
  month =        feb,
  year =         "1997",
  CODEN =        "CPEXEI",
  ISSN =         "1040-3108",
  ISSN-L =       "1040-3108",
  bibdate =      "Tue Sep 7 06:06:28 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=13852;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=13852&PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency, practice and experience",
}

@Book{Beveridge:1997:MAW,
  author =       "Jim Beveridge and Robert Wiener",
  title =        "Multithreading applications in {Win32}: the complete
                 guide to threads",
  publisher =    pub-AWDP,
  address =      pub-AWDP:adr,
  pages =        "xviii + 368",
  year =         "1997",
  ISBN =         "0-201-44234-5 (pb) 0-201-18385-4 (CD-ROM)",
  ISBN-13 =      "978-0-201-44234-2 (pb) 978-0-201-18385-6 (CD-ROM)",
  LCCN =         "QA76.76.O63 B478 1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  annote =       "System requirements: IBM compatible PC; Win32; Windows
                 NT or Windows 95; CD-ROM drive.",
  keywords =     "Microsoft Win32; Microsoft Windows (Computer file);
                 Microsoft Windows NT; Operating systems (Computers)",
}

@Article{Bik:1997:JPJ,
  author =       "Aart J. C. Bik and Juan E. Villacis and Dennis B.
                 Gannon",
  title =        "javar: a prototype {Java} restructuring compiler",
  journal =      j-CPE,
  volume =       "9",
  number =       "11",
  pages =        "1181--1191",
  month =        nov,
  year =         "1997",
  CODEN =        "CPEXEI",
  ISSN =         "1040-3108",
  ISSN-L =       "1040-3108",
  bibdate =      "Tue Sep 7 06:06:35 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  note =         "Special Issue: Java for computational science and
                 engineering --- simulation and modeling II.",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=13819;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=13819&PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  classification = "C6110J (Object-oriented programming); C6110P
                 (Parallel programming); C6150C (Compilers, interpreters
                 and other processors)",
  conflocation = "Las Vegas, NV, USA; 21 June 1997",
  conftitle =    "Java for Computational Science and Engineering ---
                 Simulation and Modeling II",
  corpsource =   "Dept. of Comput. Sci., Indiana Univ., Bloomington, IN,
                 USA",
  fjournal =     "Concurrency, practice and experience",
  keywords =     "annotations; explicit parallelism; functionality;
                 implicit parallelism; Java program parallelization;
                 Java restructuring compiler; javar; multi-threading;
                 object-oriented languages; parallelising compilers;
                 prototype; semantic analysis; software prototyping",
  pubcountry =   "UK",
  sponsororg =   "ACM",
  treatment =    "P Practical",
}

@Article{Bordawekar:1997:EEH,
  author =       "Rajesh Bordawekar and Steven Landherr and Don Capps
                 and Mark Davis",
  title =        "Experimental evaluation of the {Hewlett--Packard}
                 {Exemplar} file system",
  journal =      j-SIGMETRICS,
  volume =       "25",
  number =       "3",
  pages =        "21--28",
  month =        dec,
  year =         "1997",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/270900.270904",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Thu Jun 26 11:24:50 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This article presents results from an experimental
                 evaluation study of the HP Exemplar file system. Our
                 experiments consist of simple micro-benchmarks that
                 study the impact of various factors on the file system
                 performance. These factors include I/O request/buffer
                 sizes, vectored/non-vectored access patterns,
                 read-ahead policies, multi-threaded (temporally
                 irregular) requests, and architectural issues (cache
                 parameters, NUMA behavior, etc.). Experimental results
                 indicate that the Exemplar file system provides high
                 I/O bandwidth, both for single- and multi-threaded
                 applications. The buffer cache, with prioritized buffer
                 management and large buffer sizes, is effective in
                 exploiting temporal and spatial access localities. The
                 performance of non-contiguous accesses can be improved
                 by either using vectored I/O interfaces or tuning the
                 read-ahead facilities. The file system performance
                 depends on the relative locations of the computing
                 threads and the file system, and also on various
                 Exemplar design parameters such as the NUMA
                 architecture, TLB/data cache management and paging
                 policies.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
}

@Article{Bramley:1997:TNRb,
  author =       "Randall Bramley",
  title =        "Technology News \& Reviews: {Chemkin} software;
                 {OpenMP Fortran Standard}; {ODE} Toolbox for {Matlab};
                 {Java} products; {Scientific WorkPlace 3.0}",
  journal =      j-IEEE-COMPUT-SCI-ENG,
  volume =       "4",
  number =       "4",
  pages =        "75--78",
  month =        oct # "\slash " # dec,
  year =         "1997",
  CODEN =        "ISCEE4",
  ISSN =         "1070-9924 (print), 1558-190X (electronic)",
  ISSN-L =       "1070-9924",
  bibdate =      "Sat Jan 9 08:57:23 MST 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputscieng.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://dlib.computer.org/cs/books/cs1997/pdf/c4075.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computational Science \& Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=99",
  remark =       "No DOI available: article missing from IEEE Xplore
                 database.",
}

@Book{Butenhof:1997:PPT,
  author =       "David R. Butenhof",
  title =        "Programming with {POSIX} threads",
  publisher =    pub-AW,
  address =      pub-AW:adr,
  pages =        "xviii + 381",
  year =         "1997",
  ISBN =         "0-201-63392-2",
  ISBN-13 =      "978-0-201-63392-4",
  LCCN =         "QA76.76.T55B88 1997",
  bibdate =      "Mon Sep 01 08:53:12 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  price =        "US\$31.95",
  URL =          "http://www.amazon.com/exec/obidos/ASIN/0201633922/ref=sim_books/002-4892305-5599452",
  acknowledgement = ack-nhfb,
}

@Article{Calcote:1997:TPS,
  author =       "John Calcote",
  title =        "Thread Pools and Server Performance",
  journal =      j-DDJ,
  volume =       "22",
  number =       "7",
  pages =        "60--??",
  month =        jul,
  year =         "1997",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Sat Jun 28 10:43:47 MDT 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Cenciarelli:1997:SMJ,
  author =       "P. Cenciarelli and A. Knapp and B. Reus and M.
                 Wirsing",
  title =        "From sequential to multi-threaded {Java}: An
                 event-based operational semantics",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1349",
  pages =        "75--??",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 28 08:51:33 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Cenciarelli:1997:SMT,
  author =       "P. Cenciarelli and A. Knapp and B. Reus and M.
                 Wirsing",
  title =        "From sequential to multi-threaded {Java}: An
                 event-based operational semantics",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1349",
  pages =        "75--??",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 28 08:51:33 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
  remark =       "Apparent duplicate of entry Cenciarelli:1997:SMJ.",
}

@InProceedings{Dou:1997:ISV,
  author =       "Yong Dou and Zhengbing Pang and Xingming Zhou",
  title =        "Implementing a software virtual shared memory on
                 {PVM}",
  crossref =     "IEEE:1997:APD",
  pages =        "??--??",
  year =         "1997",
  bibdate =      "Wed Apr 16 06:39:19 MDT 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6115 (Programming
                 support); C6120 (File organisation); C6140D (High level
                 languages); C7430 (Computer engineering)",
  corpsource =   "Dept. of Comput. Sci., Changsha Inst. of Technol.,
                 Hunan, China",
  keywords =     "distributed; FORTRAN; FORTRAN language; GKD-VSM;
                 memory environments; multithread scheme; parallel
                 programming; parallel programming model; Prefetch and
                 Poststore; programming environments; PVM; shared
                 memory; software overhead; software virtual shared
                 memory; synchronisation; user-level; virtual machines;
                 virtual storage",
  treatment =    "P Practical",
}

@Article{Eggers:1997:SMP,
  author =       "Susan J. Eggers and Joel S. Emer and Henry M. Levy and
                 Jack L. Lo and Rebecca L. Stamm and Dean M. Tullsen",
  title =        "Simultaneous Multithreading: a Platform for
                 Next-Generation Processors",
  journal =      j-IEEE-MICRO,
  volume =       "17",
  number =       "5",
  pages =        "12--19",
  month =        sep # "\slash " # oct,
  year =         "1997",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/40.621209",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Thu Dec 14 06:08:58 MST 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Science Citation Index database (1980--2000)",
  URL =          "http://dlib.computer.org/mi/books/mi1997/pdf/m5012.pdf;
                 http://www.computer.org/micro/mi1997/m5012abs.htm",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

@TechReport{Eickemeyer:1997:EMP,
  author =       "Richard J. Eickemeyer",
  title =        "Evaluation of multithreaded processors and
                 thread-switch policies",
  type =         "Research report",
  number =       "RC 20956 (92759)",
  institution =  "IBM T. J. Watson Research Center",
  address =      "Yorktown Heights, NY, USA",
  pages =        "16",
  day =          "18",
  month =        aug,
  year =         "1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper examines the use of coarse-grained
                 multithreading to lessen the negative impact of memory
                 access latencies on the performance of uniprocessor
                 on-line transaction processing systems. It considers
                 the effect of switching threads on cache misses in a
                 two-level cache system. It also examines several
                 different thread-switch policies. The results suggest
                  that multithreading with a small number (3--5) of active
                 threads can significantly improve the performance of
                 such commercial environments.",
  acknowledgement = ack-nhfb,
  keywords =     "Cache memory; Computer architecture; Threads (Computer
                 programs)",
}

@Article{Emerson:1997:USW,
  author =       "E. A. Emerson and A. P. Sistla",
  title =        "Utilizing Symmetry when Model-Checking under Fairness
                 Assumptions: An Automata-Theoretic Approach",
  journal =      j-TOPLAS,
  volume =       "19",
  number =       "4",
  pages =        "617--638",
  month =        jul,
  year =         "1997",
  CODEN =        "ATPSDT",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Wed Dec 3 16:28:05 MST 1997",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/toplas/1997-19-4/p617-emerson/",
  abstract =     "One useful technique for combating the state explosion
                 problem is to exploit symmetry when performing temporal
                 logic model checking. In previous work it is shown how,
                 using some basic notions of group theory, symmetry may
                 be exploited for the full range of correctness
                 properties expressible in the very expressive temporal
                 logic CTL*. Surprisingly, while fairness properties are
                 readily expressible in CTL*, these methods are not
                 powerful enough to admit any amelioration of state
                 explosion, when fairness assumptions are involved. We
                 show that it is nonetheless possible to handle fairness
                 efficiently by trading some group theory for automata
                 theory. Our automata-theoretic approach depends on
                 detecting fair paths subtly encoded in a quotient
                 structure whose arcs are annotated with permutations,
                 by using a threaded structure that reflects coordinate
                 shifts caused by the permutations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
  keywords =     "design; languages; theory; verification",
  subject =      "{\bf F.3.1} Theory of Computation, LOGICS AND MEANINGS
                 OF PROGRAMS, Specifying and Verifying and Reasoning
                 about Programs. {\bf F.1.1} Theory of Computation,
                 COMPUTATION BY ABSTRACT DEVICES, Models of Computation.
                 {\bf D.2.4} Software, SOFTWARE ENGINEERING,
                 Software/Program Verification.",
}

@Article{Fillo:1997:MMM,
  author =       "Marco Fillo and Stephen W. Keckler and William J.
                 Dally and Nicholas P. Carter and Andrew Chang and
                 Yevgeny Gurevich and Whay S. Lee",
  title =        "The {M}-Machine Multicomputer",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "25",
  number =       "3",
  pages =        "183--212",
  month =        jun,
  year =         "1997",
  CODEN =        "IJPPE5",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Tue Apr 7 18:25:25 MDT 1998",
  bibsource =    "Compendex database;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Massachusetts Inst of Technology",
  affiliationaddress = "Cambridge, MA, USA",
  classification = "714.2; 722; 722.1; 722.4; 723; 723.1",
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  journalabr =   "Int J Parallel Program",
  keywords =     "Buffer storage; Computer architecture; Data storage
                 equipment; Microprocessor chips; Multiprogramming;
                 Multithread processors; On chip cache; Parallel
                 processing systems; Synchronization; Thread level
                 parallelism; User interfaces",
}

@MastersThesis{Fisher:1997:SPS,
  author =       "Michael T. Fisher",
  title =        "A study of the performance of simultaneous
                 multithreading on a superscalar processor",
  type =         "Thesis ({M.S.E.E.})",
  number =       "2363",
  school =       "State University of New York at Binghamton, Watson
                 School of Engineering and Applied Science",
  address =      "Binghamton, NY, USA",
  pages =        "vi + 98",
  year =         "1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "Master's theses / State University of New York at
                 Binghamton",
  acknowledgement = ack-nhfb,
  alttitle =     "Simultaneous multithreading on a superscalar processor
                 Multithreading on a superscalar processor Superscalar
                 processor",
  keywords =     "Microprocessors -- Testing",
}

@MastersThesis{Fong:1997:BPM,
  author =       "Waipang Fong",
  title =        "Building a preprocessor for a multithreading
                 compiler",
  type =         "Thesis ({M.E.E.})",
  school =       "Department of Electrical Engineering, University of
                 Alabama",
  address =      "Tuscaloosa, AL, USA",
  pages =        "ix + 80",
  year =         "1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Multiprocessors; Parallel processing (Electronic
                 computers)",
}

@Article{Forsell:1997:MMV,
  author =       "M. Forsell",
  title =        "{MTAC} --- a Multithreaded {VLIW} Architecture for
                 {PRAM} Simulation",
  journal =      j-J-UCS,
  volume =       "3",
  number =       "9",
  pages =        "1037--1055",
  day =          "28",
  month =        sep,
  year =         "1997",
  CODEN =        "????",
  ISSN =         "0948-695X (print), 0948-6968 (electronic)",
  ISSN-L =       "0948-6968",
  bibdate =      "Wed Mar 4 15:32:49 MST 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://medoc.springer.de:8000/jucs/jucs_3_9/mtac_a_multithreaded_vliw",
  acknowledgement = ack-nhfb,
  fjournal =     "J.UCS: Journal of Universal Computer Science",
  journal-URL =  "http://www.jucs.org/jucs",
}

@Article{Foster:1997:MMC,
  author =       "Ian Foster and Jonathan Geisler and Carl Kesselman and
                 Steven Tuecke",
  title =        "Managing Multiple Communication Methods in
                 High-Performance Networked Computing Systems",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "40",
  number =       "1",
  pages =        "35--48",
  day =          "10",
  month =        jan,
  year =         "1997",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1996.1266",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:01 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production/pdf;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production/ref",
  acknowledgement = ack-nhfb,
  classification = "B6150M (Protocols); B6210L (Computer
                 communications); C5440 (Multiprocessing systems); C5470
                 (Performance evaluation and testing); C5640
                 (Protocols); C5670 (Network performance)",
  corpsource =   "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
                 USA",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "Argonne MPICH library; computer networks; computing
                 systems; criteria; heterogeneous networked environment;
                 high-performance networked; message passing; message
                 passing interface; multimethod communication; multiple
                 communication methods; multithreaded runtime system;
                 networked computing environments; Nexus; Nexus-based
                 MPI implementation; performance characteristics;
                 performance evaluation; protocols; remote service
                 request mechanisms; transport mechanisms;
                 user-specified selection",
  treatment =    "P Practical",
}

@TechReport{Fujita:1997:MPA,
  author =       "Tetsuya Theodore Fujita",
  title =        "A multithreaded processor architecture for parallel
                 symbolic computation",
  type =         "Technical Report",
  number =       "MIT/LCS/TM-338",
  institution =  "Laboratory for Computer Science, Massachusetts
                 Institute of Technology",
  address =      "Cambridge, MA, USA",
  pages =        "71",
  month =        sep,
  year =         "1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Computer architecture; Multilisp (Computer program
                 language); Parallel processing (Electronic computers)",
}

@PhdThesis{Goldstein:1997:LTC,
  author =       "Seth Copen Goldstein",
  title =        "Lazy threads: compiler and runtime structures for
                 fine-grained parallel programming",
  type =         "Thesis ({Ph.D.})",
  number =       "UCB/CSD-97-975",
  school =       "Computer Science Division, University of California,
                 Berkeley",
  address =      "Berkeley, CA, USA",
  pages =        "xi + 174",
  year =         "1997",
  LCCN =         "TK7885.A1 R46 no.97:975",
  bibdate =      "Fri May 10 12:18:17 MDT 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "Report",
  acknowledgement = ack-nhfb,
}

@Article{Gomez:1997:EMU,
  author =       "Juan Carlos Gomez and Vernon Rego and V. S. Sunderam",
  title =        "Efficient Multithreaded User-Space Transport for
                 Network Computing: Design and Test of the {TRAP}
                 Protocol",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "40",
  number =       "1",
  pages =        "103--117",
  day =          "10",
  month =        jan,
  year =         "1997",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1996.1269",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:01 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1269/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1269/production/pdf;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1269/production/ref",
  acknowledgement = ack-nhfb,
  classification = "B6150M (Protocols); B6210L (Computer
                 communications); C5620 (Computer networks and
                 techniques); C5640 (Protocols); C6150G (Diagnostic,
                 testing, debugging and evaluating systems); C6150N
                 (Distributed systems software)",
  corpsource =   "Dept. of Comput. Sci., Purdue Univ., West Lafayette,
                 IN, USA",
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
  keywords =     "communicating; communication; computer networks;
                 computing; computing nodes; efficient multithreaded
                 user-space transport; high-; low-latency; message
                 passing; multithreaded message-passing libraries;
                 network; nodes; performance distributed computing
                 applications; processing; runtime performance;
                 scalability characteristics; software libraries;
                 software performance evaluation; testing; transaction;
                 transaction-oriented protocol; transport protocols;
                 TRAP protocol design; TRAP protocol testing; TRAP-based
                 communication library; user-space protocol",
  treatment =    "P Practical",
}

@Article{Goossens:1997:MVC,
  author =       "B. Goossens",
  title =        "A Multithreaded Vector Co-processor",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1277",
  pages =        "311--??",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 28 08:51:33 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Gorton:1997:GEI,
  author =       "Ian Gorton and Innes E. Jelly",
  title =        "{Guest Editors'} Introduction: Software Engineering for
                 Parallel and Distributed Systems: Challenges and
                 Opportunities",
  journal =      j-IEEE-CONCURR,
  volume =       "5",
  number =       "3",
  pages =        "12--15",
  month =        jul # "\slash " # sep,
  year =         "1997",
  CODEN =        "IECMFX",
  ISSN =         "1092-3063 (print), 1558-0849 (electronic)",
  ISSN-L =       "1092-3063",
  bibdate =      "Tue Jan 16 06:04:48 MST 2001",
  bibsource =    "Compendex database;
                 https://www.math.utah.edu/pub/tex/bib/ieeeconcurrency.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://dlib.computer.org/pd/books/pd1997/pdf/p3012.pdf",
  acknowledgement = ack-nhfb,
  affiliation =  "Commonwealth Science and Industrial Research
                 Organization",
  affiliationaddress = "Aust",
  classification = "722; 722.4; 723; 723.1; 723.3",
  fjournal =     "IEEE Concurrency",
  journalabr =   "IEEE Concurrency",
  keywords =     "Computer workstations; Concurrency control; Fault
                 tolerant computer systems; High performance computing;
                 Multithreaded servers; Parallel processing systems;
                 Program debugging; Program diagnostics; Software
                 engineering; World wide web",
}

@Article{Gunther:1997:MDF,
  author =       "B. K. Gunther",
  title =        "Multithreading with distributed functional units",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "46",
  number =       "4",
  pages =        "399--411",
  month =        apr,
  year =         "1997",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/12.588034",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Wed Jul 6 10:06:22 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=588034",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Manual{Haines:1997:DLT,
  author =       "Matthew Haines",
  title =        "On designing lightweight threads for substrate
                 software",
  number =       "201645",
  publisher =    pub-NTIS,
  address =      pub-NTIS:adr,
  pages =        "??",
  year =         "1997",
  LCCN =         "DOC NAS 1.26:201645 mf11",
  bibdate =      "Fri May 10 12:18:17 MDT 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Shipping list number 98-0847-M.",
  series =       "NASA contractor report",
  acknowledgement = ack-nhfb,
  keywords =     "operating systems (computers); parallel computers;
                 parallel processing (computers); threads",
}

@Article{Haines:1997:DPP,
  author =       "Matthew Haines and Piyush Mehrotra and David Cronk",
  title =        "Data-parallel programming in a multithreaded
                 environment",
  journal =      j-SCI-PROG,
  volume =       "6",
  number =       "2",
  pages =        "187--200",
  month =        "Summer",
  year =         "1997",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Thu Mar 28 12:27:27 MST 2002",
  bibsource =    "Compendex database;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@Article{Haines:1997:OIA,
  author =       "Matthew Haines",
  title =        "An Open Implementation Analysis and Design for
                 Lightweight Threads",
  journal =      j-SIGPLAN,
  volume =       "32",
  number =       "10",
  pages =        "229--242",
  month =        oct,
  year =         "1997",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:17:39 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Book{Hanson:1997:CII,
  author =       "David R. Hanson",
  title =        "{C} Interfaces and Implementations: Techniques for
                 Creating Reusable Software",
  publisher =    pub-AW,
  address =      pub-AW:adr,
  pages =        "xvii + 519",
  year =         "1997",
  ISBN =         "0-201-49841-3",
  ISBN-13 =      "978-0-201-49841-7",
  LCCN =         "QA76.73.C15H37 1997",
  bibdate =      "Fri Feb 27 16:08:11 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  price =        "US\$37.95",
  series =       "Addison-Wesley Professional Computing Series",
  URL =          "http://www.cs.princeton.edu/software/cii/",
  acknowledgement = ack-nhfb,
  annote =       "Multithreading is discussed in Chapter 20.",
}

@Article{Hendren:1997:CCE,
  author =       "Laurie J. Hendren and Xinan Tang and Yingchun Zhu and
                 Shereen Ghobrial and Guang R. Gao and Xun Xue and
                 Haiying Cai and Pierre Ouellet",
  title =        "Compiling {C} for the {EARTH} Multithreaded
                 Architecture",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "25",
  number =       "4",
  pages =        "305--338",
  month =        aug,
  year =         "1997",
  CODEN =        "IJPPE5",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Tue Apr 7 18:25:25 MDT 1998",
  bibsource =    "Compendex database;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "McGill Univ",
  affiliationaddress = "Montreal, Que, Can",
  classification = "722; 722.4; 723; 723.1; 723.1.1; 723.2",
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  journalabr =   "Int J Parallel Program",
  keywords =     "C (programming language); Codes (symbols); Computer
                 architecture; earth C programming language;
                 Multithreaded architecture; Parallel processing
                 systems; Program compilers; Program translators",
}

@Article{Hightower:1997:PDD,
  author =       "Lauren Hightower",
  title =        "Publishing Dynamic Data on the {Internet} ---
                 {Allaire's Cold Fusion} is a development tool that
                 provides access (via the {Web}) to any database the
                 {Web} server can access using {ODBC}. {Cold Fusion}
                 runs as a multithreaded {Windows NT} system service and
                 works with any {ODBC-compliant} database",
  journal =      j-DDJ,
  volume =       "22",
  number =       "1",
  pages =        "70--??",
  month =        jan,
  year =         "1997",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Fri Jan 3 06:17:24 MST 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Book{Hughes:1997:OOM,
  author =       "Cameron Hughes and Tracey Hughes",
  title =        "Object-oriented multithreading using {C++}",
  publisher =    pub-WILEY,
  address =      pub-WILEY:adr,
  pages =        "xvi + 495",
  year =         "1997",
  ISBN =         "0-471-18012-2 (paperback)",
  ISBN-13 =      "978-0-471-18012-8 (paperback)",
  LCCN =         "QA76.73.C153H84 1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  annote =       "System requirements: Windows 95, or OS/2 2.0 and
                 above, or UNIX, or system with POSIX pthreads; ANSI/ISO
                 compliant C++ compiler.",
  keywords =     "C++ (Computer program language); POSIX (Computer
                 software standard); Threads (Computer programs)",
}

@Article{Kacsuk:1997:MIC,
  author =       "P. Kacsuk and M. Amamiya",
  title =        "A Multithreaded Implementation Concept of {Prolog} on
                 {Datarol-II} Machine",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1336",
  pages =        "91--??",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 28 08:51:33 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Kasperink:1997:CDC,
  author =       "Harold R. Kasperink and John C. Dekker",
  title =        "Concurrent Database Commands and {C++}",
  journal =      j-DDJ,
  volume =       "22",
  number =       "8",
  pages =        "84, 86, 88, 89, 98",
  month =        aug,
  year =         "1997",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Sat Aug 23 07:57:02 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Mapping design problems to programming problems leads
                 to software solutions that are easy to extend and
                 reuse. Our authors explain how they resolved
                 multithreaded porting problems using design patterns.
                 The database they use is Oracle and the database
                  transactions are implemented using Oracle Pro*C as an
                 embedded database command language.",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@MastersThesis{Khosla:1997:MAT,
  author =       "Samir Khosla",
  title =        "Multithreading the asynchronous trigger processor",
  type =         "Thesis ({M.S.})",
  school =       "University of Florida",
  address =      "Gainesville, FL, USA",
  pages =        "ix + 57",
  year =         "1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Kougiouris:1997:PMF,
  author =       "Panos Kougiouris and Marco Framba",
  title =        "A Portable Multithreading Framework",
  journal =      j-CCCUJ,
  volume =       "15",
  number =       "8",
  pages =        "??--??",
  month =        aug,
  year =         "1997",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Wed Aug 20 10:44:42 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Krieger:1997:HPO,
  author =       "Orran Krieger and Michael Stumm",
  title =        "{HFS}: a Performance-Oriented Flexible File System
                 Based on Building-Block Compositions",
  journal =      j-TOCS,
  volume =       "15",
  number =       "3",
  pages =        "286--321",
  month =        aug,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-3/p286-krieger/",
  abstract =     "The Hurricane File System (HFS) is designed for
                 (potentially large-scale) shared-memory
                 multiprocessors. Its architecture is based on the
                 principle that, in order to maximize performance for
                 applications with diverse requirements, a file system
                 must support a wide variety of file structures, file
                 system policies, and I/O interfaces. Files in HFS are
                 implemented using simple building blocks composed in
                 potentially complex ways. This approach yields great
                 flexibility, allowing an application to customize the
                 structure and policies of a file to exactly meet its
                 requirements. As an extreme example, HFS allows a
                 file's structure to be optimized for concurrent
                 random-access write-only operations by 10 threads,
                 something no other file system can do. Similarly, the
                 prefetching, locking, and file cache management
                 policies can all be chosen to match an application's
                 access pattern. In contrast, most parallel file systems
                 support a single file structure and a small set of
                 policies. We have implemented HFS as part of the
                 Hurricane operating system running on the Hector
                 shared-memory multiprocessor. We demonstrate that the
                 flexibility of HFS comes with little processing or I/O
                 overhead. We also show that for a number of file access
                 patterns, HFS is able to deliver to the applications
                 the full I/O bandwidth of the disks on our system.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; performance",
  subject =      "{\bf D.4.3} Software, OPERATING SYSTEMS, File Systems
                 Management, File organization. {\bf D.4.3} Software,
                 OPERATING SYSTEMS, File Systems Management, Access
                 methods. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance, Measurements. {\bf E.5} Data, FILES,
                 Optimization**. {\bf E.5} Data, FILES,
                 Organization/structure.",
}

@Article{Kwak:1997:VMN,
  author =       "H. Kwak and B. Lee and A. R. Hurson",
  title =        "Viability of Multithreading on Networks of
                 Workstations",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1277",
  pages =        "216--??",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 28 08:51:33 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@PhdThesis{Lang:1997:MTE,
  author =       "Duncan Walter Temple Lang",
  title =        "A multi-threaded extension to a high level interactive
                 statistical computing environment",
  type =         "Thesis ({Ph.D. in Statistics})",
  school =       "University of California, Berkeley",
  address =      "Berkeley, CA, USA",
  pages =        "vii + 161",
  month =        dec,
  year =         "1997",
  LCCN =         "308t 1997 951",
  bibdate =      "Fri Aug 7 08:29:38 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Dissertations -- Academic -- UCB -- statistics --
                 1991--2000; University of California, Berkeley. Dept.
                 of Statistics -- Dissertations",
}

@Article{Larbi:1997:BRM,
  author =       "Michael Larbi",
  title =        "Book Review: {Multithreading Applications in Win32}",
  journal =      j-CCCUJ,
  volume =       "15",
  number =       "7",
  pages =        "65--??",
  month =        jul,
  year =         "1997",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Thu Jun 26 14:12:46 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Leiserson:1997:AAM,
  author =       "C. E. Leiserson",
  title =        "Algorithmic analysis of multithreaded algorithms",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1350",
  pages =        "132--??",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 28 08:51:33 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@MastersThesis{Leven:1997:MIR,
  author =       "Peter J. Leven",
  title =        "A multithreaded implementation of a {Robot Control C
                 Library}",
  type =         "Thesis ({M.S.})",
  school =       "University of Illinois at Urbana-Champaign",
  address =      "Urbana-Champaign, IL, USA",
  pages =        "x + 72",
  year =         "1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Lo:1997:CTL,
  author =       "Jack L. Lo and Joel S. Emer and Henry M. Levy and
                 Rebecca L. Stamm and Dean M. Tullsen",
  title =        "Converting Thread-Level Parallelism to
                 Instruction-Level Parallelism via Simultaneous
                 Multithreading",
  journal =      j-TOCS,
  volume =       "15",
  number =       "3",
  pages =        "322--354",
  month =        aug,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-3/p322-lo/",
  abstract =     "To achieve high performance, contemporary computer
                 systems rely on two forms of parallelism:
                 instruction-level parallelism (ILP) and thread-level
                 parallelism (TLP). Wide-issue super-scalar processors
                 exploit ILP by executing multiple instructions from a
                 single program in a single cycle. Multiprocessors (MP)
                 exploit TLP by executing different threads in parallel
                 on different processors. Unfortunately, both parallel
                 processing styles statically partition processor
                 resources, thus preventing them from adapting to
                 dynamically changing levels of ILP and TLP in a
                 program. With insufficient TLP, processors in an MP
                 will be idle; with insufficient ILP, multiple-issue
                 hardware on a superscalar is wasted. This article
                 explores parallel processing on an alternative
                 architecture, simultaneous multithreading (SMT), which
                 allows multiple threads to compete for and share all
                 of the processor's resources every cycle. The most
                 compelling reason for running parallel applications on
                 an SMT processor is its ability to use thread-level
                 parallelism and instruction-level parallelism
                 interchangeably. By permitting multiple threads to
                 share the processor's functional units simultaneously,
                 the processor can use both ILP and TLP to accommodate
                 variations in parallelism. When a program has only a
                 single thread, all of the SMT processor's resources can
                 be dedicated to that thread; when more TLP exists, this
                 parallelism can compensate for a lack of per-thread
                 ILP. We examine two alternative on-chip parallel
                 architectures for the next generation of processors. We
                 compare SMT and small-scale, on-chip multiprocessors in
                 their ability to exploit both ILP and TLP. First, we
                 identify the hardware bottlenecks that prevent
                 multiprocessors from effectively exploiting ILP. Then,
                 we show that because of its dynamic resource sharing,
                 SMT avoids these inefficiencies and benefits from being
                 able to run more threads on a single processor. The use
                 of TLP is especially advantageous when per-thread ILP
                 is limited. The ease of adding additional thread
                 contexts on an SMT (relative to adding additional
                 processors on an MP) allows simultaneous multithreading
                 to expose more parallelism, further increasing
                 functional unit utilization and attaining a 52\%
                 average speedup (versus a four-processor, single-chip
                 multiprocessor with comparable execution resources).
                 This study also addresses an often-cited concern
                 regarding the use of thread-level parallelism or
                 multithreading: interference in the memory system and
                 branch prediction hardware. We find the multiple
                 threads cause interthread interference in the caches
                 and place greater demands on the memory system, thus
                 increasing average memory latencies. By exploiting
                 thread-level parallelism, however, SMT hides these
                 additional latencies, so that they only have a small
                 impact on total program performance. We also find that
                 for parallel applications, the additional threads have
                 minimal effects on branch prediction.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "measurement; performance",
  subject =      "{\bf C.1.2} Computer Systems Organization, PROCESSOR
                 ARCHITECTURES, Multiple Data Stream Architectures
                 (Multiprocessors), Parallel processors**. {\bf C.0}
                 Computer Systems Organization, GENERAL, Instruction set
                 design. {\bf D.4.1} Software, OPERATING SYSTEMS,
                 Process Management.",
}

%%% NOTE: The following entry is a duplicate of Lo:1997:CTL above
%%% (identical bibliographic data under a second citation key); it is
%%% retained because either key may already be cited elsewhere.
@Article{Lo:1997:CTP,
  author =       "Jack L. Lo and Joel S. Emer and Henry M. Levy and
                 Rebecca L. Stamm and Dean M. Tullsen",
  title =        "Converting Thread-Level Parallelism to
                 Instruction-Level Parallelism via Simultaneous
                 Multithreading",
  journal =      j-TOCS,
  volume =       "15",
  number =       "3",
  pages =        "322--354",
  month =        aug,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-3/p322-lo/",
  abstract =     "To achieve high performance, contemporary computer
                 systems rely on two forms of parallelism:
                 instruction-level parallelism (ILP) and thread-level
                 parallelism (TLP). Wide-issue super-scalar processors
                 exploit ILP by executing multiple instructions from a
                 single program in a single cycle. Multiprocessors (MP)
                 exploit TLP by executing different threads in parallel
                 on different processors. Unfortunately, both parallel
                 processing styles statically partition processor
                 resources, thus preventing them from adapting to
                 dynamically changing levels of ILP and TLP in a
                 program. With insufficient TLP, processors in an MP
                 will be idle; with insufficient ILP, multiple-issue
                 hardware on a superscalar is wasted. This article
                 explores parallel processing on an alternative
                 architecture, simultaneous multithreading (SMT), which
                 allows multiple threads to compete for and share all
                 of the processor's resources every cycle. The most
                 compelling reason for running parallel applications on
                 an SMT processor is its ability to use thread-level
                 parallelism and instruction-level parallelism
                 interchangeably. By permitting multiple threads to
                 share the processor's functional units simultaneously,
                 the processor can use both ILP and TLP to accommodate
                 variations in parallelism. When a program has only a
                 single thread, all of the SMT processor's resources can
                 be dedicated to that thread; when more TLP exists, this
                 parallelism can compensate for a lack of per-thread
                 ILP. We examine two alternative on-chip parallel
                 architectures for the next generation of processors. We
                 compare SMT and small-scale, on-chip multiprocessors in
                 their ability to exploit both ILP and TLP. First, we
                 identify the hardware bottlenecks that prevent
                 multiprocessors from effectively exploiting ILP. Then,
                 we show that because of its dynamic resource sharing,
                 SMT avoids these inefficiencies and benefits from being
                 able to run more threads on a single processor. The use
                 of TLP is especially advantageous when per-thread ILP
                 is limited. The ease of adding additional thread
                 contexts on an SMT (relative to adding additional
                 processors on an MP) allows simultaneous multithreading
                 to expose more parallelism, further increasing
                 functional unit utilization and attaining a 52\%
                 average speedup (versus a four-processor, single-chip
                 multiprocessor with comparable execution resources).
                 This study also addresses an often-cited concern
                 regarding the use of thread-level parallelism or
                 multithreading: interference in the memory system and
                 branch prediction hardware. We find the multiple
                 threads cause interthread interference in the caches
                 and place greater demands on the memory system, thus
                 increasing average memory latencies. By exploiting
                 thread-level parallelism, however, SMT hides these
                 additional latencies, so that they only have a small
                 impact on total program performance. We also find that
                 for parallel applications, the additional threads have
                 minimal effects on branch prediction.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "measurement; performance",
  subject =      "{\bf C.1.2} Computer Systems Organization, PROCESSOR
                 ARCHITECTURES, Multiple Data Stream Architectures
                 (Multiprocessors), Parallel processors**. {\bf C.0}
                 Computer Systems Organization, GENERAL, Instruction set
                 design. {\bf D.4.1} Software, OPERATING SYSTEMS,
                 Process Management.",
}

@TechReport{LoCocero:1997:MML,
  author =       "Joseph LoCocero and D. E. (Donald E.) Thomas",
  title =        "A multithreaded, multiple language hardware\slash
                 software cosimulator",
  type =         "Research report",
  number =       "CMUCAD-97-13",
  institution =  "Center for Electronic Design Automation, Carnegie
                 Mellon University",
  address =      "Pittsburgh, PA, USA",
  pages =        "7",
  month =        apr,
  year =         "1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Functional verification of mixed hardware/software
                 systems is vital to guaranteeing a correct, operational
                 system. This paper discusses a new multithreaded,
                 multiple-language cosimulator that directly combines
                 Verilog and C/C++, the native languages most often used
                 by hardware and software designers. The interface
                 between the two languages is specified in detail, as
                 are some illustrative examples. The performance is
                 shown to be clearly better than UNIX socket-based
                 cosimulation approaches. Further, it naturally fits a
                 cosimulation environment where arbitrary C++ programs
                 and Verilog descriptions are developed concurrently.",
  acknowledgement = ack-nhfb,
  annote =       "Supported in part by Semiconductor Research
                 Corporation.",
  keywords =     "C (Computer program language); Embedded computer
                 systems -- Simulation methods; Verilog (Computer
                 hardware description language)",
}

@Article{Loeffler:1997:MJF,
  author =       "G. Loeffler",
  title =        "A Multithreaded {Java} Framework for Solving Linear
                 Elliptic Partial Differential Equations in {3D}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1343",
  pages =        "121--??",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 28 08:51:33 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Lundberg:1997:BMC,
  author =       "L. Lundberg",
  title =        "Bounding the Minimal Completion Time of Static
                 Mappings of Multithreaded {Solaris} Programs",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1300",
  pages =        "1034--??",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 28 08:51:33 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Mateosian:1997:MNT,
  author =       "R. M. Mateosian",
  title =        "Micro News: {DARPA} aids {Tera MTA}",
  journal =      j-IEEE-MICRO,
  volume =       "17",
  number =       "5",
  pages =        "5--6",
  month =        sep # "\slash " # oct,
  year =         "1997",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.1997.621216",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Thu Dec 14 06:08:58 MST 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Science Citation Index database (1980--2000)",
  URL =          "http://dlib.computer.org/mi/books/mi1997/pdf/m5005.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

@Article{McCarthy:1997:MTI,
  author =       "Martin McCarthy",
  title =        "Multi-Threading: Intermediate Concepts",
  journal =      j-LINUX-J,
  volume =       "36",
  pages =        "??--??",
  month =        apr,
  year =         "1997",
  CODEN =        "LIJOFX",
  ISSN =         "1075-3583 (print), 1938-3827 (electronic)",
  ISSN-L =       "1075-3583",
  bibdate =      "Fri Oct 9 08:35:26 MDT 1998",
  bibsource =    "http://noframes.linuxjournal.com/lj-issues/issue36/index.html;
                 https://www.math.utah.edu/pub/tex/bib/linux-journal.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "ftp://ftp.ssc.com/pub/lj/listings/issue36/2121.tgz",
  abstract =     "This second part of a series on Multi-threading deals
                 with how to use C programs with one of the POSIX
                 packages available for Linux to handle signals and
                 concurrent threads in global data.",
  acknowledgement = ack-nhfb,
  fjournal =     "Linux Journal",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J508",
}

@Article{McCarthy:1997:WMT,
  author =       "Martin McCarthy",
  title =        "What is Multi-Threading?",
  journal =      j-LINUX-J,
  volume =       "34",
  pages =        "??--??",
  month =        feb,
  year =         "1997",
  CODEN =        "LIJOFX",
  ISSN =         "1075-3583 (print), 1938-3827 (electronic)",
  ISSN-L =       "1075-3583",
  bibdate =      "Fri Oct 9 08:35:26 MDT 1998",
  bibsource =    "http://noframes.linuxjournal.com/lj-issues/issue34/index.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "A primer on multi-threading: the process whereby Linux
                 manages several tasks simultaneously.",
  acknowledgement = ack-nhfb,
  fjournal =     "Linux Journal",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J508",
}

@Article{McMillan:1997:NSB,
  author =       "Robert McMillan",
  title =        "News: {Sun} boosts {Java} performance, adding {JIT}
                 compiler and {JVM} with multithreading to {Solaris
                 2.6}",
  journal =      j-JAVAWORLD,
  volume =       "2",
  number =       "7",
  pages =        "??--??",
  month =        jul,
  year =         "1997",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 14:52:27 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-07-1997/jw-07-speedway.htm",
  acknowledgement = ack-nhfb,
}

@Article{Moreno:1997:PMP,
  author =       "E. D. Moreno and S. T. Kofuji and M. H. Cintra",
  title =        "Prefetching and Multithreading Performance in
                 Bus-Based Multiprocessors with {Petri} Nets",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1300",
  pages =        "1017--??",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 28 08:51:33 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Neves:1997:TRS,
  author =       "Richard Neves and Robert B. Schnabel",
  title =        "Threaded Runtime Support for Execution of Fine Grain
                 Parallel Code on Coarse Grain Multiprocessors",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "42",
  number =       "2",
  pages =        "128--142",
  day =          "1",
  month =        may,
  year =         "1997",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1997.1322",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:02 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1322/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1322/production/pdf;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1322/production/ref",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Book{Oaks:1997:JT,
  author =       "Scott Oaks and Henry Wong",
  title =        "{Java} threads",
  publisher =    pub-ORA,
  address =      pub-ORA:adr,
  pages =        "xiii + 252",
  year =         "1997",
  ISBN =         "1-56592-216-6",
  ISBN-13 =      "978-1-56592-216-7",
  LCCN =         "QA76.73.J38 O25 1997",
  bibdate =      "Fri May 10 12:18:17 MDT 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "Java series",
  acknowledgement = ack-nhfb,
  keywords =     "java (computer program language); threads (computer
                 programs)",
}

@MastersThesis{Ongwattanakul:1997:RDM,
  author =       "Songpol Ongwattanakul",
  title =        "A runtime distributed multithreading library for the
                 {PARC} language",
  type =         "Thesis ({M.E.E.})",
  school =       "Department of Electrical Engineering, University of
                 Alabama",
  address =      "Tuscaloosa, AL, USA",
  pages =        "viii + 71",
  year =         "1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Multiprocessors; Parallel processing (Electronic
                 computers)",
}

@Article{Onion:1997:MM,
  author =       "F. Onion",
  title =        "Multithreading in {MFC}",
  journal =      j-C-PLUS-PLUS-REPORT,
  volume =       "9",
  number =       "3",
  pages =        "50--53, 56",
  month =        mar,
  year =         "1997",
  CODEN =        "CRPTE7",
  ISSN =         "1040-6042",
  bibdate =      "Thu Apr 24 09:46:14 MDT 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110J (Object-oriented programming); C6115
                 (Programming support); C6150J (Operating systems)",
  fjournal =     "C++ Report",
  keywords =     "API calls; application program interfaces; Internet
                 queries; MFC; multiprogramming; multithreaded
                 programming; object oriented programming;
                 object-oriented programming; remote database hits;
                 software libraries; software tools; threads; user
                 interface; user interfaces; Windows",
  treatment =    "P Practical",
}

@Article{Park:1997:HPM,
  author =       "Sung-Yong Park and Salim Hariri",
  title =        "A High Performance Message Passing System for {Network
                 of Workstations}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "11",
  number =       "2",
  pages =        "159--180",
  month =        oct,
  year =         "1997",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1023/A:1007912007767",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Jul 6 12:13:07 MDT 2005",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=11&issue=2;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.wkap.nl/issuetoc.htm/0920-8542+11+2+1997",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=11&issue=2&spage=159;
                 http://www.wkap.nl/oasis.htm/149826",
  acknowledgement = ack-nhfb,
  classification = "C5620W (Other computer networks); C6150N
                 (Distributed systems software)",
  corpsource =   "Dept. of Electr. and Comput. Eng., Syracuse Univ., NY,
                 USA",
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
  keywords =     "application programming interface; asynchronous
                 transfer mode; ATM; ATM network; device driver;
                 distributed computing; high performance; message
                 passing; message-passing system; multithreaded
                 message-passing system; NCS; network of workstations;
                 NOW environment; NYNET; wide area network; wide area
                 networks",
  pubcountry =   "Netherlands",
  treatment =    "P Practical",
}

@Book{Prasad:1997:MPT,
  author =       "Shashi Prasad",
  title =        "Multithreading programming techniques",
  publisher =    pub-MCGRAW-HILL,
  address =      pub-MCGRAW-HILL:adr,
  pages =        "xix + 410",
  year =         "1997",
  ISBN =         "0-07-912250-7, 0-07-050710-4 (Computer disk)",
  ISBN-13 =      "978-0-07-912250-6, 978-0-07-050710-4 (Computer disk)",
  LCCN =         "QA76.76.D47 P72 1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "The J. Ranade workstation series",
  acknowledgement = ack-nhfb,
  annote =       "System requirements: C programming language.",
  keywords =     "Application software -- Development; C (Computer
                 program language); Cross-platform software
                 development",
}

@Article{Ravoor:1997:MTP,
  author =       "Suresh B. Ravoor and Johnny S. K. Wong",
  title =        "Multithreaded Transaction Processing in Distributed
                 Systems",
  journal =      j-J-SYST-SOFTW,
  volume =       "38",
  number =       "2",
  pages =        "107--117",
  month =        aug,
  year =         "1997",
  CODEN =        "JSSODM",
  ISSN =         "0164-1212 (print), 1873-1228 (electronic)",
  ISSN-L =       "0164-1212",
  bibdate =      "Wed Dec 16 08:24:49 MST 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of systems and software",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01641212",
}

@Article{Savage:1997:EDD,
  author =       "Stefan Savage and Michael Burrows and Greg Nelson and
                 Patrick Sobalvarro and Thomas Anderson",
  title =        "{Eraser}: a Dynamic Data Race Detector for
                 Multithreaded Programs",
  journal =      j-TOCS,
  volume =       "15",
  number =       "4",
  pages =        "391--411",
  month =        nov,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Co-published in {\em Operating Systems Review}, {\bf
                 31}(5).",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-4/p391-savage/",
  abstract =     "Multithreaded programming is difficult and error
                 prone. It is easy to make a mistake in synchronization
                 that produces a data race, yet it can be extremely hard
                 to locate this mistake during debugging. This article
                 describes a new tool, called Eraser, for dynamically
                 detecting data races in lock-based multithreaded
                 programs. Eraser uses binary rewriting techniques to
                 monitor every shared-memory reference and verify that
                 consistent locking behavior is observed. We present
                 several case studies, including undergraduate
                 coursework and a multithreaded Web search engine, that
                 demonstrate the effectiveness of this approach.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; experimentation; reliability",
  subject =      "{\bf D.2.5} Software, SOFTWARE ENGINEERING, Testing
                 and Debugging, Monitors. {\bf D.1.3} Software,
                 PROGRAMMING TECHNIQUES, Concurrent Programming,
                 Parallel programming. {\bf D.2.5} Software, SOFTWARE
                 ENGINEERING, Testing and Debugging, Debugging aids.
                 {\bf D.2.5} Software, SOFTWARE ENGINEERING, Testing and
                 Debugging, Tracing. {\bf D.4.1} Software, OPERATING
                 SYSTEMS, Process Management, Concurrency. {\bf D.4.1}
                 Software, OPERATING SYSTEMS, Process Management,
                 Deadlocks. {\bf D.4.1} Software, OPERATING SYSTEMS,
                 Process Management,
                 Multiprocessing/multiprogramming/multitasking. {\bf
                 D.4.1} Software, OPERATING SYSTEMS, Process Management,
                 Mutual exclusion.",
}

@Article{Shepherd:1997:UCA,
  author =       "George Shepherd and Scot Wingo",
  title =        "Undocumented Corner: {ATL} and the {IUknown
                 Interface",
  journal =      j-DDJ,
  volume =       "22",
  number =       "8",
  pages =        "119--123",
  month =        aug,
  year =         "1997",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Mon Aug 11 11:38:10 MDT 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "George and Scot continue their examination of
                 Microsoft's Active Template Library, this month looking
                 at the heart of ATL, including its support for
                 multithreading and its various implementations of
                 IUnknown.",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Shoffner:1997:JSSa,
  author =       "Michael Shoffner",
  title =        "{Java} Step by Step: Write your own threaded
                 discussion forum",
  journal =      j-JAVAWORLD,
  volume =       "2",
  number =       "2",
  pages =        "??--??",
  month =        feb,
  year =         "1997",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 14:52:24 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-02-1997/jw-02-step.htm",
  acknowledgement = ack-nhfb,
}

@Article{Shoffner:1997:JSSb,
  author =       "Michael Shoffner",
  title =        "{Java} Step By Step: Write your own threaded
                 discussion forum: The communications and server
                 components",
  journal =      j-JAVAWORLD,
  volume =       "2",
  number =       "3",
  pages =        "??--??",
  month =        mar,
  year =         "1997",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 14:52:25 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-03-1997/jw-03-step.htm",
  acknowledgement = ack-nhfb,
}

@Article{Sime:1997:GPM,
  author =       "J. Sime",
  title =        "Guarded pointers: moving smart pointers into
                 multithreaded systems",
  journal =      j-C-PLUS-PLUS-REPORT,
  volume =       "9",
  number =       "4",
  pages =        "32--41",
  month =        apr,
  year =         "1997",
  CODEN =        "CRPTE7",
  ISSN =         "1040-6042",
  bibdate =      "Thu Apr 24 09:46:14 MDT 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110J (Object-oriented programming); C6120 (File
                 organisation); C6130 (Data handling techniques); C6150N
                 (Distributed systems software)",
  fjournal =     "C++ Report",
  keywords =     "abstract data types; C listings; concurrency control;
                 concurrency control pattern; data integrity; exception
                 handling; guarded pointers; multiprogramming;
                 multithreaded systems; object-oriented programming;
                 protected data resource; protection proxy pattern;
                 reference count lock; safety; smart pointers; thread
                 safety mechanisms",
  treatment =    "P Practical",
}

@Article{Sinharoy:1997:OTC,
  author =       "Balaram Sinharoy",
  title =        "Optimized Thread Creation for Processor
                 Multithreading",
  journal =      j-COMP-J,
  volume =       "40",
  number =       "6",
  pages =        "388--??",
  month =        "????",
  year =         "1997",
  CODEN =        "CMPJA6",
  ISSN =         "0010-4620 (print), 1460-2067 (electronic)",
  ISSN-L =       "0010-4620",
  bibdate =      "Wed Jul 21 09:55:15 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/compj1990.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.oup.co.uk/computer_journal/Volume_40/Issue_06/Vol40_06.index.html",
  URL =          "http://www.oup.co.uk/computer_journal/Volume_40/Issue_06/Vol40_06.body.html#AbstractSinharoy;
                 http://www3.oup.co.uk/computer_journal/Volume_40/Issue_06/Vol40_06.body.html#AbstractSinharoy",
  acknowledgement = ack-nhfb,
  email-1 =      "balaram@watson.ibm.com",
  fjournal =     "The Computer Journal",
  journal-URL =  "http://comjnl.oxfordjournals.org/",
}

@Article{Sodan:1997:ENN,
  author =       "Angela Sodan and Guang R. Gao and Olivier Maquelin and
                 Jens-Uwe Schultz and Xin-Min Tian",
  title =        "Experiences with Non-numeric Applications on
                 Multithreaded Architectures",
  journal =      j-SIGPLAN,
  volume =       "32",
  number =       "7",
  pages =        "124--135",
  month =        jul,
  year =         "1997",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:17:35 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan1990.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Sohn:1997:DWD,
  author =       "Andrew Sohn and Mitsuhisa Sato and Namhoon Yoo and
                 Jean-Luc Gaudiot",
  title =        "Data and Workload Distribution in a Multithreaded
                 Architecture",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "40",
  number =       "2",
  pages =        "256--264",
  day =          "1",
  month =        feb,
  year =         "1997",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1996.1262",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:02 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1262/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1262/production/pdf;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1262/production/ref",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Stewart:1997:MDH,
  author =       "David B. Stewart and Pradeep K. Khosla",
  title =        "Mechanisms for Detecting and Handling Timing Errors",
  journal =      j-CACM,
  volume =       "40",
  number =       "1",
  pages =        "87--93",
  month =        jan,
  year =         "1997",
  CODEN =        "CACMA2",
  ISSN =         "0001-0782 (print), 1557-7317 (electronic)",
  ISSN-L =       "0001-0782",
  bibdate =      "Fri Oct 10 18:17:54 MDT 1997",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/cacm/1997-40-1/p87-stewart/",
  acknowledgement = ack-nhfb,
  classification = "C6110B (Software engineering techniques); C6130
                 (Data handling techniques); C6150J (Operating
                 systems)",
  corpsource =   "Inst. for Adv. Comput. Studies, Maryland Univ.,
                 College Park, MD, USA",
  fjournal =     "Communications of the ACM",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J79",
  keywords =     "adaptive real-time scheduling; aperiodic servers;
                 Chimera; design; error handling; imprecise computation;
                 low-overhead policy-independent system; management;
                 operating systems (computers); performance; periodic
                 threads; real- time operating system; real-time
                 systems; real-time systems analysis; real-time threads;
                 reliability; scheduling; scheduling policies; software
                 fault tolerance; specifications; system failure;
                 theory; timing; timing error detection; worst-case
                 execution times",
  subject =      "{\bf K.6.3} Computing Milieux, MANAGEMENT OF COMPUTING
                 AND INFORMATION SYSTEMS, Software Management, Software
                 development. {\bf C.3} Computer Systems Organization,
                 SPECIAL-PURPOSE AND APPLICATION-BASED SYSTEMS,
                 Real-time systems. {\bf C.4} Computer Systems
                 Organization, PERFORMANCE OF SYSTEMS.",
  treatment =    "P Practical",
}

@Article{Taura:1997:FGM,
  author =       "Kenjiro Taura and Akinori Yonezawa",
  title =        "Fine-grain Multithreading with Minimal Compiler
                 Support --- a Cost Effective Approach to Implementing
                 Efficient Multithreading Languages",
  journal =      j-SIGPLAN,
  volume =       "32",
  number =       "5",
  pages =        "320--333",
  month =        may,
  year =         "1997",
  CODEN =        "SINODQ",
  ISBN =         "0-89791-907-6",
  ISBN-13 =      "978-0-89791-907-4",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu May 13 12:37:28 MDT 1999",
  bibsource =    "http://www.acm.org/pubs/contents/proceedings/pldi/258915/index.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan1990.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/pldi/258915/p320-taura/",
  acknowledgement = ack-nhfb,
  annote =       "Published as part of the Proceedings of PLDI'97.",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "design; languages; measurement; performance;
                 standardization; theory",
  subject =      "{\bf D.3.4} Software, PROGRAMMING LANGUAGES,
                 Processors, Compilers. {\bf D.3.3} Software,
                 PROGRAMMING LANGUAGES, Language Constructs and
                 Features, Data types and structures. {\bf D.3.2}
                 Software, PROGRAMMING LANGUAGES, Language
                 Classifications. {\bf D.3.4} Software, PROGRAMMING
                 LANGUAGES, Processors, Code generation. {\bf C.2.2}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Network Protocols.",
}

@PhdThesis{TempleLang:1997:MTE,
  author =       "Duncan Walter {Temple Lang}",
  title =        "A multi-threaded extension to a high level interactive
                 statistical computing environment",
  type =         "Thesis ({Ph.D. in Statistics})",
  school =       "Dept. of Statistics, University of California,
                 Berkeley",
  address =      "Berkeley, CA, USA",
  pages =        "vii + 161",
  month =        dec,
  year =         "1997",
  bibdate =      "Sat Apr 20 11:15:46 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Thompson:1997:THP,
  author =       "P. Thompson and G. Bumgardner",
  title =        "{Threads.h++}: a portable {C++} library for
                 multithreaded programming",
  journal =      j-C-PLUS-PLUS-REPORT,
  volume =       "9",
  number =       "3",
  pages =        "24--37",
  month =        mar,
  year =         "1997",
  CODEN =        "CRPTE7",
  ISSN =         "1040-6042",
  bibdate =      "Thu Apr 24 09:46:14 MDT 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110B (Software engineering techniques); C6110J
                 (Object-oriented programming); C6115 (Programming
                 support); C6150J (Operating systems)",
  fjournal =     "C++ Report",
  keywords =     "application development; application program
                 interfaces; C language; low-level procedural API;
                 multiprocessor machines; multiprogramming;
                 multithreaded programming; object-oriented
                 abstractions; object-oriented languages;
                 object-oriented programming; operating systems;
                 portable C++ library; responsive performance; software
                 libraries; software portability; synchronisation;
                 synchronization; thread control; thread creation;
                 Threads.h++; Web browsers",
  treatment =    "P Practical",
}

%%% NOTE(review): this entry is a byte-for-byte duplicate of
%%% Thompson:1997:THP above (identical fields); the key is retained
%%% only so existing \cite{Thompson:1997:TPC} commands keep working.
@Article{Thompson:1997:TPC,
  author =       "P. Thompson and G. Bumgardner",
  title =        "{Threads.h++}: a portable {C++} library for
                 multithreaded programming",
  journal =      j-C-PLUS-PLUS-REPORT,
  volume =       "9",
  number =       "3",
  pages =        "24--37",
  month =        mar,
  year =         "1997",
  CODEN =        "CRPTE7",
  ISSN =         "1040-6042",
  bibdate =      "Thu Apr 24 09:46:14 MDT 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110B (Software engineering techniques); C6110J
                 (Object-oriented programming); C6115 (Programming
                 support); C6150J (Operating systems)",
  fjournal =     "C++ Report",
  keywords =     "application development; application program
                 interfaces; C language; low-level procedural API;
                 multiprocessor machines; multiprogramming;
                 multithreaded programming; object-oriented
                 abstractions; object-oriented languages;
                 object-oriented programming; operating systems;
                 portable C++ library; responsive performance; software
                 libraries; software portability; synchronisation;
                 synchronization; thread control; thread creation;
                 Threads.h++; Web browsers",
  treatment =    "P Practical",
}

@TechReport{Tsai:1997:PSC,
  author =       "Jenn-Yuan Tsai",
  title =        "Performance study of a concurrent multithreaded
                 processor",
  type =         "Technical report",
  number =       "TR 97-034",
  institution =  "University of Minnesota, Dept. of Computer Science and
                 Engineering",
  address =      "Minneapolis, MN, USA",
  pages =        "24",
  year =         "1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The performance of a concurrent multithreaded
                 architectural model, called superthreading [15], is
                 studied in this paper. It tries to integrate optimizing
                 compilation techniques and run-time hardware support to
                 exploit both thread-level and instruction-level
                 parallelism, as opposed to exploit only
                 instruction-level parallelism in existing superscalars.
                 The superthreaded architecture uses a thread pipelining
                 execution model to enhance the overlapping between
                 threads, and to facilitate data dependence enforcement
                 between threads through compiler-directed,
                 hardware-supported, thread-level control speculation
                 and run-time data dependence checking. We also evaluate
                 the performance of the superthreaded processor through
                 a detailed trace-driven simulator. Our results show
                 that the superthreaded execution model can obtain good
                 performance by exploiting both thread-level and
                 instruction-level parallelism in programs. We also
                 study the design parameters of its main system
                 components, such as the size of the memory buffer, the
                 bandwidth requirement of the communication links
                 between thread processing units, and the bandwidth
                 requirement of the shared data cache.",
  acknowledgement = ack-nhfb,
  annote =       "Supported in part by the National Science Foundation.
                 Supported in part by the U.S. Army Intelligence Center
                 and Fort Huachuca. Supported in part by a gift from
                 Intel Corporation",
  keywords =     "Compilers (Computer programs); Computer architecture;
                 Parallel processing (Electronic computers); Threads
                 (Computer programs)",
}

@TechReport{Tsai:1997:SIC,
  author =       "Jenn-Yuan Tsai",
  title =        "Superthreading: integrating compilation technology and
                 processor architecture for cost-effective concurrent
                 multithreading",
  type =         "Technical report",
  number =       "TR 97-033",
  institution =  "University of Minnesota, Dept. of Computer Science and
                 Engineering",
  address =      "Minneapolis, MN, USA",
  pages =        "16",
  day =          "29",
  month =        jan,
  year =         "1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "As the number of transistors that can be integrated on
                 a single chip continues to grow, it is important for
                 computer architects to think beyond the traditional
                 approaches of deeper pipelines and wider instruction
                 issue units for improving performance. This
                 single-threaded execution model limits these approaches
                 to exploiting only the relatively small amount of
                 instruction-level parallelism available in application
                 programs. While integrating an entire multiprocessor
                 onto a single chip is feasible, this architecture is
                 limited to exploiting only relatively coarse-grained
                 heavy-weight parallelism. We propose the superthreaded
                 architecture as an excellent alternative for utilizing
                 the large number of transistors that will become
                 available on a single high-density chip. As a hybrid of
                 a wide-issue superscalar processor and a
                 multiprocessor-on-a-chip, this new concurrent
                 multithreading architecture can leverage the best of
                 existing and future parallel hardware and software
                 technologies. By incorporating speculation for control
                 dependences and run-time checking of data dependences,
                 the superthreaded architecture can exploit the multiple
                 granularities of parallelism available in
                 general-purpose application programs to reduce the
                 execution time of a single program.",
  acknowledgement = ack-nhfb,
  annote =       "Supported in part by the U.S. Army Intelligence Center
                 and Fort Huachuca. Supported in part by the National
                 Science Foundation. Supported in part by a gift from
                 the Intel Corporation",
  keywords =     "Compilers (Computer programs); Computer architecture;
                 Parallel processing (Electronic computers); Threads
                 (Computer programs)",
}

@Article{Vanhelsuwe:1997:BRJ,
  author =       "Laurence Vanhelsuw{\'e}",
  title =        "Book Review: The {Java} {Threads} {API} makes it to
                 print media",
  journal =      j-JAVAWORLD,
  volume =       "2",
  number =       "7",
  pages =        "??--??",
  month =        jul,
  year =         "1997",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 14:52:27 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-07-1997/jw-07-threads.htm",
  acknowledgement = ack-nhfb,
}

@Article{Vanhelsuwe:1997:JPE,
  author =       "Laurence Vanhelsuw{\'e}",
  title =        "{JavaBeans}: properties, events, and thread safety",
  journal =      j-JAVAWORLD,
  volume =       "2",
  number =       "9",
  pages =        "??--??",
  month =        sep,
  year =         "1997",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 14:52:28 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-09-1997/jw-09-raceconditions.htm",
  acknowledgement = ack-nhfb,
}

@Article{Venners:1997:UHH,
  author =       "Bill Venners",
  title =        "Under the Hood: How the {Java} virtual machine
                 performs thread synchronization",
  journal =      j-JAVAWORLD,
  volume =       "2",
  number =       "7",
  pages =        "??--??",
  month =        jul,
  year =         "1997",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 14:52:27 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-07-1997/jw-07-hood.htm",
  acknowledgement = ack-nhfb,
}

@Article{Vermeulen:1997:JDW,
  author =       "Alain Vermeulen",
  title =        "{Java} Deadlock: The woes of multithreaded design",
  journal =      j-DDJ,
  volume =       "22",
  number =       "9",
  pages =        "52, 54--56, 88, 89",
  month =        sep,
  year =         "1997",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Mon Aug 11 12:53:44 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Weisz:1997:MFA,
  author =       "Russell Weisz",
  title =        "More First Aid for the Thread Impaired: Cool Ways to
                 Take Advantage of Multithreading",
  journal =      j-MICROSOFT-SYS-J,
  volume =       "12",
  number =       "7",
  pages =        "33--??",
  month =        jul,
  year =         "1997",
  CODEN =        "MSJOED",
  ISSN =         "0889-9932",
  bibdate =      "Sat Nov 7 10:33:30 MST 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Microsoft Systems Journal",
}

@Article{Whittaker:1997:TML,
  author =       "Steve Whittaker and Jerry Swanson and Jakov Kucan and
                 Candy Sidner",
  title =        "{TeleNotes}: managing lightweight interactions in the
                 desktop",
  journal =      j-TOCHI,
  volume =       "4",
  number =       "2",
  pages =        "137--168",
  month =        jun,
  year =         "1997",
  CODEN =        "ATCIF4",
  ISSN =         "1073-0516 (print), 1557-7325 (electronic)",
  ISSN-L =       "1073-0516",
  bibdate =      "Tue Jan 19 05:49:17 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tochi/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tochi/1997-4-2/p137-whittaker/",
  abstract =     "Communication theories and technology have tended to
                 focus on extended, formal meetings and have neglected a
                 prevalent and vital form of workplace communication ---
                 namely, lightweight communication. Unlike formal,
                 extended meetings, lightweight interaction is brief,
                 informal, unplanned, and intermittent. We analyze
                 naturalistic data from a study of work-place
                 communication and derive five design criteria for
                 lightweight interaction systems. These criteria require
                 that systems for lightweight interaction support {\em
                 conversational tracking, rapid connection}, the ability
                 to {\em leave a message}, {\em context management}, and
                 {\em shared real-time objects}. Using these criteria,
                 we evaluate existing interpersonal communications
                 technologies. We then describe an implementation of a
                 system (TeleNotes) that is designed to support
                 lightweight interaction by meeting these criteria. The
                 interface metaphor allows communications to be based
                 around desktop objects, resembling ``sticky notes.''
                 These objects are also organized into ``desktop piles''
                 to support conversational threads and provide
                 mechanisms for initiating real-time audio, video, and
                 application sharing. We conducted informal user testing
                 of several system prototypes. Based on our findings,
                 outstanding issues concerning theory and systems design
                 for communication systems are outlined --- in
                 particular, with regard to the issue of managing
                 conversations over time.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer-Human Interaction",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J756",
  keywords =     "human factors",
  subject =      "{\bf H.5.3} Information Systems, INFORMATION
                 INTERFACES AND PRESENTATION, Group and Organization
                 Interfaces, Evaluation/methodology. {\bf H.1.2}
                 Information Systems, MODELS AND PRINCIPLES,
                 User/Machine Systems, Human factors. {\bf H.5.3}
                 Information Systems, INFORMATION INTERFACES AND
                 PRESENTATION, Group and Organization Interfaces,
                 Asynchronous interaction. {\bf I.3.6} Computing
                 Methodologies, COMPUTER GRAPHICS, Methodology and
                 Techniques, Interaction techniques. {\bf H.5.3}
                 Information Systems, INFORMATION INTERFACES AND
                 PRESENTATION, Group and Organization Interfaces,
                 Synchronous interaction. {\bf H.5.1} Information
                 Systems, INFORMATION INTERFACES AND PRESENTATION,
                 Multimedia Information Systems,
                 Evaluation/methodology.",
}

@Article{Wilson:1997:BTP,
  author =       "Greg Wilson",
  title =        "Bookshelf: Threads Primer: a Guide To Multithreaded
                 Programming",
  journal =      j-IEEE-SOFTWARE,
  volume =       "14",
  number =       "5",
  pages =        "116--116",
  month =        sep # "\slash " # oct,
  year =         "1997",
  CODEN =        "IESOEG",
  ISSN =         "0740-7459 (print), 0740-7459 (electronic)",
  ISSN-L =       "0740-7459",
  bibdate =      "Mon Sep 15 22:35:10 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeesoft.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://dlib.computer.org/so/books/so1997/pdf/s5115.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Software",
  journal-URL =  "http://www.computer.org/portal/web/csdl/magazines/software",
}

@MastersThesis{Yang:1997:MUA,
  author =       "Chia Wei Yang",
  title =        "A multi-context uniprocessor: another multithreaded
                 architecture",
  type =         "Thesis ({M.S.})",
  school =       "California Polytechnic State University",
  address =      "San Luis Obispo, CA, USA",
  pages =        "viii + 129",
  year =         "1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  annote =       "Proposes a computer architecture model that adapts all
                 advantages from multithreaded models to a uniprocessor
                 environment.",
  keywords =     "Computer architecture; Multiprocessors; Parallel
                 processing (Electronic Computers)",
}

@Book{Adamo:1998:MTO,
  author =       "Jean-Marc Adamo",
  title =        "Multi-threaded object-oriented {MPI}-based message
                 passing interface: the {ARCH} library",
  volume =       "SECS 446",
  publisher =    pub-KLUWER,
  address =      pub-KLUWER:adr,
  pages =        "xiv + 185",
  year =         "1998",
  ISBN =         "0-7923-8165-3",
  ISBN-13 =      "978-0-7923-8165-5",
  LCCN =         "TK5102.5.A293 1998",
  bibdate =      "Fri Aug 7 08:29:38 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  price =        "US\$120.00",
  series =       "The Kluwer international series in engineering and
                 computer science",
  acknowledgement = ack-nhfb,
  keywords =     "data transmission systems; object-oriented programming
                 (computer science); threads (computer programs)",
  libnote =      "Not yet in my library.",
}

@Article{Aiex:1998:CMT,
  author =       "R. M. Aiex and S. L. Martins and C. C. Ribeiro and N.
                 D. L. R. Rodriguez",
  title =        "Cooperative Multi-thread Parallel Tabu Search with an
                 Application to Circuit Partitioning",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1457",
  pages =        "310--??",
  year =         "1998",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Oct 10 14:40:24 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Amaranth:1998:TBM,
  author =       "Paul Amaranth",
  title =        "A {Tcl}-based Multithreaded Test Harness",
  crossref =     "USENIX:1998:PSA",
  pages =        "??--??",
  year =         "1998",
  bibdate =      "Fri Oct 18 07:49:55 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://db.usenix.org/publications/library/proceedings/tcl98/amaranth.html",
  acknowledgement = ack-nhfb,
}

@Article{Anonymous:1998:MS,
  author =       "Anonymous",
  title =        "Multithreaded System",
  journal =      j-IEEE-MICRO,
  volume =       "18",
  number =       "3",
  pages =        "76--76",
  month =        may # "\slash " # jun,
  year =         "1998",
  CODEN =        "IEMIDZ",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Thu Dec 14 06:08:58 MST 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Science Citation Index database (1980--2000)",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

@Article{Anonymous:1998:NTS,
  author =       "Anonymous",
  title =        "New Tools: Software Development: {Uniscape}'s
                 Internationalization Library; {Global Technologies}'
                 {Unix-to-NT} Solution; {KAI}'s Multithreaded {Java}
                 Debugging Tool; {Price Systems}' Parametric Forecasting
                 Tool",
  journal =      j-COMPUTER,
  volume =       "31",
  number =       "6",
  pages =        "98, 102",
  month =        jun,
  year =         "1998",
  CODEN =        "CPTRB4",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Thu Jun 4 08:22:02 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/computer1990.bib;
                 https://www.math.utah.edu/pub/tex/bib/java.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://dlib.computer.org/co/books/co1998/pdf/r6098.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
}

@Article{Ball:1998:MTA,
  author =       "Steve Ball and John Miller Crawford",
  title =        "Multi-Threaded Assignment Surprises",
  journal =      j-JAVA-REPORT,
  volume =       "3",
  number =       "??",
  pages =        "??--??",
  month =        sep,
  year =         "1998",
  CODEN =        "JREPFI",
  ISSN =         "1086-4660",
  bibdate =      "Sat Dec 26 13:52:53 1998",
  bibsource =    "http://archive.javareport.com/9809/html/from_pages/index.shtml;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://archive.javareport.com/9809/html/from_pages/ftp_col1.shtml",
  abstract =     "A volatile brew is formed by mixing assignment and
                 threads. Perils and surprises lurk within the most
                 innocent-looking statement. We expose those perils and
                 surprises and point out where you need to proceed with
                 due caution to ensure the effective use of locked
                 objects.",
  acknowledgement = ack-nhfb,
}

@Article{Bangs:1998:BOS,
  author =       "Gaurav Banga and Peter Druschel and Jeffrey C. Mogul",
  title =        "Better operating system features for faster network
                 servers",
  journal =      j-SIGMETRICS,
  volume =       "26",
  number =       "3",
  pages =        "23--30",
  month =        dec,
  year =         "1998",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/306225.306234",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Thu Jun 26 11:27:29 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Widely-used operating systems provide inadequate
                 support for large-scale Internet server applications.
                 Their algorithms and interfaces fail to efficiently
                 support either event-driven or multi-threaded servers.
                 They provide poor control over the scheduling and
                 management of machine resources, making it difficult to
                 provide robust and controlled service. We propose new
                 UNIX interfaces to improve scalability, and to provide
                 fine-grained scheduling and resource management.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
}

@TechReport{Beebe:1998:BPA,
  author =       "Nelson H. F. Beebe",
  title =        "A Bibliography of Publications about Multithreading",
  institution =  inst-CSC,
  address =      inst-CSC:adr,
  pages =        "15",
  day =          "7",
  month =        aug,
  year =         "1998",
  bibdate =      "Sat Apr 11 10:26:14 1998",
  bibsource =    "https://www.math.utah.edu/pub/bibnet/authors/b/beebe-nelson-h-f.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "This report is updated frequently.",
  URL =          "https://www.math.utah.edu/pub/tex/bib/index-table-m.html#multithreading",
  acknowledgement = ack-nhfb,
}

@Article{Biagioni:1998:SST,
  author =       "Edoardo Biagioni and Ken Cline and Peter Lee and Chris
                 Okasaki and Chris Stone",
  title =        "Safe-for-Space Threads in {Standard ML}",
  journal =      j-HIGHER-ORDER-SYMB-COMPUT,
  volume =       "11",
  number =       "2",
  pages =        "209--225",
  month =        dec,
  year =         "1998",
  CODEN =        "LSCOEX",
  DOI =          "https://doi.org/10.1023/A:1010016600604",
  ISSN =         "1388-3690 (print), 2212-0793 (electronic)",
  ISSN-L =       "1388-3690",
  bibdate =      "Wed Jul 6 15:50:28 MDT 2005",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=1388-3690&volume=11&issue=2;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.wkap.nl/issuetoc.htm/1388-3690+11+2+1998;
                 OCLC Contents1st database",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=1388-3690&volume=11&issue=2&spage=209;
                 http://www.wkap.nl/oasis.htm/187569",
  acknowledgement = ack-nhfb,
  fjournal =     "Higher-Order and Symbolic Computation",
}

@TechReport{Bic:1998:MAD,
  author =       "Lubomir Bic and Michael B. Dillencourt and Munehiro
                 Fukuda",
  title =        "Mobile agents, {DSM}, coordination, and self-migrating
                 threads: a common framework",
  type =         "UCI-ICS technical report",
  number =       "98-33",
  institution =  "Information and Computer Science, University of
                 California, Irvine",
  address =      "Irvine, CA",
  pages =        "11",
  day =          "8",
  month =        oct,
  year =         "1998",
  LCCN =         "Z699 .C3 no.98-33",
  bibdate =      "Fri May 10 12:18:17 MDT 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "distributed shared memory; intelligent agents
                 (computer software)",
}

@Article{Blumofe:1998:SES,
  author =       "Robert D. Blumofe and Charles E. Leiserson",
  title =        "Space-Efficient Scheduling of Multithreaded
                 Computations",
  journal =      j-SIAM-J-COMPUT,
  volume =       "27",
  number =       "1",
  pages =        "202--229",
  month =        feb,
  year =         "1998",
  CODEN =        "SMJCAT",
  ISSN =         "0097-5397 (print), 1095-7111 (electronic)",
  ISSN-L =       "0097-5397",
  bibdate =      "Sat Dec 5 17:26:53 MST 1998",
  bibsource =    "http://epubs.siam.org/sam-bin/dbq/toclist/SICOMP/27/1;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://epubs.siam.org/sam-bin/dbq/article/25947",
  acknowledgement = ack-nhfb,
  fjournal =     "SIAM Journal on Computing",
  journal-URL =  "http://epubs.siam.org/sicomp",
}

@InProceedings{Brunett:1998:IET,
  author =       "Sharon M. Brunett and John Thornley and Mark
                 Ellenbecker",
  title =        "An Initial Evaluation of the {Tera} Multithreaded
                 Architecture and Programming System Using the {C3I}
                 Parallel Benchmark Suite",
  crossref =     "ACM:1998:SHP",
  pages =        "??--??",
  year =         "1998",
  bibdate =      "Wed Mar 06 06:27:47 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.supercomp.org/sc98/papers/",
  URL =          "http://www.supercomp.org/sc98/TechPapers/sc98_FullAbstracts/Brunett1063/Index.htm",
  acknowledgement = ack-nhfb,
}

@InProceedings{Caromel:1998:JFS,
  author =       "Denis Caromel and Julien Vayssi{\`e}re",
  title =        "A {Java} Framework for Seamless Sequential,
                 Multi-threaded, and Distributed Programming",
  crossref =     "ACM:1998:AWJ",
  pages =        "??--??",
  year =         "1998",
  bibdate =      "Thu Apr 27 10:43:08 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.cs.ucsb.edu/conferences/java98/papers/javapp.pdf;
                 http://www.cs.ucsb.edu/conferences/java98/papers/javapp.ps",
  acknowledgement = ack-nhfb,
}

@Article{Chapman:1998:OHI,
  author =       "B. Chapman and P. Mehrotra",
  title =        "{OpenMP} and {HPF}: Integrating Two Paradigms",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1470",
  pages =        "650--??",
  year =         "1998",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Oct 10 14:40:24 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Chen:1998:MTO,
  author =       "Jiajun Chen and Xiaodong Yuan and Guoliang Zheng",
  title =        "A multi-threaded object-oriented programming model",
  journal =      j-SIGSOFT,
  volume =       "23",
  number =       "3",
  pages =        "83--86",
  month =        may,
  year =         "1998",
  CODEN =        "SFENDP",
  DOI =          "https://doi.org/10.1145/279437.279477",
  ISSN =         "0163-5948 (print), 1943-5843 (electronic)",
  ISSN-L =       "0163-5948",
  bibdate =      "Wed Aug 1 17:13:36 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigsoft1990.bib",
  abstract =     "This paper presents a concurrent object-oriented
                 programming (COOP) model established around concurrent
                 objects which may have a body. Once an object with a
                 body is created, its body begins to run as a separate
                 execution thread of the object. Distinguished from some
                 active-object-based concurrent object-oriented models,
                 the object body in our model is not used for the
                 concurrency control of objects, but only as a mechanism
                 to introduce concurrent executions into OO model.
                 Concurrency control is specified by the attributes of
                 objects and the control codes are generated by a
                 compiling system based on these attributes. In
                 addition, objects should be designed in such a way that
                 they can be used in both sequential and concurrent
                 environments, no matter whether they have a body or
                 not. In our model, several execution threads may
                 coexist in an object and some synchronization
                 mechanisms are provided to control the concurrent
                 executions of these threads. The paper presents two
                 examples of concurrent programming with our model.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGSOFT Software Engineering Notes",
  journal-URL =  "https://dl.acm.org/citation.cfm?id=J728",
}

@Book{Cohen:1998:WMP,
  author =       "Aaron Cohen and Mike Woodring",
  title =        "{Win32} Multithreaded Programming",
  publisher =    pub-ORA,
  address =      pub-ORA:adr,
  pages =        "xv + 705",
  year =         "1998",
  ISBN =         "1-56592-296-4",
  ISBN-13 =      "978-1-56592-296-9",
  LCCN =         "QA76.76.O63 C633 1998",
  bibdate =      "Fri Aug 7 08:29:38 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  price =        "US\$39.95",
  URL =          "http://www.ora.com/catalog/multithread/;
                 http://www.oreilly.com/catalog/multithread",
  acknowledgement = ack-nhfb,
  keywords =     "Microsoft Win32; Microsoft Windows (Computer file);
                 Operating systems (Computers)",
}

@Article{Criscolo:1998:JQ,
  author =       "Mike Criscolo",
  title =        "{Java Q\&A}: How Do {I} Queue {Java} Threads?",
  journal =      j-DDJ,
  volume =       "23",
  number =       "10",
  pages =        "127--129",
  month =        oct,
  year =         "1998",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Fri Sep 11 09:12:05 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/ftp/1998/1998_10/jqa108.txt;
                 http://www.ddj.com/ftp/1998/1998_10/jqa108.zip",
  abstract =     "In examining queuing techniques in Java, Mike presents
                 one approach to multithreading he has implemented, and
                 examines the differences between centralized- and
                 distributed-queuing models. Additional resources
                 include jqa108.txt (listings) and jqa108.zip (source
                 code).",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Criscolo:1998:JQH,
  author =       "Mike Criscolo",
  title =        "{Java Q and A}: How Do {I} Queue {Java} Threads?",
  journal =      j-DDJ,
  volume =       "23",
  number =       "10",
  pages =        "127--129",
  month =        oct,
  year =         "1998",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Fri Sep 11 09:12:05 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/ftp/1998/1998_10/jqa108.txt;
                 http://www.ddj.com/ftp/1998/1998_10/jqa108.zip",
  abstract =     "In examining queuing techniques in Java, Mike presents
                 one approach to multithreading he has implemented, and
                 examines the differences between centralized- and
                 distributed-queuing models. Additional resources
                 include jqa108.txt (listings) and jqa108.zip (source
                 code).",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Cromwell:1998:PBD,
  author =       "Jeff Cromwell",
  title =        "Programmer's Bookshelf: The Dawning of the Age of
                 Multithreading",
  journal =      j-DDJ,
  volume =       "23",
  number =       "9",
  pages =        "127, 129",
  month =        sep,
  year =         "1998",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Wed Aug 05 10:12:23 1998",
  bibsource =    "http://www.ddj.com/ddj/1998/1998_09/index.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Jeff's focus this month is multithreading, as he
                 examines {\em Multithreading Programming Techniques in
                 Win32}, by Jim Beveridge and R. Wiener, {\em
                 Object-Oriented Multithreading Using C++}, by Cameron
                 and Tracy Hughes, and {\em Multithreading Programming
                 Techniques}, by Shashi Prasad.",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Dagum:1998:OIS,
  author =       "Leonardo Dagum and Ramesh Menon",
  title =        "{OpenMP}: An Industry-Standard {API} for Shared-Memory
                 Programming",
  journal =      j-IEEE-COMPUT-SCI-ENG,
  volume =       "5",
  number =       "1",
  pages =        "46--55",
  month =        jan # "\slash " # mar,
  year =         "1998",
  CODEN =        "ISCEE4",
  DOI =          "https://doi.org/10.1109/99.660313",
  ISSN =         "1070-9924 (print), 1558-190X (electronic)",
  ISSN-L =       "1070-9924",
  bibdate =      "Sat Jan 9 08:57:23 MST 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://dlib.computer.org/cs/books/cs1998/pdf/c1046.pdf;
                 http://www.computer.org/cse/cs1998/c1046abs.htm",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computational Science \& Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=99",
}

@Article{DeRusso:1998:MEH,
  author =       "Joe {DeRusso, III} and Peter Haggar",
  title =        "Multithreaded Exception Handling in {Java}",
  journal =      j-JAVA-REPORT,
  volume =       "3",
  number =       "??",
  pages =        "??--??",
  month =        aug,
  year =         "1998",
  CODEN =        "JREPFI",
  ISSN =         "1086-4660",
  bibdate =      "Sat Dec 26 13:52:53 1998",
  bibsource =    "http://archive.javareport.com/9808/html/from_pages/index.shtml;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://archive.javareport.com/9808/html/from_pages/ftp_feature.shtml",
  abstract =     "Introducing new classes and interfaces to be used when
                 writing multithreaded Java programs. These classes are
                 small, easy to use, and effectively enable you to
                 handle exceptions occurring on secondary threads.",
  acknowledgement = ack-nhfb,
}

@Article{Dyer:1998:CAS,
  author =       "Dave Dyer",
  title =        "Can {Assure} save {Java} from the perils of
                 multithreading?",
  journal =      j-JAVAWORLD,
  volume =       "3",
  number =       "10",
  pages =        "??--??",
  year =         "1998",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Mon Jan 4 06:11:43 MST 1999",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-10-1998/jw-10-assure.htm",
  acknowledgement = ack-nhfb,
}

@Article{Eskilson:1998:SMM,
  author =       "Jesper Eskilson and Mats Carlsson",
  title =        "{SICStus MT} --- a Multithreaded Execution
                 Environment for {SICStus Prolog}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1490",
  pages =        "36--53",
  year =         "1998",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Feb 5 11:53:01 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1490.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1490/14900036.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1490/14900036.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Frigo:1998:ICM,
  author =       "Matteo Frigo and Charles E. Leiserson and Keith H.
                 Randall",
  title =        "The Implementation of the {Cilk-5} Multithreaded
                 Language",
  journal =      j-SIGPLAN,
  volume =       "33",
  number =       "5",
  pages =        "212--223",
  month =        may,
  year =         "1998",
  CODEN =        "SINODQ",
  ISBN =         "0-89791-987-4",
  ISBN-13 =      "978-0-89791-987-6",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:17:47 MST 2003",
  bibsource =    "http://www.acm.org/pubs/contents/proceedings/pldi/277650/index.html;
                 http://www.cs.virginia.edu/pldi98/program.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/pldi/277650/p212-frigo/",
  acknowledgement = ack-nhfb,
  annote =       "Published as part of the Proceedings of PLDI'98.",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "algorithms; languages; performance",
  subject =      "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language
                 Classifications, Concurrent, distributed, and parallel
                 languages. {\bf D.1.3} Software, PROGRAMMING
                 TECHNIQUES, Concurrent Programming, Parallel
                 programming. {\bf D.3.3} Software, PROGRAMMING
                 LANGUAGES, Language Constructs and Features, Control
                 structures. {\bf D.3.2} Software, PROGRAMMING
                 LANGUAGES, Language Classifications, C.",
}

@Article{Geary:1998:SM,
  author =       "David Geary",
  title =        "{Swing} and multithreading",
  journal =      j-JAVA-REPORT,
  volume =       "3",
  number =       "??",
  pages =        "??--??",
  month =        nov,
  year =         "1998",
  CODEN =        "JREPFI",
  ISSN =         "1086-4660",
  bibdate =      "Sat Dec 26 13:52:53 1998",
  bibsource =    "http://archive.javareport.com/9811/html/from_pages/index.shtml;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://archive.javareport.com/9811/html/from_pages/ftp_col1.shtml",
  abstract =     "Read about why Swing is not thread-safe and the
                 ramifications of a single-threaded design for
                 developers using Swing.",
  acknowledgement = ack-nhfb,
}

@Article{Girkar:1998:IIM,
  author =       "Milind Girkar and Mohammad R. Haghighat and Paul Grey
                 and Hideki Saito and Nicholas Stavrakos and Constantine
                 D. Polychronopoulos",
  title =        "{Illinois-Intel} Multithreading Library:
                 Multithreading Support for {Intel} Architecture Based
                 Multiprocessor Systems",
  journal =      j-INTEL-TECH-J,
  number =       "Q1",
  pages =        "15",
  year =         "1998",
  ISSN =         "1535-766X",
  bibdate =      "Fri Jun 01 06:02:08 2001",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://developer.intel.com/technology/itj/q11998/articles/art_5.htm;
                 http://developer.intel.com/technology/itj/q11998/pdf/iml.pdf",
  acknowledgement = ack-nhfb,
}

@Article{Golla:1998:CEB,
  author =       "Prasad N. Golla and Eric C. Lin",
  title =        "A comparison of the effect of branch prediction on
                 multithreaded and scalar architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "26",
  number =       "4",
  pages =        "3--11",
  month =        sep,
  year =         "1998",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1216475.1216476",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 12:06:40 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Speculative instructions execution requires dynamic
                 branch predictors to increase the performance of a
                 processor by executing from predicted branch target
                 routines. Conventional Scalar architectures such as the
                 Superscalar or Multiscalar architecture executes from a
                 single stream, while a Multithreaded architecture
                 executes from multiple streams at a time. Several
                 aggressive branch predictors have been proposed with
                 high prediction accuracies. Unfortunately, none of the
                 branch predictors can provide 100\% accuracy.
                 Therefore, there is an inherent limitation on
                 speculative execution in real implementation. In this
                 paper, we show that Multithreaded architecture is a
                 better candidate for utilizing speculative execution
                 than Scalar architectures. Generally the branch
                 prediction performance degradation is compounded for
                 larger window sizes on Scalar architectures, while for
                 a Multithreaded architecture, by increasing the number
                 of executing threads, we could sustain a higher
                 performance for a large aggregated speculative window
                 size. Hence, heavier workloads may increase performance
                 and utilization for Multithreaded architectures. We
                 present analytical and simulation results to support
                 our argument.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@TechReport{Golla:1998:CMR,
  author =       "Prasad N. Golla and Eric C. Lin",
  title =        "Cache memory requirements for multithreaded
                 uniprocessor architecture",
  type =         "Technical paper",
  number =       "98-CSE-03",
  institution =  "Dept. of Computer Science and Engineering, Southern
                 Methodist University",
  address =      "Dallas, TX, USA",
  pages =        "32",
  year =         "1998",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Gomez:1998:CAM,
  author =       "J. C. Gomez and E. Mascarenhas and V. Rego",
  title =        "The {CLAM} Approach to Multithreaded Communication on
                 Shared Memory Multiprocessors: Design and Experiments",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "9",
  number =       "1",
  pages =        "36--49",
  month =        jan,
  year =         "1998",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/71.655241",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Fri Nov 6 12:31:15 MST 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://dlib.computer.org/td/books/td1998/pdf/l0036.pdf;
                 http://www.computer.org/tpds/td1998/l0036abs.htm",
  acknowledgement = ack-nhfb,
  classification = "B6150M (Protocols); B6210L (Computer
                 communications); C5440 (Multiprocessing systems); C5640
                 (Protocols); C5670 (Network performance)",
  corpsource =   "Dept. of Comput. Sci., Purdue Univ., West Lafayette,
                 IN, USA",
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
  keywords =     "CLAM approach; communications environment; message
                 passing; multithreaded communication; OS-level process;
                 performance evaluation; protocols; scalable
                 multiprotocol support; scheduling algorithms; shared
                 memory systems; shared-memory multiprocessors;
                 user-space protocols",
  treatment =    "A Application; P Practical",
}

@Article{Gruen:1998:NIS,
  author =       "T. Gruen and M. A. Hillebrand",
  title =        "{NAS} Integer Sort on Multi-threaded Shared Memory
                 Machines",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1470",
  pages =        "999--??",
  year =         "1998",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Oct 10 14:40:24 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Heber:1998:UMA,
  author =       "G. Heber and R. Biswas and P. Thulasiraman and G. R.
                 Gao",
  title =        "Using Multithreading for the Automatic Load Balancing
                 of Adaptive Finite Element Meshes",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1457",
  pages =        "132--??",
  year =         "1998",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Oct 10 14:40:24 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Holub:1998:PJTa,
  author =       "Allen Holub",
  title =        "Programming {Java} threads in the real world:
                 Threading Architectures",
  journal =      j-JAVAWORLD,
  volume =       "3",
  number =       "9",
  pages =        "??--??",
  month =        sep,
  year =         "1998",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Sep 10 14:37:36 MDT 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.holub.com/goodies/javaworld/jw_index.html;
                 http://www.javaworld.com/javaworld/jw-09-1998/jw-09-threads.htm",
  acknowledgement = ack-nhfb,
}

@Article{Holub:1998:PJTb,
  author =       "Allen Holub",
  title =        "Programming {Java} threads in the real world, {Part}
                 2: Common multithreading Pitfalls (Deadlock, etc.)",
  journal =      j-JAVAWORLD,
  volume =       "3",
  number =       "10",
  pages =        "??--??",
  year =         "1998",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Mon Jan 4 06:11:43 MST 1999",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.holub.com/goodies/javaworld/jw_index.html;
                 http://www.javaworld.com/javaworld/jw-10-1998/jw-10-toolbox.htm",
  acknowledgement = ack-nhfb,
}

@Article{Holub:1998:PJTc,
  author =       "Allen Holub",
  title =        "Programming {Java} threads in the real world, {Part}
                 3: Semaphore, Lock\_manager, and Mutex",
  journal =      j-JAVAWORLD,
  volume =       "3",
  number =       "11",
  pages =        "??--??",
  year =         "1998",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Mon Jan 4 06:11:43 MST 1999",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.holub.com/goodies/javaworld/jw_index.html;
                 http://www.javaworld.com/javaworld/jw-11-1998/jw-11-toolbox.htm",
  acknowledgement = ack-nhfb,
}

@Article{Holub:1998:PJTd,
  author =       "Allen Holub",
  title =        "Programming {Java} threads in the real world, {Part}
                 4: Condition Variables and Counting Semaphores",
  journal =      j-JAVAWORLD,
  volume =       "3",
  number =       "12",
  pages =        "??--??",
  year =         "1998",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Mon Jan 4 06:22:03 MST 1999",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.holub.com/goodies/javaworld/jw_index.html;
                 http://www.javaworld.com/javaworld/jw-12-1998/jw-12-toolbox.htm",
  acknowledgement = ack-nhfb,
}

@PhdThesis{Hopper:1998:CFM,
  author =       "Michael A. Hopper",
  title =        "A compiler framework for multithreaded parallel
                 systems",
  type =         "Thesis ({Ph.D.})",
  school =       "School of Electrical and Computer Engineering, Georgia
                 Institute of Technology",
  address =      "Atlanta, GA, USA",
  pages =        "xii + 110",
  year =         "1998",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  annote =       "Directed by William Appelbe.",
  keywords =     "Compilers (Computer programs); Parallel processing
                 (Electronic computers)",
}

@Article{Howes:1998:TPC,
  author =       "Brad Howes",
  title =        "Template processing classes for {Python}",
  journal =      j-DDJ,
  volume =       "23",
  number =       "2",
  pages =        "38, 40, 42, 44--46, 48, 100",
  month =        feb,
  year =         "1998",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu May 21 19:02:04 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/dr-dobbs.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Brad shows how you can embed Python objects in HTML
                 pages using boilerplate template processing classes.
                 Then Python creator Guido van Rossum adds a note on
                 what's new in the just-released Python 1.5.",
  acknowledgement = ack-nhfb,
  classification = "C6130D (Document processing techniques); C6130M
                 (Multimedia); C6160J (Object- oriented databases)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "application program interfaces; BoilerPlate; CGI
                 infrastructure; conditional control; Emacs; embedded
                 HTML text; errors; HTML document template; HTML
                 editing; hypermedia; iterative control; multithreaded
                 CGI service; object database; object paradigm;
                 object-oriented databases; page description languages;
                 persistent objects; placeholders; print statements;
                 Python; run- time values; run-time HTML generation;
                 syntax coloring; tagged locations; template HTML
                 constructs; template processing classes; text regions",
  treatment =    "P Practical",
}

@Article{Itzkovitz:1998:TMA,
  author =       "Ayal Itzkovitz and Assaf Schuster and Lea Shalev",
  title =        "Thread migration and its applications in distributed
                 shared memory systems",
  journal =      j-J-SYST-SOFTW,
  volume =       "42",
  number =       "1",
  pages =        "71--87",
  month =        jul,
  year =         "1998",
  CODEN =        "JSSODM",
  ISSN =         "0164-1212 (print), 1873-1228 (electronic)",
  ISSN-L =       "0164-1212",
  bibdate =      "Thu Dec 17 14:07:21 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of systems and software",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01641212",
}

@Article{Ji:1998:PMM,
  author =       "Minwen Ji and Edward W. Felten and Kai Li",
  title =        "Performance measurements for multithreaded programs",
  journal =      j-SIGMETRICS,
  volume =       "26",
  number =       "1",
  pages =        "161--170",
  month =        jun,
  year =         "1998",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/277858.277900",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Thu Jun 26 11:25:18 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multithreaded programming is an effective way to
                 exploit concurrency, but it is difficult to debug and
                 tune a highly threaded program. This paper describes a
                 performance tool called Tmon for monitoring, analyzing
                 and tuning the performance of multithreaded programs.
                 The performance tool has two novel features: it uses
                 `thread waiting time' as a measure and constructs
                 thread waiting graphs to show thread dependencies and
                 thus performance bottlenecks, and it identifies
                 `semi-busy-waiting' points where CPU cycles are wasted
                 in condition checking and context switching. We have
                 implemented the Tmon tool and, as a case study, we have
                 used it to measure and tune a heavily threaded file
                 system. We used four workloads to tune different
                 aspects of the file system. We were able to improve the
                 file system bandwidth and throughput significantly. In
                 one case, we were able to improve the bandwidth by two
                 orders of magnitude.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
}

@InProceedings{Karamcheti:1998:HLB,
  author =       "Vijay Karamcheti and Andrew A. Chien",
  title =        "A Hierarchical Load-Balancing Framework for Dynamic
                 Multithreaded Computations",
  crossref =     "ACM:1998:SHP",
  pages =        "??--??",
  year =         "1998",
  bibdate =      "Wed Mar 06 06:31:50 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.supercomp.org/sc98/papers/",
  URL =          "http://www.supercomp.org/sc98/TechPapers/sc98_FullAbstracts/Karamcheti553/index.htm",
  acknowledgement = ack-nhfb,
}

@Article{Keckler:1998:EFG,
  author =       "Stephen W. Keckler and William J. Dally and Daniel
                 Maskit and Nicholas P. Carter and Andrew Chang and Whay
                 S. Lee",
  title =        "Exploiting fine-grain thread level parallelism on the
                 {MIT} multi-{ALU} processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "26",
  number =       "3",
  pages =        "306--317",
  month =        jun,
  year =         "1998",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:58 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Krinke:1998:SST,
  author =       "Jens Krinke",
  title =        "Static Slicing of Threaded Programs",
  journal =      j-SIGPLAN,
  volume =       "33",
  number =       "7",
  pages =        "35--42",
  month =        jul,
  year =         "1998",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:17:49 MST 2003",
  bibsource =    "Compendex database; http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Static program slicing is an established method for
                 analyzing sequential programs, especially for program
                 understanding, debugging and testing. Until now, there
                 was no slicing method for threaded programs which
                 handles interference correctly. We present such a
                 method which also calculates more precise static
                 slices. This paper extends the well known structures of
                 the control flow graph and the program dependence graph
                 for threaded programs with interference. This new
                 technique does not require serialization of threaded
                 programs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Technische Universitaet Braunschweig",
  affiliationaddress = "Braunschweig, Ger",
  classification = "723; 723.1; 723.2; 723.5",
  conference =   "Proceedings of the 1998 ACM SIGPLAN\slash SIGSOFT
                 Workshop on Program Analysis for Software Tools and
                 Engineering",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  journalabr =   "ACM SIGPLAN SIGSOFT Workshop Program Anal Software
                 Tools Eng",
  keywords =     "Computer aided software engineering; Computer software
                 selection and evaluation; Control flow graphs; Data
                 flow analysis; Data structures; Program debugging;
                 Static program slicing; Threaded programs",
  meetingaddress = "Montreal, Can",
  meetingdate =  "Jun 16 1998",
  meetingdate2 = "06/16/98",
  sponsor =      "ACM",
}

@Article{Krone:1998:LBN,
  author =       "O. Krone and M. Raab and B. Hirsbrunner",
  title =        "Load Balancing for Network Based Multi-threaded
                 Applications",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1497",
  pages =        "206--??",
  year =         "1998",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Jan 5 08:21:58 MST 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Book{Lewis:1998:MPP,
  author =       "Bil Lewis and Daniel J. Berg",
  title =        "Multithreaded programming with pthreads",
  publisher =    pub-SUN,
  address =      pub-SUN:adr,
  pages =        "xxx + 382",
  year =         "1998",
  ISBN =         "0-13-680729-1 (paperback)",
  ISBN-13 =      "978-0-13-680729-2 (paperback)",
  LCCN =         "QA76.76.T55 L49 1998",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.amazon.com/exec/obidos/ASIN/0136807291/ref=sim_books/002-4892305-5599452;
                 http://www.sun.com/books/catalog/lewis2/index.html",
  acknowledgement = ack-nhfb,
  alttitle =     "Pthreads",
  keywords =     "POSIX (Computer software standard); Threads (Computer
                 programs); UNIX (Computer file)",
}

@Article{Lo:1998:ADW,
  author =       "Jack L. Lo and Luiz Andr{\'e} Barroso and Susan J.
                 Eggers and Kourosh Gharachorloo and Henry M. Levy and
                 Sujay S. Parekh",
  title =        "An analysis of database workload performance on
                 simultaneous multithreaded processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "26",
  number =       "3",
  pages =        "39--50",
  month =        jun,
  year =         "1998",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:58 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@InProceedings{Lu:1998:ONW,
  author =       "Honghui Lu",
  title =        "{OpenMP} on Networks of Workstations",
  crossref =     "ACM:1998:SHP",
  pages =        "??--??",
  year =         "1998",
  bibdate =      "Wed Oct 07 08:50:26 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.supercomp.org/sc98/papers/",
  acknowledgement = ack-nhfb,
}

@Article{Manley:1998:GPT,
  author =       "Kevin T. Manley",
  title =        "General-Purpose Threads with {I/O} Completion Ports",
  journal =      j-CCCUJ,
  volume =       "16",
  number =       "4",
  pages =        "??--??",
  month =        apr,
  year =         "1998",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:15 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/1998/9804/9804toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Divide and conquer is a good strategy for partitioning
                 a large job, provided you don't divide too much.
                 Windows NT helps you guess right.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Mascarenhas:1998:MTP,
  author =       "Edward Mascarenhas and Vernon Rego",
  title =        "Migrant threads on process farms: parallel programming
                 with {Ariadne}",
  journal =      j-CPE,
  volume =       "10",
  number =       "9",
  pages =        "673--698",
  day =          "10",
  month =        aug,
  year =         "1998",
  CODEN =        "CPEXEI",
  ISSN =         "1040-3108",
  ISSN-L =       "1040-3108",
  bibdate =      "Tue Sep 7 06:06:42 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=10008703;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=10008703&PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency, practice and experience",
}

@Article{McManis:1998:DUT,
  author =       "Chuck McManis",
  title =        "In Depth: Using threads with collections, {Part 1}",
  journal =      j-JAVAWORLD,
  volume =       "3",
  number =       "3",
  pages =        "??--??",
  month =        mar,
  year =         "1998",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 08:48:26 MDT 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-03-1998/jw-03-indepth.html",
  acknowledgement = ack-nhfb,
}

@Article{McManis:1998:JDU,
  author =       "Chuck McManis",
  title =        "{Java} In Depth: Using threads with collections, {Part
                 2}",
  journal =      j-JAVAWORLD,
  volume =       "3",
  number =       "6",
  pages =        "??--??",
  month =        jun,
  year =         "1998",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 08:48:26 MDT 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-06-1998/jw-06-indepth.html",
  acknowledgement = ack-nhfb,
}

@Article{Nebro:1998:EMR,
  author =       "A. J. Nebro and E. Pimentel and J. M. Troya",
  title =        "Evaluating a Multithreaded Runtime System for
                 Concurrent Object-Oriented Languages",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1505",
  pages =        "167--??",
  year =         "1998",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Jan 5 08:21:58 MST 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Book{Nichols:1998:PP,
  author =       "Bradford Nichols and Dick Buttlar and Jacqueline
                 Proulx Farrell",
  title =        "Pthreads programming",
  publisher =    pub-ORA,
  address =      pub-ORA:adr,
  pages =        "xvi + 267",
  year =         "1998",
  ISBN =         "1-56592-115-1",
  ISBN-13 =      "978-1-56592-115-3",
  LCCN =         "QA76.642 .N53 1998",
  bibdate =      "Fri May 10 12:18:17 MDT 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "Nutshell handbook",
  acknowledgement = ack-nhfb,
  annote =       "A POSIX standard for better multiprocessing.",
  keywords =     "compilers (computer programs); parallel programming
                 (computer science)",
}

@Article{Piumarta:1998:ODT,
  author =       "Ian Piumarta and Fabio Riccardi",
  title =        "Optimizing Direct-threaded Code by Selective
                 Inlining",
  journal =      j-SIGPLAN,
  volume =       "33",
  number =       "5",
  pages =        "291--300",
  month =        may,
  year =         "1998",
  CODEN =        "SINODQ",
  ISBN =         "0-89791-987-4",
  ISBN-13 =      "978-0-89791-987-6",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:17:47 MST 2003",
  bibsource =    "http://www.acm.org/pubs/contents/proceedings/pldi/277650/index.html;
                 http://www.cs.virginia.edu/pldi98/program.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/pldi/277650/p291-piumarta/",
  acknowledgement = ack-nhfb,
  annote =       "Published as part of the Proceedings of PLDI'98.",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "algorithms; experimentation; languages; performance",
  subject =      "{\bf D.3.4} Software, PROGRAMMING LANGUAGES,
                 Processors, Optimization. {\bf D.3.4} Software,
                 PROGRAMMING LANGUAGES, Processors, Interpreters. {\bf
                 D.3.4} Software, PROGRAMMING LANGUAGES, Processors,
                 Translator writing systems and compiler generators.",
}

@Article{Plauger:1998:SCCl,
  author =       "P. J. Plauger",
  title =        "{Standard C/C++}: Thread Safety",
  journal =      j-CCCUJ,
  volume =       "16",
  number =       "12",
  pages =        "??--??",
  month =        dec,
  year =         "1998",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:18 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/1998/9812/9812toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The C++ Standard doesn't talk about thread safety, but
                 everyone else does.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Pomerantz:1998:CNS,
  author =       "Dave Pomerantz",
  title =        "{C++} Notifiers: Simplifying system development",
  journal =      j-DDJ,
  volume =       "23",
  number =       "8",
  pages =        "26, 28, 30--31, 89--90",
  month =        aug,
  year =         "1998",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Jul 16 13:01:59 MDT 1998",
  bibsource =    "http://www.ddj.com/ddj/1998/1998_08/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/ftp/1998/1998_08/notifier.txt;
                 http://www.ddj.com/ftp/1998/1998_08/notifier.zip",
  abstract =     "Notifiers, also called ``events'' or ``messages,'' are
                 used to pass information anonymously between objects.
                 Dave shows how notifiers can work in C++, using a
                 multithreaded application as an example.",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Reck:1998:TSR,
  author =       "Bill Reck",
  title =        "Thread Synchronization with Reference-Counting
                 Handles",
  journal =      j-CCCUJ,
  volume =       "16",
  number =       "2",
  pages =        "??--??",
  month =        feb,
  year =         "1998",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:14 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/1998/9802/9802toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Often, the best time to protect access to a shared
                 object is right when you reach for it.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Reus:1998:VCO,
  author =       "B. Reus and A. Knapp and P. Cenciarelli and M.
                 Wirsing",
  title =        "Verifying a compiler optimization for Multi-Threaded
                 {Java}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1376",
  pages =        "402--??",
  year =         "1998",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Oct 10 14:40:24 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Saghi:1998:MSH,
  author =       "Gene Saghi and Kirk Reinholtz and Paul A. Savory",
  title =        "A Multithreaded Scheduler for a High-speed Spacecraft
                 Simulator",
  journal =      j-SPE,
  volume =       "28",
  number =       "6",
  pages =        "641--656",
  month =        may,
  year =         "1998",
  CODEN =        "SPEXBL",
  ISSN =         "0038-0644 (print), 1097-024X (electronic)",
  ISSN-L =       "0038-0644",
  bibdate =      "Thu Jul 29 15:11:48 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/spe.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=1802;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=1802&PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Software --- Practice and Experience",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
}

@Article{Schmidt:1998:EAM,
  author =       "Douglas C. Schmidt",
  title =        "Evaluating architectures for multithreaded object
                 request brokers",
  journal =      j-CACM,
  volume =       "41",
  number =       "10",
  pages =        "54--60",
  month =        oct,
  year =         "1998",
  CODEN =        "CACMA2",
  ISSN =         "0001-0782 (print), 1557-7317 (electronic)",
  ISSN-L =       "0001-0782",
  bibdate =      "Tue Oct 6 21:15:42 MDT 1998",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/cacm/1998-41-10/p54-schmidt/",
  acknowledgement = ack-nhfb,
  fjournal =     "Communications of the ACM",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J79",
}

@Article{Seiden:1998:ROM,
  author =       "S. S. Seiden",
  title =        "Randomized Online Multi-threaded Paging",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1432",
  pages =        "264--??",
  year =         "1998",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Oct 10 14:40:24 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Shaw:1998:CIP,
  author =       "Andrew Shaw and Arvind and Kyoo-Chan Cho and
                 Christopher Hill and R. Paul Johnson and John
                 Marshall",
  title =        "A Comparison of Implicitly Parallel Multithreaded and
                 Data-Parallel Implementations of an Ocean Model",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "48",
  number =       "1",
  pages =        "1--51",
  day =          "10",
  month =        jan,
  year =         "1998",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1997.1390",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:04 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1390/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1390/production/pdf;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1390/production/ref",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@PhdThesis{Shaw:1998:CPM,
  author =       "Andrew Shaw",
  title =        "Compiling for parallel multithreaded computation on
                 symmetric multiprocessors",
  type =         "Thesis ({Ph.D.})",
  school =       "Massachusetts Institute of Technology, Department of
                 Electrical Engineering and Computer Science",
  address =      "Cambridge, MA, USA",
  pages =        "149",
  year =         "1998",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Shene:1998:MPI,
  author =       "Chin-Kuang Shene",
  title =        "Multithreaded programming in an introduction to
                 operating systems course",
  journal =      j-SIGCSE,
  volume =       "30",
  number =       "1",
  pages =        "242--246",
  month =        mar,
  year =         "1998",
  CODEN =        "SIGSD3",
  DOI =          "https://doi.org/10.1145/274790.274305",
  ISSN =         "0097-8418 (print), 2331-3927 (electronic)",
  ISSN-L =       "0097-8418",
  bibdate =      "Sat Nov 17 16:56:29 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigcse1990.bib",
  abstract =     "This paper presents a way of teaching multithreaded
                 programming as a component in an introduction to
                 operating systems course. Topics include programming
                 assignments, term projects, and experiences. This paper
                 also suggests future work for overcoming a bottleneck
                 that occurs in the current version of this course.",
  acknowledgement = ack-nhfb,
  fjournal =     "SIGCSE Bulletin (ACM Special Interest Group on
                 Computer Science Education)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J688",
}

@Article{Silc:1998:APC,
  author =       "J. Silc and B. Robic and T. Ungerer",
  title =        "Asynchrony in Parallel Computing: From Dataflow to
                 Multithreading",
  journal =      j-PARALLEL-DIST-COMP-PRACT,
  volume =       "1",
  number =       "1",
  pages =        "??--??",
  month =        "????",
  year =         "1998",
  CODEN =        "????",
  ISSN =         "1097-2803",
  bibdate =      "Fri Dec 19 08:14:11 MST 2003",
  bibsource =    "http://www.cs.okstate.edu/~pdcp/vols/vol01/vol01no1.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.cs.okstate.edu/~pdcp/vols/vol01/vol01no1abs.html#silc",
  acknowledgement = ack-nhfb,
  fjournal =     "PDCP: Parallel and Distributed Computing Practices",
}

@Article{Skillicorn:1998:MLP,
  author =       "David B. Skillicorn and Domenico Talia",
  title =        "Models and languages for parallel computation",
  journal =      j-COMP-SURV,
  volume =       "30",
  number =       "2",
  pages =        "123--169",
  month =        jun,
  year =         "1998",
  CODEN =        "CMSVAN",
  ISSN =         "0360-0300 (print), 1557-7341 (electronic)",
  ISSN-L =       "0360-0300",
  bibdate =      "Fri Sep 11 08:35:51 MDT 1998",
  bibsource =    "http://www.acm.org/pubs/contents/journals/surveys/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/surveys/1998-30-2/p123-skillicorn/",
  abstract =     "We survey parallel programming models and languages
                 using six criteria to assess their suitability for
                 realistic portable parallel programming. We argue that
                 an ideal model should be easy to program, should have a
                 software development methodology, should be
                 architecture-independent, should be easy to understand,
                 should guarantee performance, and should provide
                 accurate information about the cost of programs. These
                 criteria reflect our belief that developments in
                 parallelism must be driven by a parallel software
                 industry based on portability and efficiency. We
                 consider programming models in six categories,
                 depending on the level of abstraction they provide.
                 Those that are very abstract conceal even the presence
                 of parallelism at the software level. Such models make
                 software easy to build and port, but efficient and
                 predictable performance is usually hard to achieve. At
                 the other end of the spectrum, low-level models make
                 all of the messy issues of parallel programming
                 explicit (how many threads, how to place them, how to
                 express communication, and how to schedule
                 communication), so that software is hard to build and
                 not very portable, but is usually efficient. Most
                 recent models are near the center of this spectrum,
                 exploring the best tradeoffs between expressiveness and
                 performance. A few models have achieved both
                 abstractness and efficiency. Both kinds of models raise
                 the possibility of parallelism as part of the
                 mainstream of computing.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Computing Surveys",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J204",
  keywords =     "languages; performance; theory",
  subject =      "{\bf C.4} Computer Systems Organization, PERFORMANCE
                 OF SYSTEMS. {\bf D.1} Software, PROGRAMMING TECHNIQUES.
                 {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language
                 Classifications.",
}

@InProceedings{Smith:1998:SIF,
  author =       "Geoffrey Smith and Dennis Volpano",
  title =        "Secure information flow in a multi-threaded imperative
                 language",
  crossref =     "ACM:1998:CRP",
  pages =        "355--364",
  year =         "1998",
  bibdate =      "Mon May 3 12:57:52 MDT 1999",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/plan/268946/p355-smith/",
  acknowledgement = ack-nhfb,
  keywords =     "algorithms; languages; security; theory",
  subject =      "{\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS
                 OF PROGRAMS, Studies of Program Constructs, Type
                 structure. {\bf D.3.0} Software, PROGRAMMING LANGUAGES,
                 General. {\bf D.2.0} Software, SOFTWARE ENGINEERING,
                 General, Protection mechanisms. {\bf D.1.3} Software,
                 PROGRAMMING TECHNIQUES, Concurrent Programming.",
}

@Article{Tennberg:1998:CAD,
  author =       "Patrick Tennberg",
  title =        "Creating Active Data Types via Multithreading",
  journal =      j-CCCUJ,
  volume =       "16",
  number =       "1",
  pages =        "??--??",
  month =        jan,
  year =         "1998",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:13 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/1998/9801/9801toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "If you need multiple active agents in a program, you
                 need multiple threads to synchronize them.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Thitikamol:1998:PNM,
  author =       "K. Thitikamol and P. Keleher",
  title =        "Per-node multithreading and remote latency",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "47",
  number =       "4",
  pages =        "414--426",
  month =        apr,
  year =         "1998",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/12.675711",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Wed Jul 6 09:35:54 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=675711",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@InProceedings{Thornley:1998:SSH,
  author =       "John Thornley and K. Mani Chandy and Hiroshi Ishii",
  title =        "A System for Structured High-Performance Multithreaded
                 Programming in {Windows NT}",
  crossref =     "USENIX:1998:PUWa",
  pages =        "??--??",
  year =         "1998",
  bibdate =      "Fri Oct 18 07:49:55 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.usenix.org/publications/library/proceedings/usenix-nt98/thornley.html;
                 http://www.usenix.org/publications/library/proceedings/usenix-nt98/thornley_slides",
  acknowledgement = ack-nhfb,
}

@Article{Tsai:1998:POC,
  author =       "J.-Y. Tsai and Z. Jiang and P.-C. Yew",
  title =        "Program Optimization for Concurrent Multithreaded
                 Architectures",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1366",
  pages =        "146--??",
  year =         "1998",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Oct 10 14:40:24 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Tullsen:1998:RSM,
  author =       "Dean M. Tullsen and Susan J. Eggers and Henry M.
                 Levy",
  title =        "Retrospective: {Simultaneous} multithreading:
                 maximizing on-chip parallelism",
  crossref =     "ACM:1998:PAI",
  pages =        "115--116",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Tullsen:1998:SMM,
  author =       "Dean M. Tullsen and Susan J. Eggers and Henry M.
                 Levy",
  title =        "Simultaneous multithreading: maximizing on-chip
                 parallelism",
  crossref =     "ACM:1998:PAI",
  pages =        "533--544",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@Article{Venners:1998:DTS,
  author =       "Bill Venners",
  title =        "Design for thread safety",
  journal =      j-JAVAWORLD,
  volume =       "3",
  number =       "8",
  pages =        "??--??",
  month =        aug,
  year =         "1998",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Sep 10 14:37:30 MDT 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-08-1998/jw-08-techniques.html",
  acknowledgement = ack-nhfb,
}

@InProceedings{Vishkin:1998:EMT,
  author =       "Uzi Vishkin and Shlomit Dascal and Efraim Berkovich
                 and Joseph Nuzman",
  booktitle =    "SPAA '98: 10th Annual ACM Symposium on Parallel
                 Algorithms and Architectures, June 28--July 2, 1998,
                 Puerto Vallarta, Mexico",
  title =        "Explicit multi-threading ({XMT}) bridging models for
                 instruction parallelism (extended abstract)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  year =         "1998",
  DOI =          "https://doi.org/10.1145/277651.277680",
  ISBN =         "0-89791-989-0",
  ISBN-13 =      "978-0-89791-989-0",
  LCCN =         "QA76.58 .A26 1998",
  bibdate =      "Fri Jul 27 05:37:45 2001",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "ACM order number 417980.",
  URL =          "http://delivery.acm.org/10.1145/280000/277680/p140-vishkin.pdf",
  acknowledgement = ack-nhfb,
  bookpages =    "viii + 310",
  keywords =     "IA-64",
}

@Article{Wallace:1998:TMP,
  author =       "Steven Wallace and Brad Calder and Dean M. Tullsen",
  title =        "Threaded multiple path execution",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "26",
  number =       "3",
  pages =        "238--249",
  month =        jun,
  year =         "1998",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:58 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@PhdThesis{Weissman:1998:ATT,
  author =       "Boris Weissman",
  title =        "Active threads: towards efficient fine-grained
                 parallelism in object-oriented systems",
  type =         "Thesis ({Ph.D. in Computer Science})",
  school =       "Department of Computer Science, University of
                 California, Berkeley",
  address =      "Berkeley, CA, USA",
  year =         "1998",
  LCCN =         "T7.6.1998 W457",
  bibdate =      "Fri May 10 12:18:17 MDT 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "dissertations, academic -- UCB -- Computer Science --
                 1991--2000; University of California, Berkeley, Dept.
                 Of Computer Science -- dissertations",
}

@Article{Weissman:1998:PCS,
  author =       "Boris Weissman",
  title =        "Performance Counters and State Sharing Annotations: a
                 Unified Approach to Thread Locality",
  journal =      j-SIGPLAN,
  volume =       "33",
  number =       "11",
  pages =        "127--138",
  month =        nov,
  year =         "1998",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:17:54 MST 2003",
  bibsource =    "http://portal.acm.org/; http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Co-published in {\em Operating Systems Review}, {\bf
                 32}(5).",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/asplos/291069/p127-weissman/",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "design; experimentation; measurement; performance;
                 theory",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Scheduling. {\bf F.1.2} Theory of
                 Computation, COMPUTATION BY ABSTRACT DEVICES, Modes of
                 Computation, Parallelism and concurrency. {\bf D.4.8}
                 Software, OPERATING SYSTEMS, Performance, Simulation.
                 {\bf G.3} Mathematics of Computing, PROBABILITY AND
                 STATISTICS, Markov processes.",
}

@Article{Wilde:1998:RES,
  author =       "Norman Wilde and Christopher Casey and Joe Vandeville
                 and Gary Trio and Dick Hotz",
  title =        "Reverse engineering of software threads: a design
                 recovery technique for large multi-process systems",
  journal =      j-J-SYST-SOFTW,
  volume =       "43",
  number =       "1",
  pages =        "11--17",
  month =        oct,
  year =         "1998",
  CODEN =        "JSSODM",
  ISSN =         "0164-1212 (print), 1873-1228 (electronic)",
  ISSN-L =       "0164-1212",
  bibdate =      "Wed Dec 16 08:24:49 MST 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of systems and software",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01641212",
}

@Article{Wilmot:1998:DTM,
  author =       "Dick Wilmot",
  title =        "Data threaded microarchitecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "26",
  number =       "5",
  pages =        "22--32",
  month =        dec,
  year =         "1998",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:21 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Zhou:1998:LST,
  author =       "Honbo Zhou and Al Geist",
  title =        "{LPVM}: a step towards multithread {PVM}",
  journal =      j-CPE,
  volume =       "10",
  number =       "5",
  pages =        "407--416",
  day =          "25",
  month =        apr,
  year =         "1998",
  CODEN =        "CPEXEI",
  ISSN =         "1040-3108",
  ISSN-L =       "1040-3108",
  bibdate =      "Tue Sep 7 06:06:40 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=5385;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=5385&PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency, practice and experience",
}

@Article{Anonymous:1999:BST,
  author =       "Anonymous",
  title =        "Bookshelf: Surviving the Top Ten Challenges of
                 Software Development; The {Year 2000} Crisis; The
                 Continuing Challenge; Software Project Survival Guide;
                 Object-Oriented Multithreading Using {C++}",
  journal =      j-IEEE-SOFTWARE,
  volume =       "16",
  number =       "1",
  pages =        "114--??",
  month =        jan # "\slash " # feb,
  year =         "1999",
  CODEN =        "IESOEG",
  ISSN =         "0740-7459 (print), 1937-4194 (electronic)",
  ISSN-L =       "0740-7459",
  bibdate =      "Thu Apr 1 16:52:57 MST 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeesoft.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://dlib.computer.org/so/books/so1999/pdf/s1114.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Software",
  journal-URL =  "http://www.computer.org/portal/web/csdl/magazines/software",
}

@Article{Antoniu:1999:ETT,
  author =       "G. Antoniu and L. Bouge and R. Namyst",
  title =        "An Efficient and Transparent Thread Migration Scheme
                 in the {PM2} Runtime System",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1586",
  pages =        "496--??",
  year =         "1999",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon Sep 13 16:57:02 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Azagury:1999:NIR,
  author =       "Alain Azagury and Elliot K. Kolodner and Erez
                 Petrank",
  title =        "A Note on the Implementation of Replication-Based
                 Garbage Collection for Multithreaded Applications and
                 Multiprocessor Environments",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "9",
  number =       "3",
  pages =        "391--??",
  month =        sep,
  year =         "1999",
  CODEN =        "PPLTEE",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Thu Jan 6 12:02:35 MST 2005",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Blumofe:1999:SMC,
  author =       "Robert D. Blumofe and Charles E. Leiserson",
  title =        "Scheduling multithreaded computations by work
                 stealing",
  journal =      j-J-ACM,
  volume =       "46",
  number =       "5",
  pages =        "720--748",
  month =        sep,
  year =         "1999",
  CODEN =        "JACOAH",
  ISSN =         "0004-5411 (print), 1557-735X (electronic)",
  ISSN-L =       "0004-5411",
  bibdate =      "Sun Jan 23 12:19:49 MST 2000",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/jacm/1999-46-5/p720-blumofe/",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of the ACM",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J401",
}

@Article{Bouge:1999:ECM,
  author =       "L. Bouge and J.-F. Mehaut and R. Namyst",
  title =        "Efficient Communications in Multithreaded Runtime
                 Systems",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1586",
  pages =        "468--482",
  year =         "1999",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Mar 16 07:33:54 2001",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Broadman:1999:ECM,
  author =       "Allen Broadman and Eric Shaw",
  title =        "Executing a Class Member in Its Own Thread",
  journal =      j-CCCUJ,
  volume =       "17",
  number =       "12",
  pages =        "??--??",
  month =        dec,
  year =         "1999",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:24 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/1999/9912/9912toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Creating a separate thread to execute a member
                 function call is a messy business that's often
                 necessary. It's a task well worth encapsulating.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Cappello:1999:PNB,
  author =       "F. Cappello and O. Richard and D. Etiemble",
  title =        "Performance of the {NAS} Benchmarks on a Cluster of
                 {SMP PCs} Using a Parallelization of the {MPI} Programs
                 with {OpenMP}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1662",
  pages =        "339--350",
  year =         "1999",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon Sep 13 16:57:02 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/lncs1999b.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Cenciarelli:1999:EBS,
  author =       "P. Cenciarelli and A. Knapp and B. Reus and M.
                 Wirsing",
  title =        "An Event-Based Structural Operational Semantics of
                 Multi-Threaded {Java}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1523",
  pages =        "157--??",
  year =         "1999",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon Sep 13 16:57:02 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Chappell:1999:SSM,
  author =       "Robert S. Chappell and Jared Stark and Sangwook P. Kim
                 and Steven K. Reinhardt and Yale N. Patt",
  title =        "Simultaneous subordinate microthreading {(SSMT)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "27",
  number =       "2",
  pages =        "186--195",
  month =        may,
  year =         "1999",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:49 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Dascal:1999:ELR,
  author =       "Shlomit Dascal and Uzi Vishkin",
  title =        "Experiments with List Ranking for Explicit
                 Multi-Threaded {(XMT)} Instruction Parallelism
                 (Extended Abstract)",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1668",
  pages =        "43--??",
  year =         "1999",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon Feb 4 12:03:08 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1668.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1668/16680043.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1668/16680043.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{delaPuente:1999:RTP,
  author =       "Juan A. de la Puente and Jos{\'e} F. Ruiz and
                 Jes{\'u}s M. Gonz{\'a}lez-Barahona",
  title =        "Real-Time Programming with {GNAT}: Specialized Kernels
                 versus {POSIX} Threads",
  journal =      j-SIGADA-LETTERS,
  volume =       "19",
  number =       "2",
  pages =        "73--77",
  month =        jun,
  year =         "1999",
  CODEN =        "AALEE5",
  ISSN =         "1094-3641 (print), 1557-9476 (electronic)",
  ISSN-L =       "1094-3641",
  bibdate =      "Tue Aug 31 07:04:20 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGADA Ada Letters",
}

@Article{DeWitt:1999:PTL,
  author =       "Anthony DeWitt and Thomas Gross",
  title =        "The potential of thread-level speculation based on
                 value profiling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "27",
  number =       "1",
  pages =        "22--22",
  month =        mar,
  year =         "1999",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:35 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Duda:1999:BVT,
  author =       "Kenneth J. Duda and David R. Cheriton",
  title =        "Borrowed-virtual-time {(BVT)} scheduling: supporting
                 latency-sensitive threads in a general-purpose
                 scheduler",
  journal =      j-OPER-SYS-REV,
  volume =       "33",
  number =       "5",
  pages =        "261--276",
  month =        dec,
  year =         "1999",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:55 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@InProceedings{Garcia:1999:MMI,
  author =       "F. Garcia and A. Calderon and J. Carretero",
  title =        "{MiMPI}: a multithread-safe implementation of
                 {MPI}",
  crossref =     "Dongarra:1999:RAP",
  number =       "1697",
  pages =        "207--214",
  year =         "1999",
  bibdate =      "Thu Dec 9 06:08:35 MST 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Greiner:1999:PTE,
  author =       "John Greiner and Guy E. Blelloch",
  title =        "A provably time-efficient parallel implementation of
                 full speculation",
  journal =      j-TOPLAS,
  volume =       "21",
  number =       "2",
  pages =        "240--285",
  month =        mar,
  year =         "1999",
  CODEN =        "ATPSDT",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Tue Sep 26 10:12:58 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/toplas/1999-21-2/p240-greiner/",
  abstract =     "Speculative evaluation, including leniency and
                 futures, is often used to produce high degrees of
                 parallelism. Understanding the performance
                 characteristics of such evaluation, however, requires
                 having a detailed understanding of the implementation.
                 For example, the particular implementation technique
                 used to suspend and reactivate threads can have an
                 asymptotic effect on performance. With the goal of
                 giving the users some understanding of performance
                 without requiring them to understand the
                 implementation, we present a provable implementation
                 bound for a language based on speculative evaluation.
                 The idea is (1) to supply the users with a semantics
                 for a language that defines abstract costs for
                 measuring or analyzing the performance of computations,
                 (2) to supply the users with a mapping of these costs
                 onto runtimes on various machine models, and (3) to
                 describe an implementation strategy of the language and
                 prove that it meets these mappings. For this purpose we
                 consider a simple language based on speculative
                 evaluation. For every computation, the semantics of the
                 language returns a directed acyclic graph (DAG) in
                 which each node represents a unit of computation, and
                 each edge represents a dependence. We then describe an
                 implementation strategy of the language and show that
                 any computation with $w$ work (the number of nodes in
                 the DAG) and $d$ depth (the length of the longest path
                 in the DAG) will run on a $p$-processor PRAM in $ O(w /
                 p + d \log p) $ time. The bounds are work efficient
                 (within a constant factor of linear speedup) when there
                 is sufficient parallelism, $ w / d \geq p \log p $.
                 These are the first time bounds we know of for
                 languages with speculative evaluation. The main
                 challenge is in parallelizing the necessary queuing
                 operations on suspended threads.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  generalterms = "Languages; Performance; Theory",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
  keywords =     "abstract machines; parallel languages; profiling
                 semantics; speculation; threads",
  subject =      "Software --- Software Engineering --- Metrics (D.2.8);
                 Software --- Programming Languages --- Language
                 Classifications (D.3.2): {\bf Data-flow languages};
                 Software --- Programming Languages --- Language
                 Classifications (D.3.2); Theory of Computation ---
                 Computation by Abstract Devices --- Modes of
                 Computation (F.1.2): {\bf Parallelism and concurrency};
                 Theory of Computation --- Computation by Abstract
                 Devices --- Modes of Computation (F.1.2); Theory of
                 Computation --- Logics and Meanings of Programs ---
                 Specifying and Verifying and Reasoning about Programs
                 (F.3.1)",
}

@Article{Gu:1999:EJT,
  author =       "Yan Gu and B. S. Lee and Wentong Cai",
  title =        "Evaluation of {Java} thread performance on two
                 different multithreaded kernels",
  journal =      j-OPER-SYS-REV,
  volume =       "33",
  number =       "1",
  pages =        "34--46",
  month =        jan,
  year =         "1999",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:37 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Harrington:1999:WMM,
  author =       "John Harrington",
  title =        "{Win32} Multithreading Made Easy",
  journal =      j-CCCUJ,
  volume =       "17",
  number =       "8",
  pages =        "48, 50--52, 54--56",
  month =        aug,
  year =         "1999",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:22 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/1999/9908/9908toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multithreading logic is hard to write and hard to
                 maintain. So keep it simple and separate.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Holub:1999:PJTa,
  author =       "Allen Holub",
  title =        "Programming {Java} threads in the real world, {Part}
                 5: Timers",
  journal =      j-JAVAWORLD,
  volume =       "4",
  number =       "2",
  pages =        "??--??",
  month =        feb,
  year =         "1999",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Mar 04 12:56:16 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.holub.com/goodies/javaworld/jw_index.html",
  acknowledgement = ack-nhfb,
}

@Article{Holub:1999:PJTb,
  author =       "Allen Holub",
  title =        "Programming {Java} threads in the real world, {Part}
                 6: {Mach '99}: Observer and the Mysteries of the
                 {AWTEventMulticaster}",
  journal =      j-JAVAWORLD,
  volume =       "4",
  number =       "3",
  pages =        "??--??",
  month =        mar,
  year =         "1999",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Mar 04 12:56:16 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.holub.com/goodies/javaworld/jw_index.html",
  acknowledgement = ack-nhfb,
}

@Article{Jonsson:1999:NPS,
  author =       "J. Jonsson and H. Loenn and K. G. Shin",
  title =        "Non-preemptive Scheduling of Real-Time Threads on
                 Multi-Level-Context Architectures",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1586",
  pages =        "363--??",
  year =         "1999",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon Sep 13 16:57:02 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Karamcheti:1999:ASM,
  author =       "Vijay Karamcheti and Andrew A. Chien",
  title =        "Architectural Support and Mechanisms for Object
                 Caching in Dynamic Multithreaded Computations",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "58",
  number =       "2",
  pages =        "260--300",
  month =        aug,
  year =         "1999",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1999.1555",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:08 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1555/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1555/production/pdf;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1555/production/ref",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Kekckler:1999:CEH,
  author =       "S. W. Keckler and A. Chang and W. S. Lee and S.
                 Chatterjee and W. J. Dally",
  title =        "Concurrent event handling through multithreading",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "48",
  number =       "9",
  pages =        "903--916",
  month =        sep,
  year =         "1999",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/12.795220",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Wed Jul 6 08:46:59 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=795220",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
  remark =       "Citation key retains the legacy misspelling
                 ``Kekckler'' for backward compatibility; the first
                 author is Stephen W. Keckler, and the garbled
                 ``W. S. L. S. Chatterjee'' was two authors (W. S. Lee
                 and S. Chatterjee).",
}

@Article{Krishnan:1999:CMA,
  author =       "V. Krishnan and J. Torrellas",
  title =        "A chip-multiprocessor architecture with speculative
                 multithreading",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "48",
  number =       "9",
  pages =        "866--880",
  month =        sep,
  year =         "1999",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/12.795218",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Wed Jul 6 08:46:59 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=795218",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Kusakabe:1999:INS,
  author =       "S. Kusakabe and K. Inenaga and M. Amamiya and X.
                 Tang",
  title =        "Implementing a Non-strict Functional Programming
                 Language on a Threaded Architecture",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1586",
  pages =        "138--??",
  year =         "1999",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon Sep 13 16:57:02 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Kwak:1999:EMC,
  author =       "H. Kwak and B. Lee and A. R. Hurson and Suk-Han Yoon
                 and Woo-Jong Hahn",
  title =        "Effects of multithreading on cache performance",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "48",
  number =       "2",
  pages =        "176--184",
  month =        feb,
  year =         "1999",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/12.752659",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Wed Jul 6 08:46:56 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=752659",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Lo:1999:SDR,
  author =       "J. L. Lo and S. S. Parekh and S. J. Eggers and H. M.
                 Levy and D. M. Tullsen",
  title =        "Software-Directed Register Deallocation for
                 Simultaneous Multithreaded Processors",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "10",
  number =       "9",
  pages =        "922--??",
  month =        sep,
  year =         "1999",
  CODEN =        "ITDSEO",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Oct 12 18:48:31 MDT 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://dlib.computer.org/td/books/td1999/pdf/l0922.pdf;
                 http://www.computer.org/tpds/td1999/l0922abs.htm",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Lo:1999:TCO,
  author =       "Jack L. Lo and Susan J. Eggers and Henry M. Levy and
                 Sujay S. Parekh and Dean M. Tullsen",
  title =        "Tuning Compiler Optimizations for Simultaneous
                 Multithreading",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "27",
  number =       "6",
  pages =        "477--503",
  month =        dec,
  year =         "1999",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1023/A:1018780200739",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 6 16:39:54 MDT 2005",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=27&issue=6;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 OCLC Contents1st database",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=27&issue=6&spage=477",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  remark =       "Special Issue: {30th Annual ACM\slash IEEE
                 International Symposium on Microarchitecture}, Part
                 {II}.",
}

@Article{Lundberg:1999:PBS,
  author =       "Lars Lundberg",
  title =        "Predicting and Bounding the Speedup of Multithreaded
                 {Solaris} Programs",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "57",
  number =       "3",
  pages =        "322--333",
  month =        jun,
  year =         "1999",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1999.1536",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:07 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1536/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1536/production/pdf;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1536/production/ref",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Manley:1999:IPT,
  author =       "Kevin Manley",
  title =        "Improving Performance with Thread-Private Heaps",
  journal =      j-CCCUJ,
  volume =       "17",
  number =       "9",
  pages =        "50--??",
  month =        sep,
  year =         "1999",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:22 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/1999/9909/9909toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Threads interact in the darndest ways, but conflicts
                 with a common heap are particularly pernicious. Luckily
                 they can be avoided.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Marcuello:1999:EST,
  author =       "P. Marcuello and A. Gonzalez",
  title =        "Exploiting Speculative Thread-Level Parallelism on a
                 {SMT} Processor",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1593",
  pages =        "754--??",
  year =         "1999",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon Sep 13 16:57:02 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Masney:1999:IMT,
  author =       "Brian Masney",
  title =        "Introduction to Multi-Threaded Programming",
  journal =      j-LINUX-J,
  volume =       "61",
  pages =        "??--??",
  month =        may,
  year =         "1999",
  CODEN =        "LIJOFX",
  ISSN =         "1075-3583 (print), 1938-3827 (electronic)",
  ISSN-L =       "1075-3583",
  bibdate =      "Thu Jun 3 06:34:02 MDT 1999",
  bibsource =    "http://www.linuxjournal.com/issue61/index.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "A description of thread programming basics.",
  acknowledgement = ack-nhfb,
  fjournal =     "Linux Journal",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J508",
}

@Article{Mendelson:1999:DAM,
  author =       "Avi Mendelson and Michael Bekerman",
  title =        "Design Alternatives of Multithreaded Architecture",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "27",
  number =       "3",
  pages =        "161--193",
  month =        jun,
  year =         "1999",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1023/A:1018733528538",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 6 16:39:53 MDT 2005",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=27&issue=3;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 OCLC Contents1st database",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=27&issue=3&spage=161",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@InProceedings{Mitchell:1999:ILP,
  author =       "Nicholas Mitchell and Larry Carter and Jeanne Ferrante
                 and Dean Tullsen",
  title =        "Instruction-level Parallelism vs. Thread-level
                 Parallelism on Simultaneous Multi-threading
                 Processors",
  crossref =     "ACM:1999:SPO",
  pages =        "??--??",
  year =         "1999",
  bibdate =      "Thu Feb 24 09:02:57 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sc99.org/techpapers/",
  acknowledgement = ack-nhfb,
}

@Article{Moody:1999:STT,
  author =       "Scott Arthur Moody and Samuel Kwok and Dale Karr",
  title =        "{SimpleGraphics}: {Tcl\slash Tk} visualization of
                 real-time multi-threaded and distributed applications",
  journal =      j-SIGADA-LETTERS,
  volume =       "19",
  number =       "2",
  pages =        "60--66",
  month =        jun,
  year =         "1999",
  CODEN =        "AALEE5",
  ISSN =         "1094-3641 (print), 1557-9476 (electronic)",
  ISSN-L =       "1094-3641",
  bibdate =      "Sat Aug 9 09:06:06 MDT 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGAda Ada Letters",
}

@Article{Narlikar:1999:SES,
  author =       "Girija J. Narlikar and Guy E. Blelloch",
  title =        "Space-Efficient Scheduling of Nested Parallelism",
  journal =      j-TOPLAS,
  volume =       "21",
  number =       "1",
  pages =        "138--173",
  month =        jan,
  year =         "1999",
  CODEN =        "ATPSDT",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Tue Sep 26 10:12:58 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/toplas/1999-21-1/p138-narlikar/",
  abstract =     "Many of today's high-level parallel languages support
                 dynamic, fine-grained parallelism. These languages
                 allow the user to expose all the parallelism in the
                 program, which is typically of a much higher degree
                 than the number of processors. Hence an efficient
                 scheduling algorithm is required to assign computations
                 to processors at runtime. Besides having low overheads
                 and good load balancing, it is important for the
                 scheduling algorithm to minimize the space usage of the
                 parallel program. This article presents an on-line
                 scheduling algorithm that is provably space efficient
                 and time efficient for nested-parallel languages. For a
                 computation with depth $D$ and serial space requirement
                 $ S_1 $, the algorithm generates a schedule that
                 requires at most $ S_1 + O(K \cdot D \cdot p) $ space
                 (including scheduler space) on $p$ processors. Here,
                 $K$ is a user-adjustable runtime parameter specifying
                 the net amount of memory that a thread may allocate
                 before it is preempted by the scheduler. Adjusting the
                 value of $K$ provides a trade-off between the running
                 time and the memory requirement of a parallel
                 computation. To allow the scheduler to scale with the
                 number of processors we also parallelize the scheduler
                 and analyze the space and time bounds of the
                 computation to include scheduling costs. In addition to
                 showing that the scheduling algorithm is space and time
                 efficient in theory, we demonstrate that it is
                 effective in practice. We have implemented a runtime
                 system that uses our algorithm to schedule lightweight
                 parallel threads. The results of executing parallel
                 programs on this system show that our scheduling
                 algorithm significantly reduces memory usage compared
                 to previous techniques, without compromising
                 performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  generalterms = "Algorithms; Languages; Performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
  keywords =     "dynamic scheduling; multithreading; nested
                 parallelism; parallel language implementation; space
                 efficiency",
  subject =      "Software --- Programming Techniques --- Concurrent
                 Programming (D.1.3): {\bf Parallel programming};
                 Software --- Programming Languages --- Processors
                 (D.3.4): {\bf Run-time environments}; Theory of
                 Computation --- Analysis of Algorithms and Problem
                 Complexity --- General (F.2.0)",
}

@Article{Nemeth:1999:MLK,
  author =       "Z. Nemeth and H. Tomiyasu and P. Kacsuk and M.
                 Amamiya",
  title =        "Multithreaded {LOGFLOW} on {KUMP\slash D}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1615",
  pages =        "320--??",
  year =         "1999",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon Sep 13 16:57:02 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/lncs1999b.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Nevison:1999:SSC,
  author =       "Christopher H. Nevison",
  title =        "Seminar: safe concurrent programming in {Java} with
                 {CSP}",
  journal =      j-SIGCSE,
  volume =       "31",
  number =       "1",
  pages =        "367",
  month =        mar,
  year =         "1999",
  CODEN =        "SIGSD3",
  DOI =          "https://doi.org/10.1145/384266.299817",
  ISSN =         "0097-8418 (print), 2331-3927 (electronic)",
  ISSN-L =       "0097-8418",
  bibdate =      "Sat Nov 17 16:56:36 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigcse1990.bib",
  abstract =     "We present methods for safe and correct programming
                 for concurrent threads in Java. The methods are based
                 on the principles of Communicating Sequential Processes
                 (CSP). We demonstrate the use of tools which provide
                 the structure of CSP within Java to avoid some of the
                 pitfalls of multithreaded programming using monitors,
                 the primitive synchronization tool in Java. Several
                 examples illustrate the use of these tools.",
  acknowledgement = ack-nhfb,
  fjournal =     "SIGCSE Bulletin (ACM Special Interest Group on
                 Computer Science Education)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J688",
}

@Book{Oaks:1999:JT,
  author =       "Scott Oaks and Henry Wong",
  title =        "{Java} threads",
  publisher =    pub-ORA,
  address =      pub-ORA:adr,
  edition =      "Second",
  pages =        "xiii + 319",
  year =         "1999",
  ISBN =         "1-56592-418-5",
  ISBN-13 =      "978-1-56592-418-5",
  LCCN =         "QA76.73.J38 O25 1999",
  bibdate =      "Fri May 10 12:18:17 MDT 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "Java series",
  acknowledgement = ack-nhfb,
  keywords =     "Java (computer program language); threads (computer
                 programs)",
}

@Article{Pant:1999:TCP,
  author =       "Lalit Pant",
  title =        "Thread Communication In Parallel Algorithms: Enabling
                 efficient interaction between threads",
  journal =      j-DDJ,
  volume =       "24",
  number =       "4",
  pages =        "32, 34, 36, 38--39",
  month =        apr,
  year =         "1999",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Wed Mar 3 06:30:11 MST 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/ftp/1999/1999_04/parallel.txt",
  abstract =     "With the increasing availability of multiprocessing
                 hardware, thread-based parallel algorithms are becoming
                 more and more important. Lalit presents thread
                 communication mechanisms for use within parallel
                 algorithms. Additional resources include parallel.txt
                 (listings).",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Book{Pham:1999:MPW,
  author =       "Thuan Q. Pham and Pankaj K. Garg",
  title =        "Multithreaded Programming with {Win32}",
  publisher =    pub-PHPTR,
  address =      pub-PHPTR:adr,
  pages =        "xix + 219",
  year =         "1999",
  ISBN =         "0-13-010912-6",
  ISBN-13 =      "978-0-13-010912-5",
  LCCN =         "QA76.642.P518 1998",
  bibdate =      "Thu Jan 21 18:58:23 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Includes CD-ROM.",
  URL =          "http://www.phptr.com/ptrbooks/ptr_0130109126.html",
  acknowledgement = ack-nhfb,
  publishersnote = "If you want to deliver NT applications with maximum
                 performance, efficiency and robustness, you need to
                 master multithreading. Multithreaded Programming with
                 Win32 brings together every Win32 multithreading
                 technique and concept you must know --- all brilliantly
                 explained with practical examples and sample code.",
  xxnote =       "Check pages and year??",
}

@Article{Plauger:1999:SCCg,
  author =       "P. J. Plauger",
  title =        "{Standard C/C++}: a Better Red-Black Tree",
  journal =      j-CCCUJ,
  volume =       "17",
  number =       "7",
  pages =        "10--??",
  month =        jul,
  year =         "1999",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:21 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/1999/9907/9907toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The C++ Standard is silent about issues such as thread
                 safety and DLL safety, but customers and reviewers
                 certainly aren't.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Richards:1999:ALT,
  author =       "Etienne Richards",
  title =        "Adding Level-2 Thread Safety to Existing Objects",
  journal =      j-CCCUJ,
  volume =       "17",
  number =       "2",
  pages =        "??--??",
  month =        feb,
  year =         "1999",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:19 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/1999/9902/9902toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The code required to share an object among multiple
                 threads is tedious and error prone. But it can be
                 neatly encapsulated.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Ringle:1999:SCT,
  author =       "Jonathan Ringle",
  title =        "Singleton Creation the Thread-safe Way",
  journal =      j-CCCUJ,
  volume =       "17",
  number =       "10",
  pages =        "??--??",
  month =        oct,
  year =         "1999",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:23 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/1999/9910/9910toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Singletons avoid problems with order of construction,
                 at the cost of more problems for multithreading.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Rodgers:1999:TSN,
  author =       "Jeremy B. Rodgers and Rhonda Kay Gaede and Jeffrey H.
                 Kulick",
  title =        "{IN-Tune}: an {In-Situ} non-invasive performance
                 tuning tool for multi-threaded {Linux} on symmetric
                 multiprocessing {Pentium} workstations",
  journal =      j-SPE,
  volume =       "29",
  number =       "9",
  pages =        "775--792",
  day =          "25",
  month =        jul,
  year =         "1999",
  CODEN =        "SPEXBL",
  ISSN =         "0038-0644 (print), 1097-024X (electronic)",
  ISSN-L =       "0038-0644",
  bibdate =      "Thu Jul 29 15:12:27 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=62501865;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=62501865&PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Software---Practice and Experience",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
}

@TechReport{Roe:1999:PMI,
  author =       "Kevin Roe and Piyush Mehrotra",
  title =        "Parallelization of a multigrid incompressible viscous
                 cavity flow solver using {OpenMP}",
  type =         "{NASA} contractor report",
  number =       "NASA\slash CR-1999-209551",
  institution =  inst-NLRC,
  address =      inst-NLRC:adr,
  pages =        "????",
  year =         "1999",
  bibdate =      "Thu Mar 16 07:20:02 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Also ICASE report 99-36.",
  acknowledgement = ack-nhfb,
}

@Article{Ronsse:1999:RFI,
  author =       "Michiel Ronsse and Koen {De Bosschere}",
  title =        "{RecPlay}: a fully integrated practical record\slash
                 replay system",
  journal =      j-TOCS,
  volume =       "17",
  number =       "2",
  pages =        "133--152",
  month =        may,
  year =         "1999",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Sep 26 07:54:31 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/1999-17-2/p133-ronsse/",
  abstract =     "This article presents a practical solution for the
                 cyclic debugging of nondeterministic parallel programs.
                 The solution consists of a combination of record\slash
                 replay with automatic on-the-fly data race detection.
                 This combination enables us to limit the record phase
                 to the more efficient recording of the synchronization
                 operations, while deferring the time-consuming data
                 race detection to the replay phase. As the record phase
                 is highly efficient, there is no need to switch it off,
                 hereby eliminating the possibility of Heisenbugs
                 because tracing can be left on all the time. This
                 article describes an implementation of the tools needed
                 to support RecPlay.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer Systems",
  generalterms = "Algorithms; Experimentation; Reliability",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "binary code modification; multithreaded programming;
                 race detection",
  subject =      "Software --- Programming Techniques --- Concurrent
                 Programming (D.1.3): {\bf Parallel programming};
                 Software --- Software Engineering --- Testing and
                 Debugging (D.2.5): {\bf Debugging aids}; Software ---
                 Software Engineering --- Testing and Debugging (D.2.5):
                 {\bf Monitors}; Software --- Software Engineering ---
                 Testing and Debugging (D.2.5): {\bf Tracing}; Software
                 --- Operating Systems --- Process Management (D.4.1):
                 {\bf Concurrency}; Software --- Operating Systems ---
                 Process Management (D.4.1): {\bf Deadlocks}; Software
                 --- Operating Systems --- Process Management (D.4.1):
                 {\bf Multiprocessing/multiprogramming/multitasking};
                 Software --- Operating Systems --- Process Management
                 (D.4.1): {\bf Mutual exclusion}; Software --- Operating
                 Systems --- Process Management (D.4.1): {\bf
                 Synchronization}",
}

@Article{Rugina:1999:PAM,
  author =       "Radu Rugina and Martin Rinard",
  title =        "Pointer Analysis for Multithreaded Programs",
  journal =      j-SIGPLAN,
  volume =       "34",
  number =       "5",
  pages =        "77--90",
  month =        may,
  year =         "1999",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:18:03 MST 2003",
  bibsource =    "http://www.acm.org/pubs/contents/proceedings/pldi/301122/index.html;
                 http://www.acm.org/pubs/contents/proceedings/pldi/301618/index.html;
                 http://www.cs.rutgers.edu/pldi99/program.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "See PLDI'99 proceedings \cite{ACM:1999:PASa}.",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/pldi/301122/p77-rugina/",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Saito:1999:MRS,
  author =       "H. Saito and N. Stavrakos and C. Polychronopoulos",
  title =        "Multithreading Runtime Support for Loop and Functional
                 Parallelism",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1615",
  pages =        "133--??",
  year =         "1999",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon Sep 13 16:57:02 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/lncs1999b.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@MastersThesis{Samorodin:1999:SFS,
  author =       "Steven Howard Samorodin",
  title =        "Supporting flexible safety and sharing in
                 multi-threaded environments",
  type =         "Thesis ({M.S.})",
  school =       "Computer Science Department, University of California,
                 Davis",
  address =      "Davis, CA, USA",
  pages =        "39",
  year =         "1999",
  bibdate =      "Sat Apr 20 11:17:26 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Scherer:1999:TAP,
  author =       "Alex Scherer and Honghui Lu and Thomas Gross and Willy
                 Zwaenepoel",
  title =        "Transparent adaptive parallelism on {NOWs} using
                 {OpenMP}",
  journal =      j-SIGPLAN,
  volume =       "34",
  number =       "8",
  pages =        "96--106",
  month =        aug,
  year =         "1999",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:18:06 MST 2003",
  bibsource =    "http://www.acm.org/pubs/contents/proceedings/ppopp/301104/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p96-scherer/",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Seiden:1999:ROM,
  author =       "Steven S. Seiden",
  title =        "Randomized Online Multi-Threaded Paging",
  journal =      j-NORDIC-J-COMPUT,
  volume =       "6",
  number =       "2",
  pages =        "148--??",
  month =        "Summer",
  year =         "1999",
  CODEN =        "NJCOFR",
  ISSN =         "1236-6064",
  bibdate =      "Fri Oct 13 05:25:14 MDT 2000",
  bibsource =    "http://www.cs.helsinki.fi/njc/njc6.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.cs.helsinki.fi/njc/References/seiden1999:148.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Nordic Journal of Computing",
}

@InProceedings{Shen:1999:ATL,
  author =       "Kai Shen and Hong Tang and Tao Yang",
  title =        "Adaptive Two-level Thread Management for Fast {MPI}
                 Execution on Shared Memory Machines",
  crossref =     "ACM:1999:SPO",
  pages =        "??--??",
  year =         "1999",
  bibdate =      "Thu Feb 24 09:02:57 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sc99.org/techpapers/",
  acknowledgement = ack-nhfb,
}

@Article{Sinharoy:1999:COI,
  author =       "Balaram Sinharoy",
  title =        "Compiler optimization to improve data locality for
                 processor multithreading",
  journal =      j-SCI-PROG,
  volume =       "7",
  number =       "1",
  pages =        "21--37",
  month =        "????",
  year =         "1999",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Thu Mar 28 12:27:27 MST 2002",
  bibsource =    "Compendex database;
                 http://www.iospress.nl/site/html/10589244.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 OCLC Article1st database",
  URL =          "http://iospress.metapress.com/app/home/contribution.asp%3Fwasp=64cr5a4mg33tuhcbdr02%26referrer=parent%26backto=issue%2C2%2C7%3Bjournal%2C8%2C9%3Blinkingpublicationresults%2C1%2C1",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@InProceedings{Storino:1999:MTB,
  author =       "Salvatore Storino and John M. Borkenhagen and Ronald
                 N. Kalla and Steven R. Kunkel",
  title =        "A Multi-Threaded 64-bit {PowerPC} Commercial {RISC}
                 Processor Design",
  crossref =     "IEEE:1999:HCS",
  pages =        "??--??",
  year =         "1999",
  bibdate =      "Mon Jan 08 05:28:04 2001",
  bibsource =    "ftp://www.hotchips.org//pub/hotc7to11cd/hc99/hc11_pdf/hc99.s1.1.Storino.txt;
                 http://www.hotchips.org/hotc11_monday.html;
                 https://www.math.utah.edu/pub/tex/bib/hot-chips.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Sutter:1999:OAM,
  author =       "Herb Sutter",
  title =        "Optimizations That Aren't (In a Multithreaded World)",
  journal =      j-CCCUJ,
  volume =       "17",
  number =       "6",
  pages =        "??--??",
  month =        jun,
  year =         "1999",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:21 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/1999/9906/9906toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "An ``obvious'' optimization can really lose ground
                 when thread safety has to be ensured as well.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@InProceedings{Tan:1999:OFN,
  author =       "Kian-Lee Tan and Cheng Hian Goh and Beng Chin Ooi",
  title =        "Online Feedback for Nested Aggregate Queries with
                 Multi-Threading",
  crossref =     "Atkinson:1999:PTF",
  pages =        "18--29",
  year =         "1999",
  bibdate =      "Fri Jan 12 07:50:37 MST 2001",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/vldb.bib;
                 http://www.vldb.org/dblp/db/conf/vldb/vldb99.html; OCLC
                 Proceedings database",
  URL =          "http://www.vldb.org/dblp/db/conf/vldb/TanGO99.html",
  acknowledgement = ack-nhfb,
  authorurl =    "http://www.vldb.org/dblp/db/indices/a-tree/t/Tan:Kian=Lee.html;
                 http://www.vldb.org/dblp/db/indices/a-tree/g/Goh:Cheng_Hian.html;
                 http://www.vldb.org/dblp/db/indices/a-tree/o/Ooi:Beng_Chin.html",
}

@Article{Tang:1999:APT,
  author =       "Xinan Tang and Guang R. Gao",
  title =        "Automatically Partitioning Threads for Multithreaded
                 Architectures",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "58",
  number =       "2",
  pages =        "159--189",
  month =        aug,
  year =         "1999",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.1999.1551",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:08 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1551/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1551/production/pdf;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1551/production/ref",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Tang:1999:CRT,
  author =       "Hong Tang and Kai Shen and Tao Yang",
  title =        "Compile\slash run-time support for threaded {MPI}
                 execution on multiprogrammed shared memory machines",
  journal =      j-SIGPLAN,
  volume =       "34",
  number =       "8",
  pages =        "107--118",
  month =        aug,
  year =         "1999",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:18:06 MST 2003",
  bibsource =    "http://www.acm.org/pubs/contents/proceedings/ppopp/301104/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p107-tang/",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Taura:1999:SMI,
  author =       "Kenjiro Taura and Kunio Tabata and Akinori Yonezawa",
  title =        "{StackThreads\slash MP}: integrating futures into
                 calling standards",
  journal =      j-SIGPLAN,
  volume =       "34",
  number =       "8",
  pages =        "60--71",
  month =        aug,
  year =         "1999",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:18:06 MST 2003",
  bibsource =    "http://www.acm.org/pubs/contents/proceedings/ppopp/301104/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p60-taura/",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Throop:1999:SOS,
  author =       "Joe Throop",
  title =        "Standards: {OpenMP}: Shared-Memory Parallelism from
                 the Ashes",
  journal =      j-COMPUTER,
  volume =       "32",
  number =       "5",
  pages =        "108--109",
  month =        may,
  year =         "1999",
  CODEN =        "CPTRB4",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Thu May 6 06:17:23 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://dlib.computer.org/co/books/co1999/pdf/r5108.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
}

@Article{Torrant:1999:SMS,
  author =       "Marc Torrant and Muhammad Shaaban and Roy Czernikowski
                 and Ken Hsu",
  title =        "A simultaneous multithreading simulator",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "27",
  number =       "5",
  pages =        "1--5",
  month =        dec,
  year =         "1999",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:22 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Vlassov:1999:QMM,
  author =       "V. Vlassov and A. Kraynikov",
  title =        "A Queuing Model of a Multi-threaded Architecture: a
                 Case Study",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1662",
  pages =        "306--??",
  year =         "1999",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon Sep 13 16:57:02 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/lncs1999b.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Weissman:1999:HPT,
  author =       "B. Weissman and B. Gomes",
  title =        "High Performance Thread Migration on Clusters of
                 {SMPs}",
  journal =      j-PARALLEL-DIST-COMP-PRACT,
  volume =       "2",
  number =       "2",
  pages =        "??--??",
  month =        "????",
  year =         "1999",
  CODEN =        "????",
  ISSN =         "1097-2803",
  bibdate =      "Fri Dec 19 08:14:13 MST 2003",
  bibsource =    "http://www.cs.okstate.edu/~pdcp/vols/vol02/vol02no2.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.cs.okstate.edu/~pdcp/vols/vol02/vol02no2abs.html#boris",
  acknowledgement = ack-nhfb,
  fjournal =     "PDCP: Parallel and Distributed Computing Practices",
}

@Article{Wu:1999:GMC,
  author =       "C.-C. Wu and C. Chen",
  title =        "Grouping Memory Consistency Model for
                 Parallel-Multithreaded Shared-Memory Multiprocessor
                 Systems",
  journal =      j-INT-J-HIGH-SPEED-COMPUTING,
  volume =       "10",
  number =       "1",
  pages =        "53--82",
  month =        mar,
  year =         "1999",
  CODEN =        "IHSCEZ",
  ISSN =         "0129-0533",
  bibdate =      "Mon Feb 25 11:19:21 MST 2002",
  bibsource =    "http://ejournals.wspc.com.sg/ijhsc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 OCLC Article1st database",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Speed Computing
                 (IJHSC)",
}

@Article{Xu:1999:DIT,
  author =       "Zhichen Xu and Barton P. Miller and Oscar Naim",
  title =        "Dynamic instrumentation of threaded applications",
  journal =      j-SIGPLAN,
  volume =       "34",
  number =       "8",
  pages =        "49--59",
  month =        aug,
  year =         "1999",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:18:06 MST 2003",
  bibsource =    "http://www.acm.org/pubs/contents/proceedings/ppopp/301104/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p49-xu/",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Akkary:2000:CSM,
  author =       "Haitham Akkary and S{\'e}bastien Hily",
  title =        "The Case for Speculative Multithreading on {SMT}
                 Processors",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1940",
  pages =        "59--??",
  year =         "2000",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 09:17:15 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1940.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1940/19400059.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1940/19400059.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Anonymous:2000:NPAa,
  author =       "Anonymous",
  title =        "New Products: {AVP for Linux/FreeBSD UNIX, Kaspersky
                 Lab Ltd.; API PowerRAC Chassis 320, Alpha Processor
                 Inc.; ODBC-ODBC Bridge, Easysoft Ltd.; LinkScan 6.1,
                 Electronic Software Publishing Corporation; Metro-X
                 Enhanced Server CD, Metro Link, Inc.; P-STAT
                 Statistical Software, P-STAT, Inc.; System Manager in a
                 Box v1.0, PegaSoft Canada; PGI Workstation 3.1, PGI;
                 Quick Restore 2.6, Workstation Solutions, Inc.;
                 Threads.h++ and Tools.h++ Professional, Rogue Wave
                 Software; Scriptics Connect 1.0, 1.1, Scriptics
                 Corporation; TapeWare 6.2 Backup Software, Yosemite
                 Technologies, Inc.; DoubleVision for Linux Systems,
                 Tridia Corporation}",
  journal =      j-LINUX-J,
  volume =       "71",
  pages =        "??--??",
  month =        mar,
  year =         "2000",
  CODEN =        "LIJOFX",
  ISSN =         "1075-3583 (print), 1938-3827 (electronic)",
  ISSN-L =       "1075-3583",
  bibdate =      "Thu Sep 21 07:44:12 MDT 2000",
  bibsource =    "http://noframes.linuxjournal.com/lj-issues/issue71/index.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Linux journal",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J508",
}

@Article{Anonymous:2000:SLT,
  author =       "Anonymous",
  title =        "Strictly On-Line: {T/TCP: TCP for Transactions by Mark
                 Stacey, Ivan Griffin and John Nelson; POSIX Thread
                 Libraries by Felix Garcia and Javier Fernandez; Linux
                 and Open-Source Applications by Peter Jones and M. B.
                 Jorgenson; Laptops for Linux! by Jason Kroll}",
  journal =      j-LINUX-J,
  volume =       "70",
  pages =        "??--??",
  month =        feb,
  year =         "2000",
  CODEN =        "LIJOFX",
  ISSN =         "1075-3583 (print), 1938-3827 (electronic)",
  ISSN-L =       "1075-3583",
  bibdate =      "Thu Sep 21 16:32:31 MDT 2000",
  bibsource =    "http://noframes.linuxjournal.com/lj-issues/issue70/index.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://noframes.linuxjournal.com/lj-issues/issue70/3075.html;
                 http://noframes.linuxjournal.com/lj-issues/issue70/3184.html;
                 http://noframes.linuxjournal.com/lj-issues/issue70/3683.html;
                 http://noframes.linuxjournal.com/lj-issues/issue70/3766.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Linux journal",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J508",
}

@Article{Antoniu:2000:CDP,
  author =       "G. Antoniu and L. Boug{\'e} and R. Namyst and C.
                 P{\'e}rez",
  title =        "Compiling Data-Parallel Programs to a Distributed
                 Runtime Environment with Thread Isomigration",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "10",
  number =       "2/3",
  pages =        "201--??",
  month =        sep,
  year =         "2000",
  CODEN =        "PPLTEE",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  bibdate =      "Wed Apr 18 07:29:37 2001",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/10/1002_03/S01296264001002_03.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ejournals.wspc.com.sg/ppl/10/1002_03/S0129626400000202.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Antoniu:2000:IJC,
  author =       "Gabriel Antoniu and Luc Boug{\'e} and Philip Hatcher
                 and Mark MacBeth and Keith McGuigan and Raymond
                 Namyst",
  title =        "Implementing {Java} Consistency Using a Generic,
                 Multithreaded {DSM} Runtime System",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1800",
  pages =        "560--??",
  year =         "2000",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 09:16:18 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18000560.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1800/18000560.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Aumage:2000:PAM,
  author =       "Olivier Aumage and Luc Boug{\'e} and Raymond Namyst",
  title =        "A Portable and Adaptative Multi-protocol Communication
                 Library for Multithreaded Runtime Systems",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1800",
  pages =        "1136--??",
  year =         "2000",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 09:16:18 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18001136.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1800/18001136.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Becker:2000:JSU,
  author =       "Pete Becker",
  title =        "The Journeyman's Shop: Unraveling Multithreading",
  journal =      j-CCCUJ,
  volume =       "18",
  number =       "8",
  pages =        "71--??",
  month =        aug,
  year =         "2000",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:27 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/2000/0008/0008toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Sometimes you have to spend a lot of time on just a
                 little bit of code, to avoid spending much more time
                 not knowing where to begin debugging.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Bedy:2000:VSM,
  author =       "Michael Bedy and Steve Carr and Xianlong Huang and
                 Ching-Kuang Shene",
  title =        "A visualization system for multithreaded programming",
  journal =      j-SIGCSE,
  volume =       "32",
  number =       "1",
  pages =        "1--5",
  month =        mar,
  year =         "2000",
  CODEN =        "SIGSD3",
  DOI =          "https://doi.org/10.1145/331795.331798",
  ISSN =         "0097-8418 (print), 2331-3927 (electronic)",
  ISSN-L =       "0097-8418",
  bibdate =      "Mon Nov 19 10:05:03 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigcse2000.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "SIGCSE Bulletin (ACM Special Interest Group on
                 Computer Science Education)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J688",
}

@Article{Berger:2000:HSMa,
  author =       "Emery D. Berger and Kathryn S. McKinley and Robert D.
                 Blumofe and Paul R. Wilson",
  title =        "{Hoard}: a scalable memory allocator for multithreaded
                 applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "28",
  number =       "5",
  pages =        "117--128",
  month =        dec,
  year =         "2000",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:22 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Berger:2000:HSMb,
  author =       "Emery D. Berger and Kathryn S. McKinley and Robert D.
                 Blumofe and Paul R. Wilson",
  title =        "{Hoard}: a Scalable Memory Allocator for
                 Multithreaded Applications",
  journal =      j-SIGPLAN,
  volume =       "35",
  number =       "11",
  pages =        "117--128",
  month =        nov,
  year =         "2000",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:18:19 MST 2003",
  bibsource =    "http://foothill.lcs.mit.edu/asplos2k/program.html;
                 http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Berger:2000:HSMc,
  author =       "Emery D. Berger and Kathryn S. McKinley and Robert D.
                 Blumofe and Paul R. Wilson",
  title =        "{Hoard}: a scalable memory allocator for multithreaded
                 applications",
  journal =      j-OPER-SYS-REV,
  volume =       "34",
  number =       "5",
  pages =        "117--128",
  month =        dec,
  year =         "2000",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
}

@Article{Beyls:2000:CGM,
  author =       "K. E. Beyls and E. H. D'Hollander",
  title =        "Compiler Generated Multithreading to Alleviate Memory
                 Latency",
  journal =      j-J-UCS,
  volume =       "6",
  number =       "10",
  pages =        "968--993",
  day =          "28",
  month =        oct,
  year =         "2000",
  CODEN =        "????",
  ISSN =         "0948-695X (print), 0948-6968 (electronic)",
  ISSN-L =       "0948-6968",
  bibdate =      "Wed Feb 20 07:23:07 MST 2002",
  bibsource =    "http://www.jucs.org/jucs;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.jucs.org/jucs_6_10/compiler_generated_multithreading_to",
  acknowledgement = ack-nhfb,
  fjournal =     "J.UCS: Journal of Universal Computer Science",
  journal-URL =  "http://www.jucs.org/jucs",
}

@Article{Bhandarkar:2000:PPM,
  author =       "Suchendra M. Bhandarkar and Shankar R.
                 Chandrasekaran",
  title =        "Parallel Parsing of {MPEG} Video in a Multi-threaded
                 Multiprocessor Environment",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1800",
  pages =        "194--??",
  year =         "2000",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 09:16:18 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18000194.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1800/18000194.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Bolding:2000:MSM,
  author =       "Barry Bolding and Kim Baldridge",
  title =        "Multithreaded shared memory parallel implementation of
                 the electronic structure code {GAMESS}",
  journal =      j-COMP-PHYS-COMM,
  volume =       "128",
  number =       "1--2",
  pages =        "55--66",
  day =          "9",
  month =        jun,
  year =         "2000",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/S0010-4655(00)00067-9",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Mon Feb 13 23:40:43 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465500000679",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Borkenhagen:2000:MPP,
  author =       "J. M. Borkenhagen and R. J. Eickemeyer and R. N. Kalla
                 and S. R. Kunkel",
  title =        "A multithreaded {PowerPC} processor for commercial
                 servers",
  journal =      j-IBM-JRD,
  volume =       "44",
  number =       "6",
  pages =        "885--898",
  month =        nov,
  year =         "2000",
  CODEN =        "IBMJAE",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Sat Feb 24 09:44:45 MST 2001",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.research.ibm.com/journal/",
  URL =          "http://www.research.ibm.com/journal/rd/446/borkenhagen.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Journal of Research and Development",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
  ordernumber =  "G322-0224",
}

@Article{Boussinot:2000:JTS,
  author =       "Fr{\'e}d{\'e}ric Boussinot and Jean-Ferdy Susini",
  title =        "{Java} threads and {SugarCubes}",
  journal =      j-SPE,
  volume =       "30",
  number =       "5",
  pages =        "545--566",
  day =          "25",
  month =        apr,
  year =         "2000",
  CODEN =        "SPEXBL",
  DOI =          "https://doi.org/10.1002/(SICI)1097-024X(20000425)30:5<545::AID-SPE308>3.0.CO;2-Q",
  ISSN =         "0038-0644 (print), 1097-024X (electronic)",
  ISSN-L =       "0038-0644",
  bibdate =      "Tue Mar 13 06:45:44 2001",
  bibsource =    "http://www.interscience.wiley.com/jpages/0038-0644;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract/71004433/START;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=71004433&PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Software---Practice and Experience",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
}

@Article{Bova:2000:DLP,
  author =       "Steve W. Bova and Clay P. Breshears and Christine E.
                 Cuicchi and Zeki Demirbilek and Henry A. Gabb",
  title =        "Dual-Level Parallel Analysis of Harbor Wave Response
                 Using {MPI} and {OpenMP}",
  journal =      j-IJHPCA,
  volume =       "14",
  number =       "1",
  pages =        "49--64",
  month =        "Spring",
  year =         "2000",
  CODEN =        "IHPCFL",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Sep 12 12:39:11 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@InCollection{Cahir:2000:PMM,
  author =       "Margaret Cahir and Robert Moench and Alice E.
                 Koniges",
  title =        "Programming Models and Methods",
  crossref =     "Koniges:2000:ISP",
  chapter =      "3",
  pages =        "27--54",
  year =         "2000",
  bibdate =      "Fri Feb 04 18:32:51 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Discusses PVM, MPI, SHMEM, High-Performance Fortran,
                 and POSIX threads.",
  acknowledgement = ack-nhfb,
}

@Article{Cahoon:2000:EPD,
  author =       "Brendon Cahoon and Kathryn S. McKinley and Zhihong
                 Lu",
  title =        "Evaluating the performance of distributed
                 architectures for information retrieval using a variety
                 of workloads",
  journal =      j-TOIS,
  volume =       "18",
  number =       "1",
  pages =        "1--43",
  month =        jan,
  year =         "2000",
  CODEN =        "ATISET",
  ISSN =         "1046-8188",
  ISSN-L =       "0734-2047",
  bibdate =      "Tue Sep 26 09:34:01 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tois/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tois/2000-18-1/p1-cahoon/",
  abstract =     "The information explosion across the Internet and
                 elsewhere offers access to an increasing number of
                 document collections. In order for users to effectively
                 access these collections, information retrieval (IR)
                 systems must provide coordinated, concurrent, and
                 distributed access. In this article, we explore how to
                 achieve scalable performance in a distributed system
                 for collection sizes ranging from 1GB to 128GB. We
                 implement a fully functional distributed IR system
                 based on a multithreaded version of the Inquery
                 simulation model. We measure performance as a function
                 of system parameters such as client command rate,
                 number of document collections, ter ms per query, query
                 term frequency, number of answers returned, and command
                 mixture. Our results show that it is important to model
                 both query and document commands because the
                 heterogeneity of commands significantly impacts
                 performance. Based on our results, we recommend simple
                 changes to the prototype and evaluate the changes using
                 the simulator. Because of the significant resource
                 demands of information retrieval, it is not difficult
                 to generate workloads that overwhelm system resources
                 regardless of the architecture. However under some
                 realistic workloads, we demonstrate system
                 organizations for which response time gracefully
                 degrades as the workload increases and performance
                 scales with the number of processors. This scalable
                 architecture includes a surprisingly small number of
                 brokers through which a large number of clients and
                 servers communicate.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Information Systems",
  keywords =     "distributed information retrieval architectures",
  subject =      "Computer Systems Organization ---
                 Computer-Communication Networks --- Distributed Systems
                 (C.2.4); Computer Systems Organization --- Performance
                 of Systems (C.4); Computer Systems Organization ---
                 Performance of Systems (C.4): {\bf Performance
                 attributes}; Information Systems --- Information
                 Storage and Retrieval --- Systems and Software
                 (H.3.4)",
}

@Article{Calkins:2000:ITT,
  author =       "Charles Calkins",
  title =        "Integrating Threads with Template Classes",
  journal =      j-CCCUJ,
  volume =       "18",
  number =       "5",
  pages =        "32--??",
  month =        may,
  year =         "2000",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:26 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/2000/0005/0005toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "It's obviously a good idea to encapsulate a thread as
                 an object. It is less obvious how to get all the
                 interfaces right.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Carr:2000:PCL,
  author =       "Steve Carr and Ching-Kuang Shene",
  title =        "A portable class library for teaching multithreaded
                 programming",
  journal =      j-SIGCSE,
  volume =       "32",
  number =       "3",
  pages =        "124--127",
  month =        sep,
  year =         "2000",
  CODEN =        "SIGSD3",
  DOI =          "https://doi.org/10.1145/353519.343138",
  ISSN =         "0097-8418 (print), 2331-3927 (electronic)",
  ISSN-L =       "0097-8418",
  bibdate =      "Sat Nov 17 16:56:43 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigcse2000.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "SIGCSE Bulletin (ACM Special Interest Group on
                 Computer Science Education)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J688",
}

@Article{ChassindeKergommeaux:2000:PIV,
  author =       "J. {Chassin de Kergommeaux} and B. Stein and P. E.
                 Bernard",
  title =        "{Paj{\'e}}, an interactive visualization tool for
                 tuning multi-threaded parallel applications",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "26",
  number =       "10",
  pages =        "1253--1274",
  day =          "15",
  month =        aug,
  year =         "2000",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Sat Oct 28 17:44:14 MDT 2000",
  bibsource =    "http://www.elsevier.com/locate/issn/01678191;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.elsevier.nl/gej-ng/10/35/21/42/31/24/abstract.html;
                 http://www.elsevier.nl/gej-ng/10/35/21/42/31/24/article.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Choi:2000:SCP,
  author =       "Sung-Eun Choi and E. Christopher Lewis",
  title =        "A study of common pitfalls in simple multi-threaded
                 programs",
  journal =      j-SIGCSE,
  volume =       "32",
  number =       "1",
  pages =        "325--329",
  month =        mar,
  year =         "2000",
  CODEN =        "SIGSD3",
  DOI =          "https://doi.org/10.1145/331795.331879",
  ISSN =         "0097-8418 (print), 2331-3927 (electronic)",
  ISSN-L =       "0097-8418",
  bibdate =      "Mon Nov 19 10:05:03 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigcse2000.bib",
  abstract =     "It is generally acknowledged that developing correct
                 multi-threaded codes is difficult, because threads may
                 interact with each other in unpredictable ways. The
                 goal of this work is to discover common multi-threaded
                 programming pitfalls, the knowledge of which will be
                 useful in instructing new programmers and in developing
                 tools to aid in multi-threaded programming. To this
                 end, we study multi-threaded applications written by
                 students from introductory operating systems courses.
                 Although the applications are simple, careful
                 inspection and the use of an automatic race detection
                 tool reveal a surprising quantity and variety of
                 synchronization errors. We describe and discuss these
                 errors, evaluate the role of automated tools, and
                 propose new tools for use in the instruction of
                 multi-threaded programming.",
  acknowledgement = ack-nhfb,
  fjournal =     "SIGCSE Bulletin (ACM Special Interest Group on
                 Computer Science Education)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J688",
}

@Book{Christopher:2000:HPJ,
  author =       "Thomas Christopher and George Thiruvathukal",
  title =        "High Performance {Java} Platform Computing:
                 Multithreaded and Networked Programming",
  publisher =    pub-PH,
  address =      pub-PH:adr,
  pages =        "xxii + 409",
  year =         "2000",
  ISBN =         "0-13-016164-0",
  ISBN-13 =      "978-0-13-016164-2",
  LCCN =         "????",
  bibdate =      "Tue Feb 20 18:03:50 2001",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  price =        "US\$49.99",
  URL =          "http://www.sun.com/books/catalog/christopher/",
  acknowledgement = ack-nhfb,
}

@Article{Corbett:2000:USA,
  author =       "James C. Corbett",
  title =        "Using shape analysis to reduce finite-state models of
                 concurrent {Java} programs",
  journal =      j-TOSEM,
  volume =       "9",
  number =       "1",
  pages =        "51--93",
  month =        jan,
  year =         "2000",
  CODEN =        "ATSMER",
  ISSN =         "1049-331X (print), 1557-7392 (electronic)",
  ISSN-L =       "1049-331X",
  bibdate =      "Fri Apr 20 08:21:35 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/articles/journals/tosem/2000-9-1/p51-corbett/p51-corbett.pdf;
                 http://www.acm.org/pubs/citations/journals/tosem/2000-9-1/p51-corbett/",
  abstract =     "Finite-state verification (e.g., model checking)
                 provides a powerful means to detect concurrency errors,
                 which are often subtle and difficult to reproduce.
                 Nevertheless, widespread use of this technology by
                 developers is unlikely until tools provide automated
                 support for extracting the required finite-state models
                 directly from program source. Unfortunately, the
                 dynamic features of modern languages such as Java
                 complicate the construction of compact finite-state
                 models for verification. In this article, we show how
                 shape analysis, which has traditionally been used for
                 computing alias information in optimizers, can be used
                 to greatly reduce the size of finite-state models of
                 concurrent Java programs by determining which
                 heap-allocated variables are accessible only by a
                 single thread, and which shared variables are protected
                 by locks. We also provide several other state-space
                 reductions based on the semantics of Java monitors. A
                 prototype of the reductions demonstrates their
                 effectiveness.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Software Engineering and
                 Methodology",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J790",
  keywords =     "concurrent systems; finite-state verification; Java;
                 model extraction; modeling; shape analysis; state-space
                 reductions",
  subject =      "Software --- Software Engineering --- Software/Program
                 Verification (D.2.4)",
}

@Article{Cui:2000:MPC,
  author =       "J. Cui and J. L. Bordim and K. Nakano and T. Hayashi
                 and N. Ishii",
  title =        "Multithreaded Parallel Computer Model with Performance
                 Evaluation",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1800",
  pages =        "155--??",
  year =         "2000",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 09:16:18 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18000155.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1800/18000155.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Danjean:2000:IKA,
  author =       "Vincent Danjean and Raymond Namyst and Robert D.
                 Russell",
  title =        "Integrating Kernel Activations in a Multithreaded
                 Runtime System on Top of {Linux}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1800",
  pages =        "1160--??",
  year =         "2000",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 09:16:18 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18001160.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1800/18001160.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Dill:2000:MCJ,
  author =       "David Dill",
  title =        "Model checking {Java} programs (abstract only)",
  journal =      j-SIGSOFT,
  volume =       "25",
  number =       "5",
  pages =        "179",
  month =        sep,
  year =         "2000",
  CODEN =        "SFENDP",
  DOI =          "https://doi.org/10.1145/347636.349113",
  ISSN =         "0163-5948 (print), 1943-5843 (electronic)",
  ISSN-L =       "0163-5948",
  bibdate =      "Wed Aug 1 17:14:00 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigsoft2000.bib",
  abstract =     "Automatic state exploration tools (model checkers)
                 have had some success when applied to protocols and
                 hardware designs, but there are fewer success stories
                 about software. This is unfortunate, since the software
                 problem is worsening even faster than the hardware and
                 protocol problems. Model checking of concurrent
                 programs is especially interesting, because they are
                 notoriously difficult to test, analyze, and debug by
                 other methods. This talk will be a description of our
                 initial efforts to check Java programs using a model
                 checker. The model checker supports dynamic allocation,
                 thread creation, and recursive procedures (features
                 that are not necessary for hardware verification), and
                 has some special optimizations and checks tailored to
                 multi-threaded Java program. I will also discuss some
                 of the challenges for future efforts in this area.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGSOFT Software Engineering Notes",
  journal-URL =  "https://dl.acm.org/citation.cfm?id=J728",
}

@Article{Duda:2000:BVT,
  author =       "Kenneth J. Duda and David R. Cheriton",
  title =        "Borrowed-virtual-time {(BVT)} scheduling: supporting
                 latency-sensitive threads in a general-purpose
                 scheduler",
  journal =      j-OPER-SYS-REV,
  volume =       "34",
  number =       "2",
  pages =        "27--28",
  month =        apr,
  year =         "2000",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:42 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@InProceedings{Engelschall:2000:PMS,
  author =       "Ralf S. Engelschall",
  title =        "Portable Multithreading --- The Signal Stack Trick for
                 User-Space Thread Creation",
  crossref =     "USENIX:2000:UAT",
  pages =        "239--249",
  year =         "2000",
  bibdate =      "Tue Oct 15 09:53:32 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/usenix2000.bib",
  URL =          "http://www.usenix.org/events/usenix2000/general/engelschall.html",
  acknowledgement = ack-nhfb,
}

@Article{Flautner:2000:TLPa,
  author =       "Kriszti{\'a}n Flautner and Rich Uhlig and Steve
                 Reinhardt and Trevor Mudge",
  title =        "Thread-level parallelism and interactive performance
                 of desktop applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "28",
  number =       "5",
  pages =        "129--138",
  month =        dec,
  year =         "2000",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:22 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Flautner:2000:TLPb,
  author =       "Kriszti{\'a}n Flautner and Rich Uhlig and Steve
                 Reinhardt and Trevor Mudge",
  title =        "Thread Level Parallelism and Interactive Performance
                 of Desktop Applications",
  journal =      j-SIGPLAN,
  volume =       "35",
  number =       "11",
  pages =        "129--138",
  month =        nov,
  year =         "2000",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/356989.357001",
  ISBN =         "1-58113-317-0",
  ISBN-13 =      "978-1-58113-317-2",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:18:19 MST 2003",
  bibsource =    "http://foothill.lcs.mit.edu/asplos2k/program.html;
                 http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://delivery.acm.org/10.1145/360000/357001/p129-flautner.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "IA-64",
}

@Article{Flautner:2000:TLPc,
  author =       "Kriszti{\'a}n Flautner and Rich Uhlig and Steve
                 Reinhardt and Trevor Mudge",
  title =        "Thread-level parallelism and interactive performance
                 of desktop applications",
  journal =      j-OPER-SYS-REV,
  volume =       "34",
  number =       "5",
  pages =        "129--138",
  month =        dec,
  year =         "2000",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Garcia:2000:PTL,
  author =       "Felix Garcia and Javier Fernandez",
  title =        "{POSIX} Thread Libraries",
  journal =      j-LINUX-J,
  volume =       "70",
  pages =        "??--??",
  month =        feb,
  year =         "2000",
  CODEN =        "LIJOFX",
  ISSN =         "1075-3583 (print), 1938-3827 (electronic)",
  ISSN-L =       "1075-3583",
  bibdate =      "Thu Sep 21 16:46:44 MDT 2000",
  bibsource =    "http://noframes.linuxjournal.com/lj-issues/issue70/index.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://noframes.linuxjournal.com/lj-issues/issue/3184.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Linux journal",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J508",
}

@Article{Geppert:2000:MBG,
  author =       "L. Geppert",
  title =        "Microprocessors: the off-beat generation",
  journal =      j-IEEE-SPECTRUM,
  volume =       "37",
  number =       "7",
  pages =        "44--49",
  month =        jul,
  year =         "2000",
  CODEN =        "IEESAM",
  DOI =          "https://doi.org/10.1109/6.852051",
  ISSN =         "0018-9235 (print), 1939-9340 (electronic)",
  ISSN-L =       "0018-9235",
  bibdate =      "Sat Jan 18 12:29:46 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeespectrum2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Spectrum",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=6",
  keywords =     "Biology computing; Bonding; Broadband communication;
                 broadband networks; Electronics industry;
                 microprocessor chips; microprocessors; Microprocessors;
                 multimedia broadband communications; multimedia
                 communication; multimedia computing; Multithreading;
                 off-beat generation; performance; Personal
                 communication networks; programmable controllers;
                 programmable logic; Real time systems; Supercomputers;
                 supercomputing; Workstations",
}

@Article{Gontmakher:2000:JCN,
  author =       "Alex Gontmakher and Assaf Schuster",
  title =        "{Java} consistency: nonoperational characterizations
                 for {Java} memory behavior",
  journal =      j-TOCS,
  volume =       "18",
  number =       "4",
  pages =        "333--386",
  year =         "2000",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jul 18 10:18:45 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/articles/journals/tocs/2000-18-4/p333-gontmakher/p333-gontmakher.pdf;
                 http://www.acm.org/pubs/citations/journals/tocs/2000-18-4/p333-gontmakher/",
  abstract =     "The Java Language Specification (JLS) [Gosling et al.
                 1996] provides an operational definition for the
                 consistency of shared variables. The definition remains
                 unchanged in the JLS 2nd edition, currently under peer
                 review. This definition, which relies on a specific
                 abstract machine as its underlying model, is very
                 complicated. Several
                 subsequent works have tried to simplify and formalize
                 it. However, these revised definitions are also
                 operational, and thus have failed to highlight the
                 intuition behind the original specification. In this
                 work we provide a complete nonoperational specification
                 for Java and for the JVM, excluding synchronized
                 operations. We provide a simpler definition, in which
                 we clearly distinguish the consistency model that is
                 promised to the programmer from that which should be
                 implemented in the JVM. This distinction, which was
                 implicit in the original definition, is crucial for
                 building the JVM. We find that the programmer model is
                 strictly weaker than that of the JVM, and precisely
                 define their discrepancy. Moreover, our definition is
                 independent of any specific (or even abstract) machine,
                 and can thus be used to verify JVM implementations and
                 compiler optimizations on any platform. Finally, we
                 show the precise range of consistency relaxations
                 obtainable for the Java memory model when a certain
                 compiler optimization-- called {\em prescient stores\/}
                 in JLS--is applicable.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer Systems",
  generalterms = "Verification",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "Java memory models; multithreading; nonoperational
                 specification",
  subject =      "Hardware --- Memory Structures --- Performance
                 Analysis and Design Aids** (B.3.3): {\bf Formal
                 models**}",
}

@Article{Gopinath:2000:PSB,
  author =       "K. Gopinath and M. K. Krishna Narasimhan",
  title =        "Performance of Switch Blocking on Multithreaded
                 Architectures",
  journal =      j-J-UCS,
  volume =       "6",
  number =       "10",
  pages =        "928--947",
  day =          "28",
  month =        oct,
  year =         "2000",
  CODEN =        "????",
  ISSN =         "0948-695X (print), 0948-6968 (electronic)",
  ISSN-L =       "0948-6968",
  bibdate =      "Wed Feb 20 07:23:07 MST 2002",
  bibsource =    "http://www.jucs.org/jucs;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.jucs.org/jucs_6_10/performance_of_switch_blocking",
  acknowledgement = ack-nhfb,
  fjournal =     "J.UCS: Journal of Universal Computer Science",
  journal-URL =  "http://www.jucs.org/jucs",
}

@Book{Holub:2000:TJT,
  author =       "Allen I. Holub",
  title =        "Taming {Java} Threads",
  publisher =    pub-APRESS,
  address =      pub-APRESS:adr,
  pages =        "x + 300",
  year =         "2000",
  ISBN =         "1-893115-10-0",
  ISBN-13 =      "978-1-893115-10-1",
  LCCN =         "QA76.73.J38 H635 2000",
  bibdate =      "Fri May 10 12:18:17 MDT 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www1.fatbrain.com/asp/bookinfo/bookinfo.asp?theisbn=1893115100&from=NCN454",
  price =        "US\$34.95",
  acknowledgement = ack-nhfb,
  keywords =     "Java (computer program language); threads (computer
                 programs)",
}

@Article{Horwood:2000:DMA,
  author =       "Peter Horwood and Shlomo Wygodny and Martin Zardecki",
  title =        "Debugging Multithreaded Applications",
  journal =      j-DDJ,
  volume =       "25",
  number =       "3",
  pages =        "32, 34--37",
  month =        mar,
  year =         "2000",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Nov 9 08:25:14 MST 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/ftp/2000/2000_03/dbgmulti.txt",
  abstract =     "It is often significantly harder to locate and test
                 for bugs in multithreaded and multiprocess applications
                 than for nonthreaded, single process situations. Our
                 authors describe some of the problems with
                 multithreaded applications and discuss common debugging
                 techniques. Additional resources include dbgmulti.txt
                 (listings).",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Howard:2000:UPW,
  author =       "David M. Howard",
  title =        "Using Predicate Waits with {Win32} Threads",
  journal =      j-CCCUJ,
  volume =       "18",
  number =       "5",
  pages =        "18--??",
  month =        may,
  year =         "2000",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:26 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/2000/0005/0005toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Most Win32 synchronization primitives are just that
                 --- primitive. But you can use them to build queues
                 that are safe and easy to use.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Book{Hyde:2000:JTP,
  author =       "Paul Hyde",
  title =        "{Java} thread programming",
  publisher =    pub-SAMS,
  address =      pub-SAMS:adr,
  pages =        "iv + 510",
  year =         "2000",
  ISBN =         "0-672-31585-8",
  ISBN-13 =      "978-0-672-31585-5",
  LCCN =         "QA76.73.J38 H93 1999",
  bibdate =      "Wed Feb 21 06:02:14 2001",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Keller:2000:JUS,
  author =       "J. Keller and T. Ungerer",
  title =        "{J.UCS} Special Issue on Multithreaded Processors and
                 Chip-Multiprocessors",
  journal =      j-J-UCS,
  volume =       "6",
  number =       "10",
  pages =        "906--907",
  day =          "28",
  month =        oct,
  year =         "2000",
  CODEN =        "????",
  ISSN =         "0948-695X (print), 0948-6968 (electronic)",
  ISSN-L =       "0948-6968",
  bibdate =      "Wed Feb 20 07:23:07 MST 2002",
  bibsource =    "http://www.jucs.org/jucs;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.jucs.org/jucs_6_10/j_ucs_special_issue",
  acknowledgement = ack-nhfb,
  fjournal =     "J.UCS: Journal of Universal Computer Science",
  journal-URL =  "http://www.jucs.org/jucs",
}

@Article{Kleber:2000:TSA,
  author =       "Jeff Kleber",
  title =        "Thread-Safe Access to Collections",
  journal =      j-CCCUJ,
  volume =       "18",
  number =       "5",
  pages =        "36--??",
  month =        may,
  year =         "2000",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:26 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/2000/0005/0005toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The best place to store a thread lock for a shared
                 container is somewhere inside the container --- deep
                 inside.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Lafreniere:2000:SMD,
  author =       "David Lafreniere",
  title =        "State Machine Design in {C++}",
  journal =      j-CCCUJ,
  volume =       "18",
  number =       "5",
  pages =        "58--??",
  month =        may,
  year =         "2000",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:26 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/2000/0005/0005toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "It's not all that hard to implement a finite-state
                 machine, unless it's very large, and you have to worry
                 about multithreading, and \ldots{}.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Book{Lewis:2000:MPJ,
  author =       "Bil Lewis and Daniel J. Berg",
  title =        "Multithreaded Programming with {Java} Technology",
  publisher =    pub-SUN-MICROSYSTEMS-PRESS,
  address =      pub-SUN-MICROSYSTEMS-PRESS:adr,
  pages =        "xxv + 461",
  year =         "2000",
  ISBN =         "0-13-017007-0",
  ISBN-13 =      "978-0-13-017007-1",
  LCCN =         "QA76.73.J38 L488 2000",
  bibdate =      "Fri Apr 11 15:58:52 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  price =        "US\$39.99",
  series =       "Sun BluePrints Program",
  URL =          "http://www.sun.com/books/catalog/lewis3/index.html",
  acknowledgement = ack-nhfb,
}

@Article{Ling:2000:AOT,
  author =       "Yibei Ling and Tracy Mullen and Xiaola Lin",
  title =        "Analysis of optimal thread pool size",
  journal =      j-OPER-SYS-REV,
  volume =       "34",
  number =       "2",
  pages =        "42--55",
  month =        apr,
  year =         "2000",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:42 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Lowy:2000:MPO,
  author =       "Juval Lowy",
  title =        "Making Primitive Objects Thread Safe",
  journal =      j-CCCUJ,
  volume =       "18",
  number =       "3",
  pages =        "85--??",
  month =        mar,
  year =         "2000",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:25 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/2000/0003/0003toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "All sorts of things need thread locks. A fairly simple
                 template or two can do the job.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@InProceedings{Matsushita:2000:MSC,
  author =       "Satoshi Matsushita and Sunao Torii and Masahiko Nomura
                 and Toshiaki Inoue and Atsufumi Shibayama and Sachiko
                 Shimada and Taku Osawa and Hiroaki Inoue and Kouichiro
                 Minami and Junji Sakai and Yoshiyuki Ito and Yuichi
                 Nakamura and Masato Edahiro and Naoki Nishi and
                 Masakazu Yamashina",
  title =        "{Merlot}: a Single-Chip Tightly Coupled Four-Way
                 Multi-Thread Processor",
  crossref =     "Anonymous:2000:CCI",
  pages =        "??--??",
  year =         "2000",
  bibdate =      "Mon Jan 08 05:28:04 2001",
  bibsource =    "http://www.coolchips.org/index-cool3.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We developed an on-chip four-way multiprocessor, MP98
                 version 1, code-named Merlot. It is fabricated with a
                 0.15 $ \mu $ m process and has a die size of 110 mm2.
                 Merlot is a high performance embedded processor for
                 intelligent appliances. We extract a higher degree of
                 parallelism with low voltage operation. In our
                 presentation, we describe our multi-threading model.
                 Then, we explain Merlot's pipeline architecture,
                 focusing on fast thread creation and memory renaming.
                 We also describe our on-chip SDRAM interface which has
                 a throughput greater than 1 GB/sec and cache miss
                 penalty less than 100 ns. Finally, we show a
                 performance estimation for speech recognition and MPEG2
                 code, power dissipation, and average memory latency.
                 Restructured speech recognition code was compiled with
                 directives, and IPC of 2.72 is estimated.",
  acknowledgement = ack-nhfb,
}

@Article{Metzner:2000:MMR,
  author =       "A. Metzner and J. Niehaus",
  title =        "{MSparc}: Multithreading in Real-Time Architectures",
  journal =      j-J-UCS,
  volume =       "6",
  number =       "10",
  pages =        "1034--1051",
  day =          "28",
  month =        oct,
  year =         "2000",
  CODEN =        "????",
  ISSN =         "0948-695X (print), 0948-6968 (electronic)",
  ISSN-L =       "0948-6968",
  bibdate =      "Wed Feb 20 07:23:07 MST 2002",
  bibsource =    "http://www.jucs.org/jucs;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.jucs.org/jucs_6_10/msparc_multithreading_in_real",
  acknowledgement = ack-nhfb,
  fjournal =     "J.UCS: Journal of Universal Computer Science",
  journal-URL =  "http://www.jucs.org/jucs",
}

@Article{Mohamed:2000:DDM,
  author =       "A. S. Mohamed and A. Galal and I. Khalil and K. Sobh
                 and M. Selim",
  title =        "{Dispo}: Distributed Multi-Threaded Execution of
                 {Prolog} Programs",
  journal =      j-INT-J-COMPUT-APPL,
  volume =       "22",
  number =       "2",
  pages =        "100--108",
  year =         "2000",
  DOI =          "https://doi.org/10.1080/1206212X.2000.11441606",
  ISSN =         "1206-212X (print), 1925-7074 (electronic)",
  ISSN-L =       "1206-212X",
  bibdate =      "Sat Apr 21 17:19:15 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ijca.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://www.tandfonline.com/doi/full/10.1080/1206212X.2000.11441606",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Computer Applications",
  journal-URL =  "https://www.tandfonline.com/loi/tjca20",
  online-date =  "10 Jul 2015",
}

@Article{Mount:2000:ADP,
  author =       "John Mount",
  title =        "Automatic Detection Of Potential Deadlock",
  journal =      j-DDJ,
  volume =       "25",
  number =       "12",
  pages =        "64, 66--70, 72",
  month =        dec,
  year =         "2000",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Wed Nov 8 15:09:25 MST 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/ftp/2000/2000_12/deadlock.txt;
                 http://www.ddj.com/ftp/2000/2000_12/deadlock.zip",
  abstract =     "Deadlock can occur when a number of consumers
                 (typically threads) access a set of resources in an
                 unacceptable pattern. To combat it, John presents a
                 solution based on run-time lock analysis that analyzes
                 all transactions. Additional resources include
                 deadlock.txt (listings) and deadlock.zip (source
                 code).",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Nemeth:2000:AMD,
  author =       "Zsolt N{\'e}meth",
  title =        "Abstract machine design on a multithreaded
                 architecture",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "16",
  number =       "6",
  pages =        "705--716",
  month =        apr,
  year =         "2000",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Wed Feb 27 12:41:20 MST 2002",
  bibsource =    "http://www.elsevier.com/locate/issn/0167739X;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.elsevier.com/gej-ng/10/19/19/41/29/36/abstract.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
}

@Article{Nielsen:2000:MTN,
  author =       "Ida M. B. Nielsen and Curtis L. Janssen",
  title =        "Multi-threading: a new dimension to massively parallel
                 scientific computation",
  journal =      j-COMP-PHYS-COMM,
  volume =       "128",
  number =       "1--2",
  pages =        "238--244",
  day =          "9",
  month =        jun,
  year =         "2000",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/S0010-4655(00)00062-X",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Mon Feb 13 23:40:43 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S001046550000062X",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Oyama:2000:OCC,
  author =       "Yoshihiro Oyama and Kenjiro Taura and Akinori
                 Yonezawa",
  title =        "Online Computation of Critical Paths for Multithreaded
                 Languages",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1800",
  pages =        "301--??",
  year =         "2000",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 09:16:18 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18000301.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1800/18000301.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Peterson:2000:CCT,
  author =       "Mark Peterson",
  title =        "{C/C++} Tips: Tip \#4: Self Destructing Threads",
  journal =      j-CCCUJ,
  volume =       "18",
  number =       "12",
  pages =        "44--??",
  month =        dec,
  year =         "2000",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:29 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/2000/0012/0012toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "A way to make threads easier to manage.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Pulleyn:2000:EPM,
  author =       "Ivan Pulleyn",
  title =        "Embedding {Python} in Multi-Threaded {C\slash C++}
                 Applications",
  journal =      j-LINUX-J,
  volume =       "73",
  pages =        "??--??",
  month =        may,
  year =         "2000",
  CODEN =        "LIJOFX",
  ISSN =         "1075-3583 (print), 1938-3827 (electronic)",
  ISSN-L =       "1075-3583",
  bibdate =      "Thu Sep 21 07:44:12 MDT 2000",
  bibsource =    "http://noframes.linuxjournal.com/lj-issues/issue73/index.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Python provides a clean intuitive interface to
                 complex, threaded applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "Linux journal",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J508",
}

@InProceedings{Ranganathan:2000:AMT,
  author =       "M. Ranganathan and Mark Bednarek and Fernand Pors and
                 Doug Montgomery",
  title =        "{AGNI}: a Multi-threaded Middleware for Distributed
                 Scripting",
  crossref =     "USENIX:2000:PUT",
  pages =        "??--??",
  year =         "2000",
  bibdate =      "Wed Oct 16 05:17:16 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/usenix2000.bib",
  URL =          "http://db.usenix.org/publications/library/proceedings/tcl2k/ranganathan.html",
  acknowledgement = ack-nhfb,
}

@Article{Redstone:2000:AOSa,
  author =       "Joshua A. Redstone and Susan J. Eggers and Henry M.
                 Levy",
  title =        "An analysis of operating system behavior on a
                 simultaneous multithreaded architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "28",
  number =       "5",
  pages =        "245--256",
  month =        dec,
  year =         "2000",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:22 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Redstone:2000:AOSb,
  author =       "Joshua A. Redstone and Susan J. Eggers and Henry M.
                 Levy",
  title =        "An Analysis of Operating System Behavior on a
                 Simultaneous Multithreaded Architecture",
  journal =      j-SIGPLAN,
  volume =       "35",
  number =       "11",
  pages =        "245--256",
  month =        nov,
  year =         "2000",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:18:19 MST 2003",
  bibsource =    "http://foothill.lcs.mit.edu/asplos2k/program.html;
                 http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Redstone:2000:AOSc,
  author =       "Joshua A. Redstone and Susan J. Eggers and Henry M.
                 Levy",
  title =        "An analysis of operating system behavior on a
                 simultaneous multithreaded architecture",
  journal =      j-OPER-SYS-REV,
  volume =       "34",
  number =       "5",
  pages =        "245--256",
  month =        dec,
  year =         "2000",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
}

@Article{Reinhardt:2000:TFD,
  author =       "Steven K. Reinhardt and Shubhendu S. Mukherjee",
  title =        "Transient fault detection via simultaneous
                 multithreading",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "28",
  number =       "2",
  pages =        "25--36",
  month =        may,
  year =         "2000",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:49 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Samorodin:2000:SFS,
  author =       "Steven H. Samorodin and Raju Pandey",
  title =        "Supporting Flexible Safety and Sharing in
                 Multi-threaded Environments",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1800",
  pages =        "1184--??",
  year =         "2000",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 09:16:18 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18001184.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1800/18001184.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Shinjo:2000:DCEa,
  author =       "Yasushi Shinjo and Calton Pu",
  title =        "Developing correct and efficient multithreaded
                 programs with thread-specific data and a partial
                 evaluator",
  journal =      j-OPER-SYS-REV,
  volume =       "34",
  number =       "2",
  pages =        "33--33",
  month =        apr,
  year =         "2000",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:42 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
}

@Article{Shinjo:2000:DCEb,
  author =       "Yasushi Shinjo",
  title =        "Developing correct and efficient multithreaded
                 programs with thread-specific data and a partial
                 evaluator",
  journal =      j-OPER-SYS-REV,
  volume =       "34",
  number =       "2",
  pages =        "40--40",
  month =        apr,
  year =         "2000",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:42 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
}

@Article{Snavely:2000:SJSa,
  author =       "Allan Snavely and Dean M. Tullsen",
  title =        "Symbiotic job scheduling for a simultaneous
                 multithreaded processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "28",
  number =       "5",
  pages =        "234--244",
  month =        dec,
  year =         "2000",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:22 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Snavely:2000:SJSb,
  author =       "Allan Snavely and Dean M. Tullsen",
  title =        "Symbiotic Jobscheduling for a Simultaneous
                 Multithreading Processor",
  journal =      j-SIGPLAN,
  volume =       "35",
  number =       "11",
  pages =        "234--244",
  month =        nov,
  year =         "2000",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:18:19 MST 2003",
  bibsource =    "http://foothill.lcs.mit.edu/asplos2k/program.html;
                 http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Snavely:2000:SJSc,
  author =       "Allan Snavely and Dean M. Tullsen",
  title =        "Symbiotic jobscheduling for a simultaneous
                 multithreaded processor",
  journal =      j-OPER-SYS-REV,
  volume =       "34",
  number =       "5",
  pages =        "234--244",
  month =        dec,
  year =         "2000",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
}

@Article{Steffan:2000:SAT,
  author =       "J. Gregory Steffan and Christopher B. Colohan and
                 Antonia Zhai and Todd C. Mowry",
  title =        "A scalable approach to thread-level speculation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "28",
  number =       "2",
  pages =        "1--12",
  month =        may,
  year =         "2000",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:49 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Tan:2000:PEN,
  author =       "Kian-Lee Tan and Cheng Hian Goh and Beng Chin Ooi",
  title =        "Progressive evaluation of nested aggregate queries",
  journal =      j-VLDB-J,
  volume =       "9",
  number =       "3",
  pages =        "261--278",
  month =        dec,
  year =         "2000",
  CODEN =        "VLDBFR",
  ISSN =         "1066-8888 (print), 0949-877X (electronic)",
  ISSN-L =       "1066-8888",
  bibdate =      "Mon Jun 23 10:50:54 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In many decision-making scenarios, decision makers
                 require rapid feedback to their queries, which
                 typically involve aggregates. The traditional {\em
                 blocking execution model\/} can no longer meet the
                 demands of these users. One promising approach in the
                 literature, called {\em online aggregation}, evaluates
                 an aggregation query progressively as follows: as soon
                 as certain data have been evaluated, approximate
                 answers are produced with their respective running
                 confidence intervals; as more data are examined, the
                 answers and their corresponding running confidence
                 intervals are refined. In this paper, we extend this
                 approach to handle nested queries with aggregates
                 (i.e., at least one inner query block is an aggregate
                 query) by providing users with (approximate) answers
                 progressively as the inner aggregation query blocks are
                 evaluated. We address the new issues pose by nested
                 queries. In particular, the answer space begins with a
                 superset of the final answers and is refined as the
                 aggregates from the inner query blocks are refined. For
                 the intermediary answers to be meaningful, they have to
                 be interpreted with the aggregates from the inner
                 queries. We also propose a {\em multi-threaded model\/}
                 in evaluating such queries: each query block is
                 assigned to a thread, and the threads can be evaluated
                 concurrently and independently. The time slice across
                 the threads is {\em nondeterministic\/} in the sense
                 that the user controls the relative rate at which these
                 subqueries are being evaluated. For {\em enumerative\/}
                 nested queries, we propose a priority-based evaluation
                 strategy to present answers that are certainly in the
                 final answer space first, before presenting those whose
                 validity may be affected as the inner query aggregates
                 are refined. We implemented a prototype system using
                 Java and evaluated our system. Results for nested
                 queries with a level and multiple levels of nesting are
                 reported. Our results show the effectiveness of the
                 proposed mechanisms in providing progressive feedback
                 that reduces the initial waiting time of users
                 significantly without sacrificing the quality of the
                 answers.",
  acknowledgement = ack-nhfb,
  fjournal =     "VLDB Journal: Very Large Data Bases",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J869",
  keywords =     "approximate answers; multi-threading; nested aggregate
                 queries; online aggregation; progressive query
                 processing",
}

@Article{Tang:2000:PTR,
  author =       "Hong Tang and Kai Shen and Tao Yang",
  title =        "Program transformation and runtime support for
                 threaded {MPI} execution on shared-memory machines",
  journal =      j-TOPLAS,
  volume =       "22",
  number =       "4",
  pages =        "673--700",
  year =         "2000",
  CODEN =        "ATPSDT",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Tue Apr 17 10:05:24 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/toplas/2000-22-4/p673-tang/",
  abstract =     "Parallel programs written in MPI have been widely used
                 for developing high-performance applications on various
                 platforms. Because of a restriction of the MPI
                 computation model, conventional MPI implementations on
                 shared-memory machines map each MPI node to an OS
                 process, which can suffer serious performance
                 degradation in the presence of multiprogramming. This
                 paper studies compile-time and runtime techniques for
                 enhancing performance portability of MPI code running
                 on multiprogrammed shared-memory machines. The proposed
                 techniques allow MPI nodes to be executed safely and
                 efficiently as threads. Compile-time transformation
                 eliminates global and static variables in C code using
                 node-specific data. The runtime support includes an
                 efficient and provably correct communication protocol
                 that uses lock-free data structure and takes advantage
                 of address space sharing among threads. The experiments
                 on SGI Origin 2000 show that our MPI prototype called
                 TMPI using the proposed techniques is competitive with
                 SGI's native MPI implementation in a dedicated
                 environment, and that it has significant performance
                 advantages in a multiprogrammed environment.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  generalterms = "Algorithms; Design; Experimentation; Languages;
                 Performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
  keywords =     "lock-free synchronization; MPI; multiprogrammed
                 environments; program transformation; shared-memory
                 machines; threaded execution",
  subject =      "Hardware --- Memory Structures --- Design Styles
                 (B.3.2): {\bf Shared memory}; Software --- Programming
                 Techniques --- Concurrent Programming (D.1.3): {\bf
                 Parallel programming}; Software --- Programming
                 Languages --- Language Classifications (D.3.2): {\bf
                 Concurrent, distributed, and parallel languages};
                 Software --- Programming Languages --- Processors
                 (D.3.4): {\bf Preprocessors}; Software --- Programming
                 Languages --- Processors (D.3.4): {\bf Run-time
                 environments}; Software --- Operating Systems ---
                 Process Management (D.4.1): {\bf
                 Multiprocessing/multiprogramming/multitasking}; Data
                 --- Data Structures (E.1): {\bf Lists, stacks, and
                 queues}",
}

@InProceedings{Theobald:2000:LCE,
  author =       "Kevin B. Theobald and Gagan Agrawal and Rishi Kumar
                 and Gerd Heber and Guang R. Gao and Paul Stodghill and
                 Keshav Pingali",
  title =        "Landing {CG} on {EARTH}: a Case Study of
                 Fine-Grained Multithreading on an Evolutionary Path",
  crossref =     "ACM:2000:SHP",
  pages =        "47--47",
  year =         "2000",
  bibdate =      "Mon Feb 12 11:57:42 2001",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sc2000.org/proceedings/techpapr/papers/pap293.pdf",
  acknowledgement = ack-nhfb,
}

@Article{Unger:2000:CCA,
  author =       "A. Unger and E. Zehendner and Th. Ungerer",
  title =        "A combined compiler and architecture technique to
                 control multithreaded execution of branches and loop
                 iterations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "28",
  number =       "1",
  pages =        "53--61",
  month =        mar,
  year =         "2000",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:36 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@InProceedings{Vckovski:2000:MTS,
  author =       "Andrej Vckovski and Jason Brazile",
  title =        "A Multi-Threaded Server for Shared Hash Table Access",
  crossref =     "USENIX:2000:PUT",
  pages =        "??--??",
  year =         "2000",
  bibdate =      "Wed Oct 16 05:17:16 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/usenix2000.bib",
  URL =          "http://db.usenix.org/publications/library/proceedings/tcl2k/vckovski.html",
  acknowledgement = ack-nhfb,
}

@Article{Vishkin:2000:ELR,
  author =       "Shlomit Dascal and Uzi Vishkin",
  title =        "Experiments with list ranking for explicit
                 multi-threaded {(XMT)} instruction parallelism",
  journal =      j-ACM-J-EXP-ALGORITHMICS,
  volume =       "5",
  pages =        "10:1--10:??",
  month =        "????",
  year =         "2000",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/351827.384252",
  ISSN =         "1084-6654",
  bibdate =      "Mon Oct 6 16:03:09 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Algorithms for the problem of list ranking are
                 empirically studied with respect to the Explicit
                 Multi-Threaded (XMT) platform for instruction-level
                 parallelism (ILP). The main goal of this study is to
                 understand the differences between XMT and more
                 traditional parallel computing implementation
                 platforms/models as they pertain to the well studied
                 list ranking problem. The main two findings are: (i)
                 good speedups for much smaller inputs are possible and
                 (ii) in part, the first finding is based on a new
                 variant of a 1984 algorithm, called the No-Cut
                 algorithm. The paper incorporates analytic
                 (non-asymptotic) performance analysis into experimental
                 performance analysis for relatively small inputs. This
                 provides an interesting example where experimental
                 research and theoretical analysis complement one
                 another. Explicit Multi-Threading (XMT) is a
                 fine-grained computation framework introduced in our
                 SPAA'98 paper. Building on some key ideas of parallel
                 computing, XMT covers the spectrum from algorithms
                 through architecture to implementation; the main
                 implementation related innovation in XMT was through
                 the incorporation of low-overhead hardware and software
                 mechanisms (for more effective fine-grained
                 parallelism). The reader is referred to that paper for
                 detail on these mechanisms. The XMT platform aims at
                 faster single-task completion time by way of ILP.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "Journal of Experimental Algorithmics (JEA)",
}

@Book{Walmsley:2000:MTP,
  author =       "Mark Walmsley",
  title =        "Multi-threaded programming in {C++}",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "x + 223",
  year =         "2000",
  ISBN =         "1-85233-146-1",
  ISBN-13 =      "978-1-85233-146-7",
  LCCN =         "QA76.73.C153 W3148 2000",
  bibdate =      "Sat Apr 20 11:14:00 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  price =        "US\$49.95",
  acknowledgement = ack-nhfb,
}

@Article{Wilson:2000:PBC,
  author =       "Gregory V. Wilson",
  title =        "Programmer's Bookshelf: Classics Old and New",
  journal =      j-DDJ,
  volume =       "25",
  number =       "11",
  pages =        "159--160",
  month =        nov,
  year =         "2000",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Wed Nov 8 15:09:25 MST 2000",
  bibsource =    "http://www.ddj.com/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This month Greg looks at Programming Pearls, Second
                 Edition, by Jon Bentley; Foundations of Multithreaded,
                 Parallel, and Distributed Programming, by Gregory R.
                 Andrews; GUI Bloopers, by Jeff Johnson; The Humane
                 Interface, by Jef Raskin; Legal Battles That Shaped the
                 Software Industry, by Lawrence D. Graham; The World of
                 Scripting Languages, by David Barron; C for Java
                 Programmers, by Tomasz Muldner; and XML Elements of
                 Style, by Simon St. Laurent.",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Zhang:2000:WMH,
  author =       "Peter Zhang",
  title =        "{Webrelay}: a Multithreaded {HTTP} Relay Server",
  journal =      j-DDJ,
  volume =       "25",
  number =       "2",
  pages =        "86, 88, 90--94, 96",
  month =        feb,
  year =         "2000",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Nov 9 08:25:13 MST 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/ftp/2000/2000_02/webrelay.txt;
                 http://www.ddj.com/ftp/2000/2000_02/webrelay.zip",
  abstract =     "Webrelay is a freely available multithreaded HTTP
                 relay server that authenticates that clients are
                 legitimate users before they are connected to vendor
                 web servers. Additional resources include webrelay.txt
                 (listings) and webrelay.zip (source code).",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Anonymous:2001:ESM,
  author =       "Anonymous",
  title =        "Errata: {``Speculative Multithreaded Processors''}",
  journal =      j-COMPUTER,
  volume =       "34",
  number =       "5",
  pages =        "7--7",
  month =        may,
  year =         "2001",
  CODEN =        "CPTRB4",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Fri May 4 17:53:39 MDT 2001",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/computer2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "See \cite{Sohi:2001:SMP}.",
  URL =          "http://dlib.computer.org/co/books/co2001/pdf/r5004.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
}

@Article{Antoniu:2001:CMJ,
  author =       "Gabriel Antoniu and Luc Boug{\'e} and Philip Hatcher
                 and Mark MacBeth and Keith McGuigan and Raymond
                 Namyst",
  title =        "Compiling Multithreaded {Java} Bytecode for
                 Distributed Execution (Distinguished Paper)",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1900",
  pages =        "1039--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:02:44 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1900.htm;
                 https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1900/19001039.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1900/19001039.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Antoniu:2001:DPP,
  author =       "Gabriel Antoniu and Luc Boug{\'e}",
  title =        "{DSM-PM2}: a Portable Implementation Platform for
                 Multithreaded {DSM} Consistency Protocols",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2026",
  pages =        "55--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:03:43 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2026.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2026/20260055.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2026/20260055.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Antoniu:2001:HSC,
  author =       "Gabriel Antoniu and others",
  title =        "The {Hyperion} system: {Compiling} multithreaded
                 {Java} bytecode for distributed execution",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "27",
  number =       "10",
  pages =        "1279--1297",
  month =        sep,
  year =         "2001",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Fri Feb 22 16:52:42 MST 2002",
  bibsource =    "http://www.elsevier.com/locate/issn/01678191;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.elsevier.com/gej-ng/10/35/21/47/40/27/abstract.html;
                 http://www.elsevier.nl/gej-ng/10/35/21/47/40/27/article.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Attali:2001:GVJ,
  author =       "Isabelle Attali and Denis Caromel and Marjorie Russo",
  title =        "Graphical Visualization of {Java} Objects, Threads,
                 and Locks",
  journal =      j-IEEE-DISTRIB-SYST-ONLINE,
  volume =       "2",
  number =       "1",
  year =         "2001",
  ISSN =         "1541-4922 (print), 1558-1683 (electronic)",
  ISSN-L =       "1541-4922",
  bibdate =      "Wed Oct 23 17:47:56 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://dsonline.computer.org/0101/features/att0101_print.htm",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Distributed Systems Online",
}

@Article{Ball:2001:PVM,
  author =       "Thomas Ball and Sagar Chaki and Sriram K. Rajamani",
  title =        "Parameterized Verification of Multithreaded Software
                 Libraries",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2031",
  pages =        "158--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:03:48 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2031.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2031/20310158.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2031/20310158.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Becker:2001:SMW,
  author =       "Thomas Becker",
  title =        "Synchronization Monitors For {Win32}",
  journal =      j-DDJ,
  volume =       "26",
  number =       "12",
  pages =        "46, 48, 50--52, 54",
  month =        dec,
  year =         "2001",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Feb 12 05:21:41 MST 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/ftp/2001/2001_12/monitor.txt;
                 http://www.ddj.com/ftp/2001/2001_12/monitor.zip",
  abstract =     "Thomas presents a Java-style synchronization monitor
                 for multithreaded Win32 development. Additional
                 resources include {\tt monitor.txt} (listings) and {\tt
                 monitor.zip} (source code).",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Broberg:2001:POU,
  author =       "Magnus Broberg and Lars Lundberg and H{\aa}kan Grahn",
  title =        "Performance Optimization Using Extended Critical Path
                 Analysis in Multithreaded Programs on Multiprocessors",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "61",
  number =       "1",
  pages =        "115--136",
  day =          "1",
  month =        jan,
  year =         "2001",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.2000.1667",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Feb 22 15:30:35 MST 2002",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1667;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1667/pdf;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1667/ref",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Brunst:2001:GBP,
  author =       "Holger Brunst and Wolfgang E. Nagel and Hans-Christian
                 Hoppe",
  title =        "Group-Based Performance Analysis for Multithreaded
                 {SMP} Cluster Applications",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2150",
  pages =        "148--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:05:53 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2150.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2150/21500148.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2150/21500148.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Bull:2001:MSO,
  author =       "J. Mark Bull and Darragh O'Neill",
  title =        "A microbenchmark suite for {OpenMP 2.0}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "29",
  number =       "5",
  pages =        "41--48",
  month =        dec,
  year =         "2001",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:22 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Book{Chandra:2001:PPO,
  author =       "Rohit Chandra and Leonardo Dagum and David Kohr and
                 Dror Maydan and Jeff McDonald and Ramesh Menon",
  title =        "Parallel Programming in {OpenMP}",
  publisher =    pub-MORGAN-KAUFMANN,
  address =      pub-MORGAN-KAUFMANN:adr,
  pages =        "xvi + 230",
  year =         "2001",
  ISBN =         "1-55860-671-8",
  ISBN-13 =      "978-1-55860-671-5",
  LCCN =         "QA76.642 .P38 2001",
  bibdate =      "Thu Jul 14 11:09:17 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/unix.bib",
  price =        "US\$39.95",
  URL =          "http://www.mkp.com/books_catalog/catalog.asp?ISBN=1-55860-671-8",
  abstract =     "The rapid and widespread acceptance of shared memory
                 multiprocessor architectures has created a pressing
                 demand for an efficient way to program these systems.
                 At the same time, developers of technical and
                 scientific applications in industry and in government
                 laboratories find they need to parallelize huge volumes
                 of code in a portable fashion. OpenMP, developed
                 jointly by several parallel computing vendors to
                 address these issues, is an industry-wide standard for
                 programming shared-memory and distributed shared-memory
                 multiprocessors. It consists of a set of compiler
                 directives and library routines that extend FORTRAN, C,
                 and C++ codes to express shared-memory parallelism.
                 Parallel Programming in OpenMP is the first book to
                 teach both the novice and expert parallel programmers
                 how to program using this new standard. The authors,
                 who helped design and implement OpenMP while at SGI,
                 bring a depth and breadth to the book as compiler
                 writers, application developers, and performance
                 engineers.",
  acknowledgement = ack-nhfb,
  keywords =     "parallel programming (computer science)",
  tableofcontents = "Foreword \\
                 Preface \\
                 1: Introduction \\
                 Performance with OpenMP \\
                 A first glimpse of OpenMP \\
                 The OpenMP parallel computer \\
                 Why OpenMP \\
                 History of OpenMP \\
                 Navigating the rest of the book \\
                 2: Getting started with OpenMP \\
                 3: Exploiting loop-level parallelism \\
                 Meaning of the parallel do directive \\
                 Controlling data sharing \\
                 Removing data dependences \\
                 Enhancing performance \\
                 4: Beyond loop-level parallelism, parallel regions \\
                 5: Synchronization \\
                 6: Performance",
}

@Article{ChassindeKergommeaux:2001:PEE,
  author =       "Jacques {Chassin de Kergommeaux} and Benhur de
                 Oliveira Stein",
  title =        "Paj{\'e}: An Extensible Environment for Visualizing
                 Multi-threaded Programs Executions",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1900",
  pages =        "133--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:02:44 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1900.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1900/19000133.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1900/19000133.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Christiaens:2001:JRR,
  author =       "Mark Christiaens",
  title =        "{JaRec}: Record\slash Replay for Multi-threaded {Java}
                 Programs",
  crossref =     "USENIX:2001:PJV",
  pages =        "??--??",
  year =         "2001",
  bibdate =      "Tue Oct 15 17:45:19 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/usenix2000.bib",
  URL =          "http://www.usenix.org/publications/library/proceedings/jvm01/JVM_wips/S07.pdf",
  acknowledgement = ack-nhfb,
}

@Article{Duncan:2001:LPD,
  author =       "Ray Duncan and Duncan Harris and Douglas Reilly and
                 Craig Rodrigues and Michael Birken and Paul S. Person",
  title =        "Letters: Plug-in Desupport; Threading and the {.Net}
                 Framework; {CORBA} Interoperability; Game Over for
                 {Java}; Totally Wired",
  journal =      j-DDJ,
  volume =       "26",
  number =       "11",
  pages =        "10, 12",
  month =        nov,
  year =         "2001",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Feb 12 05:21:40 MST 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@InProceedings{Edelstein:2001:MJP,
  author =       "Orit Edelstein and Eitan Farchi and Yarden Nir and Gil
                 Ratsaby and Shmuel Ur",
  title =        "Multithreaded {Java} Program Test Generation",
  crossref =     "ACM:2001:PAJ",
  pages =        "181--181",
  year =         "2001",
  bibdate =      "Mon May 06 09:31:01 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.philippsen.com/JGI2001/camerareadyabstracts/18.html;
                 http://www.philippsen.com/JGI2001/finalpapers/18500181.ps",
  acknowledgement = ack-nhfb,
  keywords =     "Java",
}

@Article{Elwasif:2001:AMT,
  author =       "Wael R. Elwasif and David E. Bernholdt and James A.
                 Kohl and G. A. Geist",
  title =        "An Architecture for a Multi-threaded Harness Kernel",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2131",
  pages =        "126--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Fri Feb 1 08:13:55 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310126.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2131/21310126.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Evripidou:2001:MDD,
  author =       "Paraskevas Evripidou",
  title =        "{$ D^3 $-Machine}: a decoupled data-driven
                 multithreaded architecture with variable resolution
                 support",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "27",
  number =       "9",
  pages =        "1197--1225",
  month =        aug,
  year =         "2001",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Wed Jul 18 06:31:16 MDT 2001",
  bibsource =    "http://www.elsevier.com/locate/issn/01678191;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.elsevier.nl/gej-ng/10/35/21/47/35/25/abstract.html;
                 http://www.elsevier.nl/gej-ng/10/35/21/47/35/25/article.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Figueiredo:2001:IPH,
  author =       "Renato J. O. Figueiredo and Jeffrey P. Bradford and
                 Jos{\'e} A. B. Fortes",
  title =        "Improving the Performance of Heterogeneous {DSMs} via
                 Multithreading",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1981",
  pages =        "168--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:03:02 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1981.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1981/19810168.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1981/19810168.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Garber:2001:NBT,
  author =       "Lee Garber",
  title =        "News Briefs: Is Tech Downturn Changing Education and
                 Employment Trends; {HTMT} Promises High-Performance
                 Computing; Controversial Software Law [{UCITA}] Hits
                 Resistance",
  journal =      j-COMPUTER,
  volume =       "34",
  number =       "10",
  pages =        "19--21",
  month =        oct,
  year =         "2001",
  CODEN =        "CPTRB4",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Fri Feb 8 07:11:46 MST 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://dlib.computer.org/co/books/co2001/pdf/rx019.pdf;
                 http://www.computer.org/computer/co2001/rx019abs.htm",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
  keywords =     "hybrid technology multithreaded architecture (HTMT);
                 Uniform Computer Information Transactions Act (UCITA)",
}

@Article{Geiselbrecht:2001:NOS,
  author =       "Travis K. Geiselbrecht",
  title =        "The {NewOS} Operating System",
  journal =      j-DDJ,
  volume =       "26",
  number =       "12",
  pages =        "33, 35, 38, 40, 42, 44",
  month =        dec,
  year =         "2001",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Feb 12 05:21:41 MST 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "See correction \cite{Editors:2002:LUC}.",
  URL =          "http://www.ddj.com/ftp/2001/2001_12/newos.txt;
                 http://www.ddj.com/ftp/2001/2001_12/newos.zip",
  abstract =     "NewOS is a freely available lightweight operating
                 system written in C for platforms ranging from Intel-
                 and AMD-based PCs to the Sega Dreamcast. Additional
                 resources include {\tt newos.txt} (listings) and {\tt
                 newos.zip} (source code).",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Goeschl:2001:JTT,
  author =       "Siegfried Goeschl",
  title =        "The {JUnit++} Testing Tool",
  journal =      j-DDJ,
  volume =       "26",
  number =       "2",
  pages =        "34, 36--38",
  month =        feb,
  year =         "2001",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Feb 15 12:14:41 MST 2001",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/ftp/2001/2001_02/junitpp.txt;
                 http://www.ddj.com/ftp/2001/2001_02/junitpp.zip",
  abstract =     "JUnit++ is a freely available Java unit test framework
                 that includes a test data repository, command-line
                 arguments, and a TestRunner class that supports a
                 built-in repetition counter and multithreading at the
                 command line. Additional resources include junitpp.txt
                 (listings) and junitpp.zip (source code).",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@InProceedings{Hanson:2001:UFI,
  author =       "Richard J. Hanson and Clay P. Breshears and Henry A.
                 Gabb",
  title =        "Using a {Fortran} Interface to {POSIX} Threads",
  crossref =     "Boisvert:2001:ASS",
  pages =        "257--272",
  year =         "2001",
  bibdate =      "Sat Dec 29 09:54:37 2007",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Huber:2001:EFC,
  author =       "Andreas Huber",
  title =        "Elegant Function Call Wrappers",
  journal =      j-CCCUJ,
  volume =       "19",
  number =       "5",
  pages =        "8--??",
  month =        may,
  year =         "2001",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:31 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/2001/0105/0105toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Scheduling functions for later execution is an obvious
                 requirement in multithreaded programs. How to do that
                 and preserve both type safety and modularity is not so
                 obvious. The author combines an old pattern and some
                 new template techniques to pull it off rather nicely.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Ishihara:2001:CCP,
  author =       "Takashi Ishihara and Tiejun Li and Eugene F. Fodor and
                 Ronald A. Olsson",
  title =        "A Comparison of Concurrent Programming and Cooperative
                 Multithreading",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1900",
  pages =        "729--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:02:44 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1900.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1900/19000729.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1900/19000729.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Iwama:2001:ICB,
  author =       "Chitaka Iwama and Niko Demus Barli and Shuichi Sakai
                 and Hidehiko Tanaka",
  title =        "Improving Conditional Branch Prediction on Speculative
                 Multithreading Architectures",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2150",
  pages =        "413--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:05:53 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2150.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2150/21500413.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2150/21500413.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Iwata:2001:PMT,
  author =       "Kazunori Iwata and Shingo Itabashi and Naohiro Ishii",
  title =        "A Protocol for Multi-Threaded Processes with Choice in
                 $ \pi $-Calculus",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2074",
  pages =        "138--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:04:30 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2074.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2074/20740138.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2074/20740138.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Kakulavarapu:2001:DLB,
  author =       "P. Kakulavarapu and O. C. Maquelin and J. N. Amaral
                 and G. R. Gao",
  title =        "Dynamic Load Balancers for a Multithreaded
                 Multiprocessor System",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "11",
  number =       "1",
  pages =        "169--??",
  month =        mar,
  year =         "2001",
  CODEN =        "PPLTEE",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Sat Feb 23 19:27:51 MST 2002",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/ppl.shtml;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Kienzle:2001:CTT,
  author =       "J{\"o}rg Kienzle and Alexander Romanovsky",
  title =        "Combining tasking and transactions, part {II}: open
                 multithreaded transactions",
  journal =      j-SIGADA-LETTERS,
  volume =       "21",
  number =       "1",
  pages =        "67--74",
  month =        mar,
  year =         "2001",
  CODEN =        "AALEE5",
  ISSN =         "1094-3641 (print), 1557-9476 (electronic)",
  ISSN-L =       "1094-3641",
  bibdate =      "Sat Aug 9 09:06:10 MDT 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGAda Ada Letters",
}

@Article{Kienzle:2001:IEO,
  author =       "J{\"o}rg Kienzle and Alexander Romanovsky",
  title =        "Implementing exceptions in open multithreaded
                 transactions based on {Ada 95} exceptions",
  journal =      j-SIGADA-LETTERS,
  volume =       "21",
  number =       "3",
  pages =        "57--63",
  month =        sep,
  year =         "2001",
  CODEN =        "AALEE5",
  ISSN =         "1094-3641 (print), 1557-9476 (electronic)",
  ISSN-L =       "1094-3641",
  bibdate =      "Sat Aug 9 09:06:11 MDT 2003",
  bibsource =    "http://www.acm.org/sigada/ada_letters/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGAda Ada Letters",
}

@Article{Legrand:2001:MTD,
  author =       "Iosif {Legrand, on behalf of the MONARC
                 Collaboration}",
  title =        "Multi-threaded, discrete event simulation of
                 distributed computing systems",
  journal =      j-COMP-PHYS-COMM,
  volume =       "140",
  number =       "1--2",
  pages =        "274--285",
  day =          "15",
  month =        oct,
  year =         "2001",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/S0010-4655(01)00281-8",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Mon Feb 13 23:41:04 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465501002818",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Lopes:2001:FGM,
  author =       "L. Lopes and V. T. Vasconcelos and F. Silva",
  title =        "Fine-grained multithreading with process calculi",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "50",
  number =       "8",
  pages =        "852--862",
  month =        aug,
  year =         "2001",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/12.947014",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Tue Jul 5 10:03:11 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=947014",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Luk:2001:TML,
  author =       "Chi-Keung Luk",
  title =        "Tolerating memory latency through software-controlled
                 pre-execution in simultaneous multithreading
                 processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "29",
  number =       "2",
  pages =        "40--51",
  month =        may,
  year =         "2001",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:50 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@InProceedings{Manson:2001:CSM,
  author =       "Jeremy Manson and William Pugh",
  title =        "Core Semantics of Multithreaded {Java}",
  crossref =     "ACM:2001:PAJ",
  pages =        "29--38",
  year =         "2001",
  bibdate =      "Mon May 06 09:31:01 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.philippsen.com/JGI2001/camerareadyabstracts/42.html;
                 http://www.philippsen.com/JGI2001/finalpapers/18500029.pdf",
  acknowledgement = ack-nhfb,
  keywords =     "Java",
}

@Book{Mauro:2001:SIC,
  author =       "Jim Mauro and Richard McDougall",
  title =        "{Solaris} Internals: Core Kernel Architecture",
  publisher =    pub-SUN-MICROSYSTEMS-PRESS,
  address =      pub-SUN-MICROSYSTEMS-PRESS:adr,
  pages =        "xli + 657",
  year =         "2001",
  ISBN =         "0-13-022496-0",
  ISBN-13 =      "978-0-13-022496-5",
  LCCN =         "QA76.76.O63 M37195 2001",
  bibdate =      "Fri Apr 11 16:56:49 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/master.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/unix.bib",
  series =       "Sun BluePrints Program",
  URL =          "http://www.sun.com/books/catalog/mauro/index.html",
  acknowledgement = ack-nhfb,
  libnote =      "Not in my library.",
  shorttableofcontents = "The Solaris Memory System \\
                 Threads, Processes, and IPC \\
                 Files and File Systems \\
                 Kernel Tunables, Switches, and Limits \\
                 Kernel Virtual Address Maps",
  tableofcontents = "List of Header Files \\
                 Part 1: Introduction to Solaris Internals \\
                 1: An Introduction to Solaris \\
                 2: Kernel Services \\
                 3: Kernel Synchronization Primitives \\
                 4: Kernel Bootstrap and Initialization \\
                 Part 2: The Solaris Memory System \\
                 5: Solaris Memory Architecture \\
                 6: Kernel Memory \\
                 7: Memory Monitoring \\
                 Part 3: Threads, Processes, and IPC \\
                 8: The Solaris Multithreaded Process Architecture \\
                 9: The Solaris Kernel dispatcher \\
                 10: Interprocess Communication \\
                 Part 4: Files and File Systems \\
                 11: Solaris Files and File I/O \\
                 12: File System Overview \\
                 13: File System Framework \\
                 14: The UNIX File System \\
                 15: Solaris File System Cache \\
                 Appendix A: Kernel Tunables, Switches, and Limits \\
                 Appendix B: Kernel Virtual Address Maps \\
                 Appendix C: A Sample Procfs Utility",
}

@Article{Nagle:2001:MFV,
  author =       "Dan Nagle",
  title =        "Multithreading, {Fthreads}, and {Visual Fortran}",
  journal =      j-DDJ,
  volume =       "26",
  number =       "7",
  pages =        "36, 38, 40",
  month =        jul,
  year =         "2001",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Jun 7 06:07:17 MDT 2001",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/ftp/2001/2001_07/fthreads.zip",
  abstract =     "Dan presents a Fortran module that helps you write
                 multithreaded programs for Windows-based applications.
                 Additional resources include fthreads.zip (source
                 code).",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Nakhimovsky:2001:ISM,
  author =       "Greg Nakhimovsky",
  title =        "Improving Scalability of Multithreaded Dynamic Memory
                 Allocation",
  journal =      j-DDJ,
  volume =       "26",
  number =       "7",
  pages =        "44, 46, 48--50, 52, 54",
  month =        jul,
  year =         "2001",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Jun 7 06:07:17 MDT 2001",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/ftp/2001/2001_07/mthot.txt;
                 http://www.ddj.com/ftp/2001/2001_07/mthot.zip",
  abstract =     "Multiprocessor/multithreaded environments add a new
                 dimension to the familiar malloc facility. The
                 ``MT-hot'' implementation Greg presents here lets
                 multiple threads execute in parallel without major
                 delays. Additional resources include mthot.txt
                 (listings) and mthot.zip (source code).",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Nikolopoulos:2001:EMA,
  author =       "D. S. Nikolopoulos and E. Artiaga and E. Ayguad{\'e}
                 and J. Labarta",
  title =        "Exploiting memory affinity in {OpenMP} through
                 schedule reuse",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "29",
  number =       "5",
  pages =        "49--55",
  month =        dec,
  year =         "2001",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:22 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Ozer:2001:WMT,
  author =       "Emre {\"O}zer and Thomas M. Conte and Saurabh Sharma",
  title =        "Weld: a Multithreading Technique Towards
                 Latency-Tolerant {VLIW} Processors",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2228",
  pages =        "192--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:07:14 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2228.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2228/22280192.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2228/22280192.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Pang:2001:PSR,
  author =       "James Pang and Gholamali Shoja and Eric Manning",
  title =        "Providing Soft Real-time {QoS} Guarantees for {Java}
                 Threads",
  crossref =     "ACM:2001:PAJ",
  pages =        "39--46",
  year =         "2001",
  bibdate =      "Mon May 06 09:31:01 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.philippsen.com/JGI2001/camerareadyabstracts/21.html;
                 http://www.philippsen.com/JGI2001/finalpapers/18500039.pdf",
  acknowledgement = ack-nhfb,
  keywords =     "Java",
}

@Article{Parcerisa:2001:ILT,
  author =       "J.-M. Parcerisa and A. Gonzalez",
  title =        "Improving latency tolerance of multithreading through
                 decoupling",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "50",
  number =       "10",
  pages =        "1084--1094",
  month =        oct,
  year =         "2001",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/12.956093",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Tue Jul 5 10:03:12 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=956093",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Plakal:2001:CGC,
  author =       "Manoj Plakal and Charles N. Fischer",
  title =        "Concurrent Garbage Collection Using Program Slices on
                 Multithreaded Processors",
  journal =      j-SIGPLAN,
  volume =       "36",
  number =       "1",
  pages =        "94--100",
  month =        jan,
  year =         "2001",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:18:22 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "ACM SIGPLAN International Symposium on Memory
                 Management (ISMM'00)",
}

@Article{Protopopov:2001:MMP,
  author =       "Boris V. Protopopov and Anthony Skjellum",
  title =        "A Multithreaded {Message Passing Interface (MPI)}
                 Architecture: Performance and Program Issues",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "61",
  number =       "4",
  pages =        "449--466",
  day =          "1",
  month =        apr,
  year =         "2001",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.2000.1674",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Feb 22 15:30:36 MST 2002",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1674;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1674/pdf;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1674/ref",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Pyarali:2001:EOT,
  author =       "Irfan Pyarali and Marina Spivak and Ron Cytron and
                 Douglas C. Schmidt",
  title =        "Evaluating and Optimizing Thread Pool Strategies for
                 Real-Time {CORBA}",
  journal =      j-SIGPLAN,
  volume =       "36",
  number =       "8",
  pages =        "214--222",
  month =        aug,
  year =         "2001",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:18:29 MST 2003",
  bibsource =    "http://www.cs.wisc.edu/~bodik/om2001/program.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  annote =       "OM'01: The First Workshop on Optimization of
                 Middleware and Distributed Systems",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Reilly:2001:TNF,
  author =       "Douglas Reilly",
  title =        "Threading and the {.Net} Framework",
  journal =      j-DDJ,
  volume =       "26",
  number =       "8",
  pages =        "30, 32--33, 36, 38",
  month =        aug,
  year =         "2001",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Wed Jul 11 06:31:35 MDT 2001",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/ftp/2001/2001_08/thrednet.txt",
  abstract =     "Microsoft's .NET Framework offers a number of
                 features, such as threading, that simplify difficult
                 tasks. Additional resources include thrednet.txt
                 (listings).",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Rinard:2001:AMP,
  author =       "Martin Rinard",
  title =        "Analysis of Multithreaded Programs",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2126",
  pages =        "1--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:05:28 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2126.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2126/21260001.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2126/21260001.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Roh:2001:RMD,
  author =       "Lucas Roh and Bhanu Shankar and Wim B{\"o}hm and Walid
                 Najjar",
  title =        "Resource Management in Dataflow-Based Multithreaded
                 Execution",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "61",
  number =       "5",
  pages =        "581--608",
  day =          "1",
  month =        may,
  year =         "2001",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1006/jpdc.2001.1708",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Feb 22 15:30:37 MST 2002",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.2001.1708;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.2001.1708/pdf;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.2001.1708/ref",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Salcianu:2001:PEA,
  author =       "Alexandru Salcianu and Martin Rinard",
  title =        "Pointer and escape analysis for multithreaded
                 programs",
  journal =      j-SIGPLAN,
  volume =       "36",
  number =       "7",
  pages =        "12--23",
  month =        jul,
  year =         "2001",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:18:28 MST 2003",
  bibsource =    "http://www.acm.org/pubs/contents/proceedings/series/ppopp/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/articles/proceedings/ppopp/379539/p12-salcianu/p12-salcianu.pdf;
                 http://www.acm.org/pubs/citations/proceedings/ppopp/379539/p12-salcianu/",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Sigmund:2001:SCS,
  author =       "U. Sigmund and T. Ungerer",
  title =        "On Speculation Control in Simultaneous Multithreaded
                 Processors",
  journal =      j-J-UCS,
  volume =       "7",
  number =       "9",
  pages =        "848--868",
  day =          "28",
  month =        sep,
  year =         "2001",
  CODEN =        "????",
  ISSN =         "0948-695X (print), 0948-6968 (electronic)",
  ISSN-L =       "0948-6968",
  bibdate =      "Wed Feb 20 07:23:10 MST 2002",
  bibsource =    "http://www.jucs.org/jucs;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.jucs.org/jucs_7_9/on_speculation_control_in",
  acknowledgement = ack-nhfb,
  fjournal =     "J.UCS: Journal of Universal Computer Science",
  journal-URL =  "http://www.jucs.org/jucs",
}

@Article{Smith:2001:CMM,
  author =       "Burton Smith",
  title =        "{Cray MTA}: Multithreading for Latency Response",
  journal =      j-COMPUTER,
  volume =       "34",
  number =       "4",
  pages =        "69--69",
  month =        apr,
  year =         "2001",
  CODEN =        "CPTRB4",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Sat Apr 7 07:21:35 MDT 2001",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/computer2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://dlib.computer.org/co/books/co2001/pdf/r4059.pdf;
                 http://www.computer.org/computer/co2001/r4059abs.htm",
  acknowledgement = ack-nhfb,
  annote =       "Describes the Cray MTA system, which has up to 256
                 multithreaded processors. There are no data caches:
                 instead, each processor switches context every cycle
                 among up to 128 instruction streams, and each stream
                 can have up to eight outstanding memory references, so
                 memory latency up to 1024 cycles does not delay
                 processing.",
  fjournal =     "Computer",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
}

@Article{Sohi:2001:SMP,
  author =       "Gurindar S. Sohi and Amir Roth",
  title =        "Speculative Multithreaded Processors",
  journal =      j-COMPUTER,
  volume =       "34",
  number =       "4",
  pages =        "66--73",
  month =        apr,
  year =         "2001",
  CODEN =        "CPTRB4",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Sat Apr 7 07:21:35 MDT 2001",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/computer2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "See errata \cite{Anonymous:2001:ESM}.",
  URL =          "http://dlib.computer.org/co/books/co2001/pdf/r4066.pdf;
                 http://www.computer.org/computer/co2001/r4066abs.htm",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
}

@Article{Sohn:2001:CTC,
  author =       "Andrew Sohn and Yuetsu Kodama and Jui-Yuan Ku and
                 Mitsuhisa Sato and Yoshinori Yamaguchi",
  title =        "Chapter 15. {Tolerating} Communication Latency through
                 Dynamic Thread Invocation in a Multithreaded
                 Architecture",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1808",
  pages =        "525--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:02:34 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1808.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1808/18080525.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1808/18080525.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Steensgaard:2001:TSH,
  author =       "Bjarne Steensgaard",
  title =        "Thread-Specific Heaps for Multi-Threaded Programs",
  journal =      j-SIGPLAN,
  volume =       "36",
  number =       "1",
  pages =        "18--24",
  month =        jan,
  year =         "2001",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:18:22 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "ACM SIGPLAN International Symposium on Memory
                 Management (ISMM'00)",
}

@Article{Sung:2001:MDA,
  author =       "Michael Sung and Ronny Krashinsky and Krste
                 Asanovi{\'c}",
  title =        "Multithreading decoupled architectures for
                 complexity-effective general purpose computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "29",
  number =       "5",
  pages =        "56--61",
  month =        dec,
  year =         "2001",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:22 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Theobald:2001:DCI,
  author =       "Kevin B. Theobald and Rishi Kumar and Gagan Agrawal
                 and Gerd Heber and Ruppa K. Thulasiram and Guang R.
                 Gao",
  title =        "Developing a Communication Intensive Application on
                 the {EARTH} Multithreaded Architecture (Distinguished
                 Paper)",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1900",
  pages =        "625--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:02:44 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t1900.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/1900/19000625.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/1900/19000625.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Zoppetti:2001:IDD,
  author =       "Gary Zoppetti and Gagan Agrawal and Rishi Kumar",
  title =        "Impact of Data Distribution on Performance of
                 Irregular Reductions on Multithreaded Architectures",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2110",
  pages =        "483--??",
  year =         "2001",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Feb 2 13:05:11 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2110.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2110/21100483.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2110/21100483.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Abraham-Mumm:2002:VJR,
  author =       "Erika {\'A}brah{\'a}m-Mumm and Frank S. de Boer and
                 Willem-Paul de Roever and Martin Steffen",
  title =        "Verification for {Java}'s Reentrant Multithreading
                 Concept",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2303",
  pages =        "5--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:09:21 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2303.htm;
                 https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2303/23030005.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2303/23030005.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Adiletta:2002:NGI,
  author =       "Matthew Adiletta and Mark Rosenbluth and Debra
                 Bernstein and Gilbert Wolrich and Hugh Wilkinson",
  title =        "The Next Generation of {Intel IXP} Network
                 Processors",
  journal =      j-INTEL-TECH-J,
  volume =       "6",
  number =       "3",
  pages =        "6--18",
  day =          "15",
  month =        aug,
  year =         "2002",
  ISSN =         "1535-766X",
  bibdate =      "Sun Nov 17 11:06:06 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://developer.intel.com/technology/itj/2002/volume06issue03/art01_nextgenixp/p01_abstract.htm;
                 http://developer.intel.com/technology/itj/2002/volume06issue03/art01_nextgenixp/vol6iss3_art01.pdf",
  acknowledgement = ack-nhfb,
  keywords =     "10Gb/s; ATM; communication architecture; Ethernet;
                 IXP; microprocessor architecture; multi-processors;
                 multi-service switches; multi-threading; network
                 processors; OC-192; OC-48; routing; switching",
}

@Article{Adiletta:2002:PSA,
  author =       "Matthew Adiletta and Donald Hooper and Myles Wilde",
  title =        "Packet over {SONET}: Achieving 10 {Gigabit}/sec Packet
                 Processing with an {IXP2800}",
  journal =      j-INTEL-TECH-J,
  volume =       "6",
  number =       "3",
  pages =        "29--39",
  day =          "15",
  month =        aug,
  year =         "2002",
  ISSN =         "1535-766X",
  bibdate =      "Sun Nov 17 11:06:06 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://developer.intel.com/technology/itj/2002/volume06issue03/art05_packetoversonet/p01_abstract.htm;
                 http://developer.intel.com/technology/itj/2002/volume06issue03/art05_packetoversonet/vol6iss3_art05.pdf",
  acknowledgement = ack-nhfb,
  keywords =     "10Gbs; ATM; communication architecture; Ethernet;
                 hardware-based multi-threading; IXP; microprocessor
                 architecture; multi-processors; multi-service switches;
                 network processors; OC-192; OC-48; routing; switching",
}

@Article{Anonymous:2002:ST,
  author =       "Anonymous",
  title =        "Speculative threads",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "30",
  number =       "5",
  pages =        "??--??",
  month =        dec,
  year =         "2002",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:23 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Antoniu:2002:IMP,
  author =       "Gabriel Antoniu and Luc Boug{\'e}",
  title =        "Implementing Multithreaded Protocols for Release
                 Consistency on Top of the Generic {DSM}-{PM} Platform",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2326",
  pages =        "179--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:09:32 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2326.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2326/23260179.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2326/23260179.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Baldwin:2002:LMF,
  author =       "John H. Baldwin",
  title =        "Locking in the Multithreaded {FreeBSD} Kernel",
  crossref =     "USENIX:2002:PBF",
  pages =        "27--35",
  year =         "2002",
  bibdate =      "Tue Oct 15 12:37:27 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/usenix2000.bib",
  URL =          "http://www.usenix.org/publications/library/proceedings/bsdcon02/baldwin.html",
  acknowledgement = ack-nhfb,
}

@Article{Balis:2002:CPM,
  author =       "B. Balis and M. Bubak and W. Funika and R.
                 Wism{\"u}ller",
  title =        "A Concept of Portable Monitoring of Multithreaded
                 Programs",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2330",
  pages =        "884--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:09:35 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2330.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2330/23300884.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2330/23300884.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Boudol:2002:NCP,
  author =       "G{\'e}rard Boudol and Ilaria Castellani",
  title =        "Noninterference for concurrent programs and thread
                 systems",
  journal =      j-THEOR-COMP-SCI,
  volume =       "281",
  number =       "1-2",
  pages =        "109--130",
  month =        may,
  year =         "2002",
  CODEN =        "TCSCDI",
  ISSN =         "0304-3975 (print), 1879-2294 (electronic)",
  ISSN-L =       "0304-3975",
  bibdate =      "Wed Nov 20 18:08:56 MST 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Theoretical Computer Science",
  journal-URL =  "http://www.sciencedirect.com/science/journal/03043975",
}

@Article{Bouge:2002:IRE,
  author =       "L. Boug{\'e} and V. Danjean and R. Namyst",
  title =        "Improving Reactivity to {I/O} Events in Multithreaded
                 Environments Using a Uniform, Scheduler-Centric {API}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2400",
  pages =        "605--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Thu Sep 12 08:40:04 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2400.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2400/24000605.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2400/24000605.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Brebner:2002:MLC,
  author =       "Gordon Brebner",
  title =        "Multithreading for Logic-Centric Systems",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2438",
  pages =        "5--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:10:28 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2438.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2438/24380005.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2438/24380005.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@PhdThesis{Callaway:2002:VTR,
  author =       "John Callaway",
  title =        "Visualization of threads in a running {Java} program",
  type =         "Thesis ({M.S.})",
  school =       "University of California, Santa Cruz",
  address =      "Santa Cruz, CA, USA",
  year =         "2002",
  LCCN =         "QA76.73.J38 C36 2002",
  bibdate =      "Tue May 6 05:26:58 MDT 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "academic dissertations -- University of California,
                 Santa Cruz -- 2002; academic dissertations --
                 University of California, Santa Cruz -- computer;
                 computer science; computer software -- development;
                 Java (computer program language); object-oriented
                 programming (computer science); science; software
                 engineering; visualization",
}

@Article{Carothers:2002:CMP,
  author =       "Christopher D. Carothers and Boleslaw K. Szymanski",
  title =        "Checkpointing Multithreaded Programs",
  journal =      j-DDJ,
  volume =       "27",
  number =       "8",
  pages =        "??--??",
  month =        aug,
  year =         "2002",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Fri Sep 13 06:15:52 MDT 2002",
  bibsource =    "http://www.ddj.com/articles/2002/0208/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/ftp/2002/2002_08/checkpt.txt",
  abstract =     "Checkpointing is the process by which you grab
                 snapshots of running programs. Additional resources
                 include checkpt.txt (listings).",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Cazals:2002:NID,
  author =       "Fr{\'e}d{\'e}ric Cazals",
  title =        "Non-Intrusive Debugging and Incremental Visualization
                 with the Geometric Stethoscope",
  journal =      j-J-GRAPHICS-TOOLS,
  volume =       "7",
  number =       "2",
  pages =        "27--40",
  year =         "2002",
  CODEN =        "JGTOFD",
  ISSN =         "1086-7651",
  bibdate =      "Tue Dec 16 13:47:48 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/jgt/papers/Cazals02/",
  abstract =     "Developing and debugging geometric applications is
                 known to be a difficult task: The calculations and data
                 structures can be involved, there are degenerate cases
                 and numerical issues, etc. This paper presents a
                 software setup aiming at easing the development, the
                 debugging, and the maintenance of geometric
                 applications. \par

                 More precisely, {\em incremental visualization\/} is
                 defined as the possibility for the programmer to
                 visualize interactively any significant update of the
                 geometric data structures at any time. {\em
                 Non-intrusive debugging\/} is defined as the
                 possibility of visualizing any geometric entity in
                 three dimensions from a standard debugger at any time
                 without modifying the source code. We present a setup
                 to perform incremental visualization and non-intrusive
                 debugging. This setup is based on multithreading and
                 requires a three-dimensional viewer, such as Open
                 Inventor, Vtk, or Geomview, and a standard debugger
                 (dbx or gdb). \par

                 An Open Inventor based C++ implementation of this setup
                 accompanies this paper. Using it simply requires
                 writing the functions converting the user's data
                 structures into Open Inventor's data structures. The
                 setup could easily be extended to accommodate other
                 medias such as sound, video, etc.",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Graphics Tools: JGT",
  journal-URL =  "http://www.tandfonline.com/loi/ujgt20",
}

@Article{Chappell:2002:DPB,
  author =       "Robert S. Chappell and Francis Tseng and Adi Yoaz and
                 Yale N. Patt",
  title =        "Difficult-path branch prediction using subordinate
                 microthreads",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "30",
  number =       "2",
  pages =        "307--317",
  month =        may,
  year =         "2002",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:50 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Chaudhry:2002:PTS,
  author =       "Puneesh Chaudhry",
  title =        "A Per-Thread Singleton Class",
  journal =      j-CCCUJ,
  volume =       "20",
  number =       "5",
  pages =        "14--??",
  month =        may,
  year =         "2002",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:36 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/2002/0205/0205toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "A refreshing look at an old pattern.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Choi:2002:EPD,
  author =       "Jong-Deok Choi and Keunwoo Lee and Alexey Loginov and
                 Robert O'Callahan and Vivek Sarkar and Manu Sridharan",
  title =        "Efficient and precise datarace detection for
                 multithreaded object-oriented programs",
  journal =      j-SIGPLAN,
  volume =       "37",
  number =       "5",
  pages =        "258--269",
  month =        may,
  year =         "2002",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu May 15 12:23:02 MDT 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Choi:2002:IFI,
  author =       "Jong-Deok Choi and Andreas Zeller",
  title =        "Isolating failure-inducing thread schedules",
  journal =      j-SIGSOFT,
  volume =       "27",
  number =       "4",
  pages =        "210--220",
  month =        jul,
  year =         "2002",
  CODEN =        "SFENDP",
  DOI =          "https://doi.org/10.1145/566171.566211",
  ISSN =         "0163-5948 (print), 1943-5843 (electronic)",
  ISSN-L =       "0163-5948",
  bibdate =      "Wed Aug 1 17:14:20 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigsoft2000.bib",
  abstract =     "Consider a multi-threaded application that
                 occasionally fails due to non-determinism. Using the
                 DEJAVU capture/replay tool, it is possible to record
                 the thread schedule and replay the application in a
                 deterministic way. By systematically narrowing down the
                 difference between a thread schedule that makes the
                 program pass and another schedule that makes the
                 program fail, the Delta Debugging approach can pinpoint
                 the error location automatically---namely, the
                 location(s) where a thread switch causes the program to
                 fail. In a case study, Delta Debugging isolated the
                 failure-inducing schedule difference from 3.8 billion
                 differences in only 50 tests.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGSOFT Software Engineering Notes",
  journal-URL =  "https://dl.acm.org/citation.cfm?id=J728",
}

@Article{Clark:2002:AMT,
  author =       "Keith Clark and Peter J. Robinson",
  title =        "Agents as Multi-threaded Logical Objects",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2407",
  pages =        "33--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:10:17 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2407.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2407/24070033.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2407/24070033.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Cook:2002:REJ,
  author =       "Jonathan J. Cook",
  title =        "Reverse Execution of {Java} Bytecode",
  journal =      j-COMP-J,
  volume =       "45",
  number =       "6",
  pages =        "608--619",
  month =        "????",
  year =         "2002",
  CODEN =        "CMPJA6",
  DOI =          "https://doi.org/10.1093/comjnl/45.6.608",
  ISSN =         "0010-4620 (print), 1460-2067 (electronic)",
  ISSN-L =       "0010-4620",
  bibdate =      "Wed Nov 6 11:21:54 MST 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/compj2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_06/",
  URL =          "http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_06/450608.sgm.abs.html;
                 http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_06/pdf/450608.pdf",
  abstract =     "We demonstrate a model, including operational
                 semantics, for the reverse execution of stack-based
                 code. We discuss our modification of the Kaffe
                 implementation of the Java Virtual Machine, supporting
                 a debugger capable of running Java bytecode backwards.
                 We achieve reverse execution by logging the state lost
                 during each operation or by directly reversing
                 instructions. Our debugger has facilities for stepping,
                 stepping over methods and running to breakpoints, in
                 both directions. Multi-threading is supported. It is
                 also possible to step through the bytecode when the
                 Java source code is not available. The debugger has
                 both a command line user interface and a graphical user
                 interface with facilities for editing code and running
                 the Java compiler.",
  acknowledgement = ack-nhfb,
  fjournal =     "The Computer Journal",
  journal-URL =  "http://comjnl.oxfordjournals.org/",
}

@Article{Delzanno:2002:TAV,
  author =       "Giorgio Delzanno and Jean-Fran{\c{c}}ois Raskin and
                 Laurent {Van Begin}",
  title =        "Towards the Automated Verification of Multithreaded
                 {Java} Programs",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2280",
  pages =        "173--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 10 19:09:09 MDT 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2280.htm;
                 https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer-ny.com/link/service/series/0558/bibs/2280/22800173.htm;
                 http://link.springer-ny.com/link/service/series/0558/papers/2280/22800173.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@InProceedings{Ding:2002:MOP,
  author =       "Yun He and Chris H. Q. Ding",
  keywords =     "multidimensional arrays; index reshuffle; vacancy
                 tracking cycles; global exchange; dynamical remapping;
                 MPI; OpenMP; hybrid MPI/OpenMP; SMP cluster.",
  title =        "{MPI} and {OpenMP} Paradigms on Cluster of {SMP}
                 Architectures",
  crossref =     "IEEE:2002:STI",
  pages =        "??--??",
  year =         "2002",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sc-2002.org/paperpdfs/pap.pap325.pdf",
  abstract =     "We investigate remapping multi-dimensional arrays on
                 cluster of SMP architectures under OpenMP, MPI, and
                 hybrid paradigms. Traditional method of array transpose
                 needs an auxiliary array of the same size and a copy
                 back stage. We recently developed an in-place method
                 using vacancy tracking cycles. The vacancy tracking
                 algorithm outperforms the traditional 2-array method as
                 demonstrated by extensive comparisons. The independence
                 of vacancy tracking cycles allows efficient
                 parallelization of the in-place method on SMP
                 architectures at node level. Performance of
                 multi-threaded parallelism using OpenMP are tested with
                 different scheduling methods and different number of
                 threads. The vacancy tracking method is parallelized
                 using several parallel paradigms. At node level, pure
                 OpenMP outperforms pure MPI by a factor of 2.76. Across
                 entire cluster of SMP nodes, the hybrid MPI/OpenMP
                 implementation outperforms pure MPI by a factor of
                 4.44, demonstrating the validity of the parallel
                 paradigm of mixing MPI with OpenMP.",
  acknowledgement = ack-nhfb,
}

@Article{Donnelly:2002:LTT,
  author =       "Austin Donnelly",
  title =        "Lightweight Thread Tunnelling in Network
                 Applications",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2546",
  pages =        "48--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:58:13 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2546.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2546/25460048.htm;
                 http://link.springer.de/link/service/series/0558/papers/2546/25460048.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Edelstein:2002:MJP,
  author =       "O. Edelstein and E. Farchi and Y. Nir and G. Ratsaby
                 and S. Ur",
  title =        "Multithreaded {Java} program test generation",
  journal =      j-IBM-SYS-J,
  volume =       "41",
  number =       "1",
  pages =        "111--125",
  month =        "????",
  year =         "2002",
  CODEN =        "IBMSA7",
  ISSN =         "0018-8670",
  bibdate =      "Tue Feb 12 17:23:05 MST 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.research.ibm.com/journal/",
  URL =          "http://www.research.ibm.com/journal/sj/411/edelstein.html;
                 http://www.research.ibm.com/journal/sj/411/edelstein.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Systems Journal",
  ordernumber =  "G321-0144",
}

@Article{Editors:2002:LUC,
  author =       "{The Editors} and Kim Reidar Lantz and Ze'ev Atlas and
                 Pete Nelson and Gus J. Grubba",
  title =        "Letters: {URL} Correction [``{The NewOS Operating
                 System}'']; Passing Context to Threads; Compiling
                 {Perl\slash Tk} Scripts; Standing by {Al}'s Principles;
                 Understanding Photomosaics",
  journal =      j-DDJ,
  volume =       "27",
  number =       "1",
  pages =        "10, 12",
  month =        jan,
  year =         "2002",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Feb 12 05:21:41 MST 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "See \cite{Geiselbrecht:2001:NOS}.",
  URL =          "http://www.ddj.com/",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@InProceedings{El-Ghazawi:2002:UPP,
  author =       "Tarek El-Ghazawi and Fran{\c{c}}ois Cantonnet",
  title =        "{UPC} Performance and Potential: a {NPB}
                 Experimental Study",
  crossref =     "IEEE:2002:STI",
  pages =        "??--??",
  year =         "2002",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sc-2002.org/paperpdfs/pap.pap316.pdf",
  abstract =     "UPC, or Unified Parallel C, is a parallel extension of
                 ANSI C. UPC follows a distributed shared memory
                 programming model aimed at leveraging the ease of
                 programming of the shared memory paradigm, while
                 enabling the exploitation of data locality. UPC
                 incorporates constructs that allow placing data near
                 the threads that manipulate them to minimize remote
                 accesses. This paper gives an overview of the concepts
                 and features of UPC and establishes, through extensive
                 performance measurements of NPB workloads, the
                 viability of the UPC programming language compared to
                 the other popular paradigms. Further, through
                 performance measurements we identify the challenges,
                 the remaining steps and the priorities for UPC. It will
                 be shown that with proper hand tuning and optimized
                 collective operations libraries, UPC performance will
                 be comparable to that of MPI. Furthermore, by
                 incorporating such improvements into automatic compiler
                 optimizations, UPC will compare quite favorably to
                 message passing in ease of programming.",
  acknowledgement = ack-nhfb,
  keywords =     "NPB (NAS Parallel Benchmark)",
}

@Article{Feuerstein:2002:LMT,
  author =       "E. Feuerstein and A. Strejilevich de Loma",
  title =        "On-Line Multi-Threaded Paging",
  journal =      j-ALGORITHMICA,
  volume =       "32",
  number =       "1",
  pages =        "36--60",
  month =        jan,
  year =         "2002",
  CODEN =        "ALGOEJ",
  DOI =          "https://doi.org/10.1007/s00453-001-0073-z",
  ISSN =         "0178-4617 (print), 1432-0541 (electronic)",
  ISSN-L =       "0178-4617",
  MRclass =      "68N25 (68Q10 68W05)",
  MRnumber =     "MR1867023 (2002h:68033)",
  bibdate =      "Fri Jan 6 11:38:14 MST 2006",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0178-4617&volume=32&issue=1;
                 https://www.math.utah.edu/pub/tex/bib/index-table-a.html#algorithmica;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 MathSciNet database",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0178-4617&volume=32&issue=1&spage=36",
  acknowledgement = ack-nhfb,
  fjournal =     "Algorithmica",
  journal-URL =  "http://link.springer.com/journal/453",
}

@Article{Flanagan:2002:MCM,
  author =       "Cormac Flanagan and Shaz Qadeer and Sanjit A. Seshia",
  title =        "A Modular Checker for Multithreaded Programs",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2404",
  pages =        "180--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:05 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2404.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2404/24040180.htm;
                 http://link.springer.de/link/service/series/0558/papers/2404/24040180.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Book{Garg:2002:TOA,
  author =       "Rajat P. Garg and Ilya Sharapov",
  title =        "Techniques for optimizing applications: high
                 performance computing",
  publisher =    pub-SUN-MICROSYSTEMS-PRESS,
  address =      pub-SUN-MICROSYSTEMS-PRESS:adr,
  pages =        "xliii + 616",
  year =         "2002",
  ISBN =         "0-13-093476-3",
  ISBN-13 =      "978-0-13-093476-5",
  LCCN =         "QA76.88 .G37 2002",
  bibdate =      "Fri Apr 11 08:26:42 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sun.com/blueprints/",
  series =       "Sun BluePrints Program",
  URL =          "http://www.sun.com/books/catalog/garg.html/index.html",
  acknowledgement = ack-nhfb,
  annote =       "From the Web site: The \verb=HPC_code_examples.tar.Z=
                 tar-file contains the source code, makefiles, and shell
                 scripts required to compile, link, and run the example
                 programs discussed in the book.",
  keywords =     "Forte Developer; MPI; OpenMP; Sun ClusterTools; Sun
                 Solaris",
}

@Article{Haggar:2002:JQD,
  author =       "Peter Haggar",
  title =        "{Java Q\&A}: Does {Java} Guarantee Thread Safety?",
  journal =      j-DDJ,
  volume =       "27",
  number =       "6",
  pages =        "91--93",
  month =        jun,
  year =         "2002",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Wed May 1 15:43:59 MDT 2002",
  bibsource =    "http://www.ddj.com/articles/2002/0206/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Comments on lack of atomic-update guarantee in Java
                 for objects larger than 32 bits, such as {\tt long} and
                 {\tt double}, with sample code to exhibit the
                 failure.",
  URL =          "http://www.ddj.com/ftp/2002/2002_06/jqa0602.txt",
  abstract =     "Additional resources include jqa0602.txt (listings).",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Hanson:2002:AFI,
  author =       "Richard J. Hanson and Clay P. Breshears and Henry A.
                 Gabb",
  title =        "{Algorithm 821}: a {Fortran} interface to {POSIX}
                 threads",
  journal =      j-TOMS,
  volume =       "28",
  number =       "3",
  pages =        "354--371",
  month =        sep,
  year =         "2002",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/569147.569152",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Sat Nov 9 11:16:50 MST 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Pthreads is the library of POSIX standard functions
                 for concurrent, multithreaded programming. The POSIX
                 standard only defines an application programming
                 interface (API) to the C programming language, not to
                 Fortran. Many scientific and engineering applications
                 are written in Fortran. Also, many of these
                 applications exhibit functional, or task-level,
                 concurrency. They would benefit from multithreading,
                 especially on symmetric multiprocessors (SMP). We
                 present here an interface to that part of the Pthreads
                 library that is compatible with standard Fortran. The
                 contribution consists of two primary source files: a
                 Fortran module and a collection of C wrappers to
                 Pthreads functions. The Fortran module defines the data
                 structures, interface and initialization routines used
                 to manage threads. The stability and portability of the
                 Fortran API to Pthreads is demonstrated using common
                 mathematical computations on three different systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@InProceedings{Karniadakis:2002:DLP,
  author =       "Suchuan Dong and George Em Karniadakis",
  title =        "Dual-Level Parallelism for Deterministic and
                 Stochastic {CFD} Problems",
  crossref =     "IEEE:2002:STI",
  pages =        "??--??",
  year =         "2002",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sc-2002.org/paperpdfs/pap.pap137.pdf",
  abstract =     "A hybrid two-level parallelism using MPI/OpenMP is
                 implemented in the general-purpose spectral/hp element
                 CFD code NekTar to take advantage of the hierarchical
                 structures arising in deterministic and stochastic CFD
                 problems. We take a coarse grain approach to
                 shared-memory parallelism with OpenMP and employ a
                 workload-splitting scheme that can reduce the OpenMP
                 synchronizations to the minimum. The hybrid
                 implementation shows good scalability with respect to
                 both the problem size and the number of processors in
                 case of a fixed problem size. With the same number of
                 processors, the hybrid model with 2 (or 4) OpenMP
                 threads per MPI process is observed to perform better
                 than pure MPI and pure OpenMP on the NCSA SGI Origin
                 2000, while the pure MPI model performs the best on the
                 IBM SP3 at SDSC and on the Compaq Alpha cluster at PSC.
                 A key new result is that the use of threads facilitates
                 effectively p-refinement, which is crucial to adaptive
                 discretization using high-order methods.",
  acknowledgement = ack-nhfb,
}

@Article{Kavi:2002:MMA,
  author =       "Krishna M. Kavi and Alireza Moshtaghi and Deng-jyi
                 Chen",
  title =        "Modeling Multithreaded Applications Using {Petri}
                 Nets",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "30",
  number =       "5",
  pages =        "353--371",
  month =        oct,
  year =         "2002",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1023/A:1019917329895",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 6 16:40:00 MDT 2005",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=30&issue=5;
                 http://www.kluweronline.com/issn/0885-7458;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ipsapp009.lwwonline.com/content/getfile/4773/29/1/abstract.htm;
                 http://ipsapp009.lwwonline.com/content/getfile/4773/29/1/fulltext.pdf;
                 http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=30&issue=5&spage=353",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Kempf:2002:BTL,
  author =       "Bill Kempf",
  title =        "The {Boost.Threads} Library",
  journal =      j-CCCUJ,
  volume =       "20",
  number =       "5",
  pages =        "6--??",
  month =        may,
  year =         "2002",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:36 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/2002/0205/0205toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Standard C++ threads are imminent. CUJ predicts they
                 will derive from the Boost.Threads library, explored
                 here by the eminent author.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Laneve:2002:TSJ,
  author =       "Cosimo Laneve",
  title =        "A type system for {JVM} threads",
  journal =      j-THEOR-COMP-SCI,
  volume =       "290",
  number =       "1",
  pages =        "741--778",
  month =        oct,
  year =         "2002",
  CODEN =        "TCSCDI",
  ISSN =         "0304-3975 (print), 1879-2294 (electronic)",
  ISSN-L =       "0304-3975",
  bibdate =      "Wed Nov 20 18:15:29 MST 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Theoretical Computer Science",
  journal-URL =  "http://www.sciencedirect.com/science/journal/03043975",
}

@Article{Leman:2002:EFT,
  author =       "Dmitri Leman",
  title =        "An Efficient and Flexible Tracing Technique",
  journal =      j-CCCUJ,
  volume =       "20",
  number =       "4",
  pages =        "24--??",
  month =        apr,
  year =         "2002",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:36 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/2002/0204/0204toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This extensible tracing framework tames the dreaded
                 multithreaded debugging demon.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Mahinthakumar:2002:HMO,
  author =       "G. Mahinthakumar and F. Saied",
  title =        "A Hybrid {MPI-OpenMP} Implementation of an Implicit
                 Finite-Element Code on Parallel Architectures",
  journal =      j-IJHPCA,
  volume =       "16",
  number =       "4",
  pages =        "371--393",
  month =        "Winter",
  year =         "2002",
  CODEN =        "IHPCFL",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Fri Nov 28 06:52:13 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Martinez:2002:SSAa,
  author =       "Jos{\'e} F. Mart{\'\i}nez and Josep Torrellas",
  title =        "Speculative synchronization: applying thread-level
                 speculation to explicitly parallel applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "30",
  number =       "5",
  pages =        "18--29",
  month =        dec,
  year =         "2002",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:23 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Martinez:2002:SSAb,
  author =       "Jos{\'e} F. Mart{\'\i}nez and Josep Torrellas",
  title =        "Speculative synchronization: applying thread-level
                 speculation to explicitly parallel applications",
  journal =      j-SIGPLAN,
  volume =       "37",
  number =       "10",
  pages =        "18--29",
  month =        oct,
  year =         "2002",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu May 15 12:23:09 MDT 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Martinez:2002:SSAc,
  author =       "Jos{\'e} F. Mart{\'\i}nez and Josep Torrellas",
  title =        "Speculative synchronization: applying thread-level
                 speculation to explicitly parallel applications",
  journal =      j-OPER-SYS-REV,
  volume =       "36",
  number =       "5",
  pages =        "18--29",
  month =        dec,
  year =         "2002",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Mauer:2002:FST,
  author =       "Carl J. Mauer and Mark D. Hill and David A. Wood",
  title =        "Full-system timing-first simulation",
  journal =      j-SIGMETRICS,
  volume =       "30",
  number =       "1",
  pages =        "108--116",
  month =        jun,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/511334.511349",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Thu Jun 26 11:38:22 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Computer system designers often evaluate future design
                 alternatives with detailed simulators that strive for
                 {\em functional fidelity\/} (to execute relevant
                 workloads) and {\em performance fidelity\/} (to rank
                 design alternatives). Trends toward multi-threaded
                 architectures, more complex micro-architectures, and
                 richer workloads, make authoring detailed simulators
                 increasingly difficult. To manage simulator complexity,
                 this paper advocates decoupled simulator organizations
                 that separate functional and performance concerns.
                 Furthermore, we define an approach, called {\em
                 timing-first simulation}, that uses an augmented timing
                 simulator to execute instructions important to
                 performance in conjunction with a functional simulator
                 to insure correctness. This design simplifies software
                 development, leverages existing simulators, and can
                 model micro-architecture timing in detail. We describe
                 the timing-first organization and our experiences
                 implementing TFsim, a full-system multiprocessor
                 performance simulator. TFsim models a pipelined,
                 out-of-order micro-architecture in detail, was
                 developed in less than one person-year, and performs
                 competitively with previously-published simulators.
                 TFsim's timing simulator implements dynamically common
                 instructions (99.99\% of them), while avoiding the vast
                 and exacting implementation efforts necessary to run
                 unmodified commercial operating systems and workloads.
                 Virtutech Simics, a full-system functional simulator,
                 checks and corrects the timing simulator's execution,
                 contributing 18--36\% to the overall run-time. TFsim's
                 mostly correct functional implementation introduces a
                 worst-case performance error of 4.8\% for our
                 commercial workloads. Some additional simulator
                 performance is gained by verifying functional
                 correctness less often, at the cost of some additional
                 performance error.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
}

@Article{Mukherjee:2002:DDE,
  author =       "Shubhendu S. Mukherjee and Michael Kontz and Steven K.
                 Reinhardt",
  title =        "Detailed design and evaluation of redundant
                 multithreading alternatives",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "30",
  number =       "2",
  pages =        "99--110",
  month =        may,
  year =         "2002",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:50 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Oplinger:2002:ESRa,
  author =       "Jeffrey Oplinger and Monica S. Lam",
  title =        "Enhancing software reliability with speculative
                 threads",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "30",
  number =       "5",
  pages =        "184--196",
  month =        dec,
  year =         "2002",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:23 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Oplinger:2002:ESRb,
  author =       "Jeffrey Oplinger and Monica S. Lam",
  title =        "Enhancing software reliability with speculative
                 threads",
  journal =      j-SIGPLAN,
  volume =       "37",
  number =       "10",
  pages =        "184--196",
  month =        oct,
  year =         "2002",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu May 15 12:23:09 MDT 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Oplinger:2002:ESRc,
  author =       "Jeffrey Oplinger and Monica S. Lam",
  title =        "Enhancing software reliability with speculative
                 threads",
  journal =      j-OPER-SYS-REV,
  volume =       "36",
  number =       "5",
  pages =        "184--196",
  month =        dec,
  year =         "2002",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Plachetka:2002:QTS,
  author =       "Tomas Plachetka",
  title =        "(Quasi-) Thread-Safe {PVM} and (Quasi-) Thread-Safe
                 {MPI} without Active Polling",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2474",
  pages =        "296--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:35 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2474/24740296.htm;
                 http://link.springer.de/link/service/series/0558/papers/2474/24740296.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Sato:2002:SJL,
  author =       "Y. Sato",
  title =        "A Study of {Java} Language for Effective Thread
                 Migration",
  journal =      "Record of Electrical and Communication Engineering
                 Conversazione Tohoku University",
  volume =       "71",
  number =       "1",
  publisher =    "Tohoku Daigaku Dentsu Danwakai",
  pages =        "597--598",
  year =         "2002",
  CODEN =        "????",
  ISSN =         "0385-7719",
  bibdate =      "Tue Dec 24 07:09:37 MST 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
}

@Article{Shene:2002:TST,
  author =       "Ching-Kuang Shene",
  title =        "{ThreadMentor}: a system for teaching multithreaded
                 programming",
  journal =      j-SIGCSE,
  volume =       "34",
  number =       "3",
  pages =        "229--229",
  month =        sep,
  year =         "2002",
  CODEN =        "SIGSD3",
  DOI =          "https://doi.org/10.1145/637610.544497",
  ISSN =         "0097-8418 (print), 2331-3927 (electronic)",
  ISSN-L =       "0097-8418",
  bibdate =      "Sat Nov 17 16:56:56 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigcse2000.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "SIGCSE Bulletin (ACM Special Interest Group on
                 Computer Science Education)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J688",
}

@Article{Snavely:2002:SJP,
  author =       "Allan Snavely and Dean M. Tullsen and Geoff Voelker",
  title =        "Symbiotic jobscheduling with priorities for a
                 simultaneous multithreading processor",
  journal =      j-SIGMETRICS,
  volume =       "30",
  number =       "1",
  pages =        "66--76",
  month =        jun,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/511399.511343",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Thu Jun 26 11:38:22 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Simultaneous Multithreading machines benefit from
                 jobscheduling software that monitors how well
                 coscheduled jobs share CPU resources, and coschedules
                 jobs that interact well to make more efficient use of
                 those resources. As a result, informed coscheduling can
                 yield significant performance gains over naive
                 schedulers. However, prior work on coscheduling focused
                 on equal-priority job mixes, which is an unrealistic
                 assumption for modern operating systems. This paper
                 demonstrates that a scheduler for an SMT machine can
                 both satisfy process priorities and symbiotically
                 schedule low and high priority threads to increase
                 system throughput. Naive priority schedulers dedicate
                 the machine to high priority jobs to meet priority
                 goals, and as a result decrease opportunities for
                 increased performance from multithreading and
                 coscheduling. More informed schedulers, however, can
                 dynamically monitor the progress and resource
                 utilization of jobs on the machine, and dynamically
                 adjust the degree of multithreading to improve
                 performance while still meeting priority goals. Using
                 detailed simulation of an SMT architecture, we
                 introduce and evaluate a series of five software and
                 hardware-assisted priority schedulers. Overall, our
                 results indicate that coscheduling priority jobs can
                 significantly increase system throughput by as much as
                 40\%, and that (1) the benefit depends upon the
                 relative priority of the coscheduled jobs, and (2) more
                 sophisticated schedulers are more effective when the
                 differences in priorities are greatest. We show that
                 our priority schedulers can decrease average turnaround
                 times for a random job mix by as much as 33\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
  keywords =     "job scheduling; priorities; simultaneous
                 multithreading",
}

@Article{Sodan:2002:AMA,
  author =       "Angela C. Sodan",
  title =        "Applications on a multithreaded architecture: a case
                 study with {EARTH-MANNA}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "28",
  number =       "1",
  pages =        "3--33",
  month =        jan,
  year =         "2002",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Fri Feb 22 16:52:43 MST 2002",
  bibsource =    "http://www.elsevier.com/locate/issn/01678191;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.elsevier.com/gej-ng/10/35/21/60/27/28/abstract.html;
                 http://www.elsevier.nl/gej-ng/10/35/21/60/27/28/00001684.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Solihin:2002:UUL,
  author =       "Yan Solihin and Jaejin Lee and Josep Torrellas",
  title =        "Using a user-level memory thread for correlation
                 prefetching",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "30",
  number =       "2",
  pages =        "171--182",
  month =        may,
  year =         "2002",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:50 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@InProceedings{Sterling:2002:GMP,
  author =       "Thomas L. Sterling and Hans P. Zima",
  title =        "{Gilgamesh}: a Multithreaded Processor-In-Memory
                 Architecture for Petaflops Computing",
  crossref =     "IEEE:2002:STI",
  pages =        "??--??",
  year =         "2002",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sc-2002.org/paperpdfs/pap.pap105.pdf",
  abstract =     "Processor-in-Memory (PIM) architectures avoid the von
                 Neumann bottleneck in conventional machines by
                 integrating high-density DRAM and CMOS logic on the
                 same chip. Parallel systems based on this new
                 technology are expected to provide higher scalability,
                 adaptability, robustness, fault tolerance and lower
                 power consumption than current MPPs or commodity
                 clusters. In this paper we describe the design of
                 Gilgamesh, a PIM-based massively parallel architecture,
                 and elements of its execution model. Gilgamesh extends
                 existing PIM capabilities by incorporating advanced
                 mechanisms for virtualizing tasks and data and
                 providing adaptive resource management for load
                 balancing and latency tolerance. The Gilgamesh
                 execution model is based on macroservers, a middleware
                 layer which supports object-based runtime management of
                 data and threads allowing explicit and dynamic control
                 of locality and load balancing. The paper concludes
                 with a discussion of related research activities and an
                 outlook to future work.",
  acknowledgement = ack-nhfb,
}

@Article{Stoller:2002:MCM,
  author =       "Scott D. Stoller",
  title =        "Model-checking multi-threaded distributed {Java}
                 programs",
  journal =      j-INT-J-SOFTW-TOOLS-TECHNOL-TRANSFER,
  volume =       "4",
  number =       "1",
  pages =        "71--91",
  month =        oct,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1007/s10009-002-0077-2",
  ISSN =         "1433-2779 (print), 1433-2787 (electronic)",
  ISSN-L =       "1433-2779",
  bibdate =      "Tue Nov 23 15:01:41 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal on Software Tools for Technology
                 Transfer: STTT",
}

@Article{Sung:2002:CPE,
  author =       "Minyoung Sung and Soyoung Kim and Sangsoo Park and
                 Naehyuck Chang and Heonshik Shin",
  title =        "Comparative performance evaluation of {Java} threads
                 for embedded applications: {Linux Thread} vs. {Green
                 Thread}",
  journal =      j-INFO-PROC-LETT,
  volume =       "84",
  number =       "4",
  pages =        "221--225",
  day =          "30",
  month =        nov,
  year =         "2002",
  CODEN =        "IFPLAT",
  ISSN =         "0020-0190 (print), 1872-6119 (electronic)",
  ISSN-L =       "0020-0190",
  bibdate =      "Mon Jan 26 08:44:30 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/00200190",
  acknowledgement = ack-nhfb,
  fjournal =     "Information Processing Letters",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00200190",
}

@Article{Tennberg:2002:RGO,
  author =       "Patrick Tennberg",
  title =        "Refactoring Global Objects in Multithreaded
                 Applications",
  journal =      j-CCCUJ,
  volume =       "20",
  number =       "5",
  pages =        "20--??",
  month =        may,
  year =         "2002",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:36 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/2002/0205/0205toc.htm?topic=articles;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Although you may get fired for introducing any new
                 global variables, it's too much work to rewrite old
                 code to remove them. So make them thread-safe and stop
                 worrying.",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Theobald:2002:IEC,
  author =       "Kevin B. Theobald and Rishi Kumar and Gagan Agrawal
                 and Gerd Heber and Ruppa K. Thulasiram and Guang R.
                 Gao",
  title =        "Implementation and evaluation of a communication
                 intensive application on the {EARTH} multithreaded
                 system",
  journal =      j-CCPE,
  volume =       "14",
  number =       "3",
  pages =        "183--201",
  month =        mar,
  year =         "2002",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.604",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat May 18 14:54:00 MDT 2002",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract/93513486/START;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=93513486{\&}PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Thulasiraman:2002:EMA,
  author =       "Parimala Thulasiraman and Kevin Theobald and Ashfaq A.
                 Khokhar and Guang R. Gao",
  title =        "Efficient Multithreaded Algorithms for the {Fast
                 Fourier Transform}",
  journal =      j-PARALLEL-DIST-COMP-PRACT,
  volume =       "5",
  number =       "2",
  pages =        "239--258",
  month =        jun,
  year =         "2002",
  CODEN =        "????",
  ISSN =         "1097-2803",
  bibdate =      "Thu Sep 2 12:08:56 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.scpe.org/content/5/2.toc",
  acknowledgement = ack-nhfb,
  fjournal =     "PDCP: Parallel and Distributed Computing Practices",
}

@Article{Ungerer:2002:MP,
  author =       "Theo Ungerer and Borut Robi{\v{c}} and Jurij
                 {\v{S}}ilc",
  title =        "Multithreaded Processors",
  journal =      j-COMP-J,
  volume =       "45",
  number =       "3",
  pages =        "320--348",
  month =        "????",
  year =         "2002",
  CODEN =        "CMPJA6",
  ISSN =         "0010-4620 (print), 1460-2067 (electronic)",
  ISSN-L =       "0010-4620",
  bibdate =      "Fri May 10 10:12:07 MDT 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_03/",
  URL =          "http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_03/450320.sgm.abs.html;
                 http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_03/pdf/450320.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "The Computer Journal",
  journal-URL =  "http://comjnl.oxfordjournals.org/",
}

@Article{Ungerer:2002:SPE,
  author =       "Theo Ungerer and Borut Robi{\v{c}} and Jurij
                 {\v{S}}ilc",
  title =        "A survey of processors with explicit multithreading",
  journal =      j-COMP-SURV,
  volume =       "35",
  number =       "1",
  pages =        "29--63",
  month =        mar,
  year =         "2002",
  CODEN =        "CMSVAN",
  ISSN =         "0360-0300 (print), 1557-7341 (electronic)",
  ISSN-L =       "0360-0300",
  bibdate =      "Thu Aug 7 06:57:01 MDT 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Computing Surveys",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J204",
}

@Article{Vijaykumar:2002:TFR,
  author =       "T. N. Vijaykumar and Irith Pomeranz and Karl Cheng",
  title =        "Transient-fault recovery using simultaneous
                 multithreading",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "30",
  number =       "2",
  pages =        "87--98",
  month =        may,
  year =         "2002",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:50 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Wang:2002:SPE,
  author =       "Hong Wang and Perry H. Wang and Ross Dave Weldon and
                 Scott M. Ettinger and Hideki Saito and Milind Girkar
                 and Steve Shih-wei Liao and John P. Shen",
  title =        "Speculative Precomputation: Exploring the Use of
                 Multithreading for Latency Tolerance",
  journal =      j-INTEL-TECH-J,
  volume =       "6",
  number =       "1",
  pages =        "22--35",
  month =        feb,
  year =         "2002",
  ISSN =         "1535-766X",
  bibdate =      "Thu Feb 28 15:24:21 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://developer.intel.com/technology/itj/2002/volume06issue01/vol6iss1_hyper_threading_technology.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Intel Technology Journal",
}

@Article{Yan:2002:RCC,
  author =       "C. Yan",
  title =        "Race condition and concurrency safety of multithreaded
                 object-oriented programming in {Java}",
  journal =      "IEEE International Conference on Systems Man and
                 Cybernetics",
  volume =       "6",
  pages =        "??--??",
  year =         "2002",
  CODEN =        "????",
  ISSN =         "1062-922X",
  bibdate =      "Tue Apr 8 06:53:44 MDT 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
  xxpages =      "WA1Q3",
}

@Article{Zhai:2002:COSa,
  author =       "Antonia Zhai and Christopher B. Colohan and J. Gregory
                 Steffan and Todd C. Mowry",
  title =        "Compiler optimization of scalar value communication
                 between speculative threads",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "30",
  number =       "5",
  pages =        "171--183",
  month =        dec,
  year =         "2002",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:23 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Zhai:2002:COSb,
  author =       "Antonia Zhai and Christopher B. Colohan and J. Gregory
                 Steffan and Todd C. Mowry",
  title =        "Compiler optimization of scalar value communication
                 between speculative threads",
  journal =      j-SIGPLAN,
  volume =       "37",
  number =       "10",
  pages =        "171--183",
  month =        oct,
  year =         "2002",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu May 15 12:23:09 MDT 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Zuberek:2002:APB,
  author =       "W. M. Zuberek",
  title =        "Analysis of Performance Bottlenecks in Multithreaded
                 Multiprocessor Systems",
  journal =      j-FUND-INFO,
  volume =       "50",
  number =       "2",
  pages =        "223--241",
  month =        feb,
  year =         "2002",
  CODEN =        "FUMAAJ",
  ISSN =         "0169-2968 (print), 1875-8681 (electronic)",
  ISSN-L =       "0169-2968",
  bibdate =      "Sat Mar 5 16:59:23 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fundinfo2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Fundamenta Informaticae",
  journal-URL =  "http://content.iospress.com/journals/fundamenta-informaticae",
}

@Article{Aamodt:2003:FMO,
  author =       "Tor M. Aamodt and Pedro Marcuello and Paul Chow and
                 Antonio Gonz{\'a}lez and Per Hammarlund and Hong Wang
                 and John P. Shen",
  title =        "A framework for modeling and optimization of prescient
                 instruction prefetch",
  journal =      j-SIGMETRICS,
  volume =       "31",
  number =       "1",
  pages =        "13--24",
  month =        jun,
  year =         "2003",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/781027.781030",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Thu Jun 26 11:41:41 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper describes a framework for modeling
                 macroscopic program behavior and applies it to
                 optimizing prescient instruction prefetch --- a novel
                 technique that uses helper threads to improve
                 single-threaded application performance by performing
                 judicious and timely instruction prefetch. A helper
                 thread is initiated when the main thread encounters a
                 spawn point, and prefetches instructions starting at a
                 distant target point. The target identifies a code
                 region tending to incur I-cache misses that the main
                 thread is likely to execute soon, even though
                 intervening control flow may be unpredictable. The
                 optimization of spawn-target pair selections is
                 formulated by modeling program behavior as a Markov
                 chain based on profile statistics. Execution paths are
                 considered stochastic outcomes, and aspects of program
                 behavior are summarized via path expression mappings.
                 Mappings for computing reaching, and posteriori
                 probability; path length mean, and variance; and
                 expected path footprint are presented. These are used
                 with Tarjan's fast path algorithm to efficiently
                 estimate the benefit of spawn-target pair selections.
                 Using this framework we propose a spawn-target pair
                 selection algorithm for prescient instruction prefetch.
                 This algorithm has been implemented, and evaluated for
                 the Itanium Processor Family architecture. A limit
                 study finds 4.8\% to 17\% speedups on an in-order
                 simultaneous multithreading processor with eight
                 contexts, over nextline and streaming I-prefetch for a
                 set of benchmarks with high I-cache miss rates. The
                 framework in this paper is potentially applicable to
                 other thread speculation techniques.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
  keywords =     "analytical modeling; helper threads; instruction
                 prefetch; multithreading; optimization; path
                 expressions",
}

@Article{Abraham:2003:TSP,
  author =       "E. Abraham and F. S. de Boer and W. P. de Roever and
                 M. Steffen",
  title =        "A Tool-Supported Proof System for Multithreaded
                 {Java}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2852",
  pages =        "1--32",
  year =         "2003",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Nov 11 05:21:36 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Addison:2003:OIA,
  author =       "C. Addison and Y. Ren and M. van Waveren",
  title =        "{OpenMP} issues arising in the development of parallel
                 {BLAS} and {LAPACK} libraries",
  journal =      j-SCI-PROG,
  volume =       "11",
  number =       "2",
  pages =        "95--104",
  year =         "2003",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Mon Jan 12 06:28:15 MST 2004",
  bibsource =    "http://www.iospress.nl/site/html/10589244.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@Article{Almasi:2003:DCD,
  author =       "George Alm{\'a}si and C{\u{a}}lin Ca{\c{s}}caval and
                 Jos{\'e} G. Casta{\~n}os and Monty Denneau and Derek
                 Lieber and Jos{\'e} E. Moreira and Henry S. {Warren,
                 Jr.}",
  title =        "Dissecting {Cyclops}: a detailed analysis of a
                 multithreaded architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "1",
  pages =        "26--38",
  month =        mar,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:37 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Balis:2003:MSM,
  author =       "Bartosz Bali{\'s} and Marian Bubak and W{\l}odzimierz
                 Funika and Roland Wism{\"u}ller",
  title =        "A monitoring system for multithreaded applications",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "19",
  number =       "5",
  pages =        "641--650",
  month =        jul,
  year =         "2003",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Sat Jan 10 10:03:34 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
  remark =       "Tools for Program Development and Analysis. Best
                 papers from two Technical Sessions, at ICCS2001, San
                 Francisco, CA, USA, and ICCS2002, Amsterdam, The
                 Netherlands.",
}

@Article{Barekas:2003:MAO,
  author =       "Vasileios K. Barekas and Panagiotis E. Hadjidoukas and
                 Eleftherios D. Polychronopoulos and others",
  title =        "A Multiprogramming Aware {OpenMP} Implementation",
  journal =      j-SCI-PROG,
  volume =       "11",
  number =       "2",
  pages =        "133--141",
  year =         "2003",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Mon Jan 12 06:28:15 MST 2004",
  bibsource =    "http://www.iospress.nl/site/html/10589244.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@Article{Brightwell:2003:DIP,
  author =       "Ron Brightwell and Rolf Riesen and Arthur B. Maccabe",
  title =        "Design, Implementation, and Performance of {MPI} on
                 {Portals 3.0}",
  journal =      j-IJHPCA,
  volume =       "17",
  number =       "1",
  pages =        "7--20",
  month =        "Spring",
  year =         "2003",
  CODEN =        "IHPCFL",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Fri Nov 28 06:52:13 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Briguglio:2003:PPM,
  author =       "Sergio Briguglio and Beniamino Di Martino and Gregorio
                 Vlad",
  title =        "A performance-prediction model for {PIC} applications
                 on clusters of Symmetric MultiProcessors: Validation
                 with hierarchical {HPF $+$ OpenMP} implementation",
  journal =      j-SCI-PROG,
  volume =       "11",
  number =       "2",
  pages =        "159--176",
  year =         "2003",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Mon Jan 12 06:28:15 MST 2004",
  bibsource =    "http://www.iospress.nl/site/html/10589244.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@Article{Carr:2003:TPT,
  author =       "Steve Carr and Jean Mayo and Ching-Kuang Shene",
  title =        "{ThreadMentor}: a pedagogical tool for multithreaded
                 programming",
  journal =      j-JERIC,
  volume =       "3",
  number =       "1",
  pages =        "1--30",
  month =        mar,
  year =         "2003",
  CODEN =        "????",
  ISSN =         "1531-4278",
  bibdate =      "Tue Feb 3 18:43:37 MST 2004",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jeric/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Educational Resources in Computing
                 (JERIC)",
}

@InProceedings{Chakravarti:2003:ISM,
  author =       "A. Chakravarti and X. Wang and J. Hallstrom and G.
                 Baumgartner",
  booktitle =    "Proceedings of the International Conference on
                 Parallel Processing",
  title =        "Implementation of Strong Mobility for Multi-threaded
                 Agents in {Java}",
  publisher =    "????",
  address =      "????",
  pages =        "321--332",
  year =         "2003",
  CODEN =        "????",
  ISSN =         "0190-3918",
  bibdate =      "Tue Dec 2 18:51:43 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
}

@Article{Chen:2003:CSS,
  author =       "Peng-Sheng Chen and Ming-Yu Hung and Yuan-Shin Hwang
                 and Roy Dz-Ching Ju and Jenq Kuen Lee",
  title =        "Compiler support for speculative multithreading
                 architecture with probabilistic points-to analysis",
  journal =      j-SIGPLAN,
  pages =        "25--36",
  year =         "2003",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Dec 22 16:52:42 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Domani:2003:TLH,
  author =       "Tamar Domani and Gal Goldshtein and Elliot K. Kolodner
                 and Ethan Lewis and Erez Petrank and Dafna Sheinwald",
  title =        "Thread-Local Heaps for {Java}",
  journal =      j-SIGPLAN,
  volume =       "38",
  number =       "2s",
  pages =        "183--194",
  month =        feb,
  year =         "2003",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu May 15 12:23:14 MDT 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Edelstein:2003:FTM,
  author =       "Orit Edelstein and Eitan Farchi and Evgeny Goldin and
                 Yarden Nir and Gil Ratsaby and Shmuel Ur",
  title =        "Framework for testing multi-threaded {Java} programs",
  journal =      j-CCPE,
  volume =       "15",
  number =       "3--5",
  pages =        "485--499",
  month =        mar # "\slash " # apr,
  year =         "2003",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.654",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Jan 13 09:28:08 MST 2004",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "12 Feb 2003",
}

@Article{Fang:2003:DGO,
  author =       "Weijian Fang and Cho-Li Wang and Francis C. M. Lau",
  title =        "On the design of global object space for efficient
                 multi-threading {Java} computing on clusters",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "29",
  number =       "11--12",
  pages =        "1563--1587",
  month =        nov # "\slash " # dec,
  year =         "2003",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Wed Dec 24 09:07:29 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Gagnon:2003:EIT,
  author =       "E. Gagnon and L. Hendren",
  title =        "Effective Inline-Threaded Interpretation of {Java}
                 Bytecode Using Preparation Sequences",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2622",
  pages =        "170--184",
  year =         "2003",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 15 07:54:18 MDT 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Gould:2003:GLT,
  author =       "Nicholas I. M. Gould and Dominique Orban and Philippe
                 L. Toint",
  title =        "{GALAHAD}, a library of thread-safe {Fortran 90}
                 packages for large-scale nonlinear optimization",
  journal =      j-TOMS,
  volume =       "29",
  number =       "4",
  pages =        "353--372",
  month =        dec,
  year =         "2003",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/962437.962438",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Mon Jan 5 17:18:49 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We describe the design of version 1.0 of GALAHAD, a
                 library of Fortran 90 packages for large-scale
                 nonlinear optimization. The library particularly
                 addresses quadratic programming problems, containing
                 both interior point and active set algorithms, as well
                 as tools for preprocessing problems prior to solution.
                 It also contains an updated version of the venerable
                 nonlinear programming package, LANCELOT.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Grossman:2003:TSM,
  author =       "Dan Grossman",
  title =        "Type-safe multithreading in {Cyclone}",
  journal =      j-SIGPLAN,
  volume =       "38",
  number =       "3",
  pages =        "13--25",
  month =        mar,
  year =         "2003",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu May 15 12:23:16 MDT 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Heinlein:2003:ATS,
  author =       "C. Heinlein",
  title =        "Advanced Thread Synchronization in {Java} Using
                 Interaction Expressions",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2591",
  pages =        "345--365",
  year =         "2003",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 1 06:09:06 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Jin:2003:AMP,
  author =       "Haoqiang Jin and Gabriele Jost and Jerry Yan and
                 others",
  title =        "Automatic multilevel parallelization using {OpenMP}",
  journal =      j-SCI-PROG,
  volume =       "11",
  number =       "2",
  pages =        "177--190",
  year =         "2003",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Mon Jan 12 06:28:15 MST 2004",
  bibsource =    "http://www.iospress.nl/site/html/10589244.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@InProceedings{Kee:2003:POP,
  author =       "Yang-Suk Kee and Jin-Soo Kim and Soonhoi Ha",
  title =        "{ParADE}: An {OpenMP} Programming Environment for
                 {SMP} Cluster Systems",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10708#0;
                 http://www.sc-conference.org/sc2003/paperpdfs/pap130.pdf",
  abstract =     "Demand for programming environments to exploit
                 clusters of symmetric multiprocessors (SMPs) is
                 increasing. In this paper, we present a new programming
                 environment, called ParADE, to enable easy, portable,
                 and high-performance programming on SMP clusters. It is
                 an OpenMP programming environment on top of a
                 multi-threaded software distributed shared memory
                 (SDSM) system with a variant of home-based lazy release
                 consistency protocol. To boost performance, the runtime
                 system provides explicit message-passing primitives to
                 make it a hybrid-programming environment. Collective
                 communication primitives are used for the
                 synchronization and work-sharing directives associated
                 with small data structures, lessening the
                 synchronization overhead and avoiding the implicit
                 barriers of work-sharing directives. The OpenMP
                 translator bridges the gap between the OpenMP
                 abstraction and the hybrid programming interfaces of
                 the runtime system. The experiments with several NAS
                 benchmarks and applications on a Linux-based cluster
                 show promising results that ParADE overcomes the
                 performance problem of the conventional SDSM-based
                 OpenMP environment.",
  acknowledgement = ack-nhfb,
  keywords =     "hybrid programming; MPI; OpenMP; programming
                 environment; SMP cluster; software distributed shared
                 memory",
}

@Article{Keen:2003:CCP,
  author =       "Aaron W. Keen and Takashi Ishihara and Justin T. Maris
                 and Tiejun Li and Eugene F. Fodor and Ronald A.
                 Olsson",
  title =        "A comparison of concurrent programming and cooperative
                 multithreading",
  journal =      j-CCPE,
  volume =       "15",
  number =       "1",
  pages =        "27--53",
  month =        jan,
  year =         "2003",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.706",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Jan 13 09:28:05 MST 2004",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "6 Jan 2003",
}

@Article{Kepner:2003:MTF,
  author =       "Jeremy Kepner",
  title =        "A multi-threaded fast convolver for dynamically
                 parallel image filtering",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "63",
  number =       "3",
  pages =        "360--372",
  month =        mar,
  year =         "2003",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Tue Dec 16 16:10:40 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@InProceedings{Klasky:2003:GBP,
  author =       "Scott Alan Klasky and Stephane Ethier and Zhihong Lin
                 and Kevin Martins and Doug McCune and Ravi Samtaney",
  title =        "Grid-Based Parallel Data Streaming implemented for the
                 Gyrokinetic Toroidal Code",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10722#2;
                 http://www.sc-conference.org/sc2003/paperpdfs/pap207.pdf",
  abstract =     "We have developed a threaded parallel data streaming
                 approach using Globus to transfer multi-terabyte
                 simulation data from a remote supercomputer to the
                 scientist's home analysis/visualization cluster, as the
                 simulation executes, with negligible overhead. Data
                 transfer experiments show that this concurrent data
                 transfer approach is more favorable compared with
                 writing to local disk and then transferring this data
                 to be post-processed. The present approach is conducive
                 to using the grid to pipeline the simulation with
                 post-processing and visualization. We have applied this
                 method to the Gyrokinetic Toroidal Code (GTC), a
                 3-dimensional particle-in-cell code used to study
                 micro-turbulence in magnetic confinement fusion from
                 first principles plasma theory.",
  acknowledgement = ack-nhfb,
}

@Article{Koster:2003:TTI,
  author =       "Rainer Koster and Andrew P. Black and Jie Huang and
                 Jonathan Walpole and Calton Pu",
  title =        "Thread transparency in information flow middleware",
  journal =      j-SPE,
  volume =       "33",
  number =       "4",
  pages =        "321--349",
  month =        apr,
  year =         "2003",
  CODEN =        "SPEXBL",
  DOI =          "https://doi.org/10.1002/spe.510",
  ISSN =         "0038-0644 (print), 1097-024X (electronic)",
  ISSN-L =       "0038-0644",
  bibdate =      "Sat Nov 29 17:39:44 MST 2003",
  bibsource =    "http://www.interscience.wiley.com/jpages/0038-0644;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Software---Practice and Experience",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
  onlinedate =   "19 Feb 2003",
}

@Article{Koufaty:2003:HTN,
  author =       "David Koufaty and Deborah T. Marr",
  title =        "Hyperthreading Technology in the {NetBurst}
                 Microarchitecture",
  journal =      j-IEEE-MICRO,
  volume =       "23",
  number =       "2",
  pages =        "56--65",
  month =        mar # "\slash " # apr,
  year =         "2003",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2003.1196115",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Wed Apr 23 18:57:11 MDT 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://dlib.computer.org/mi/books/mi2003/pdf/m2056.pdf;
                 http://www.computer.org/micro/mi2003/m2056abs.htm",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

@Article{Kranzlmuller:2003:RAP,
  author =       "Dieter Kranzlm{\"u}ller and Peter Kacsuk and Jack
                 Dongarra and Jens Volkert",
  title =        "Recent Advances in Parallel Virtual Machine and
                 Message Passing Interface (Select papers from the
                 {EuroPVM/MPI 2002 Conference})",
  journal =      j-IJHPCA,
  volume =       "17",
  number =       "1",
  pages =        "3--5",
  month =        "Spring",
  year =         "2003",
  CODEN =        "IHPCFL",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Fri Nov 28 06:52:13 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Kreuzinger:2003:RTE,
  author =       "J. Kreuzinger and U. Brinkschulte and M. Pfeffer and
                 S. Uhrig and T. Ungerer",
  title =        "Real-time event-handling and scheduling on a
                 multithreaded {Java} microcontroller",
  journal =      j-MICROPROC-MICROSYS,
  volume =       "27",
  number =       "1",
  pages =        "19--31",
  year =         "2003",
  CODEN =        "MIMID5",
  ISSN =         "0141-9331 (print), 1872-9436 (electronic)",
  ISSN-L =       "0141-9331",
  bibdate =      "Tue Feb 18 07:16:21 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
  fjournal =     "Microprocessors and Microsystems",
}

@Article{Kwok:2003:EHC,
  author =       "Yu-Kwong Kwok",
  title =        "On Exploiting Heterogeneity for Cluster Based Parallel
                 Multithreading Using Task Duplication",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "25",
  number =       "1",
  pages =        "63--72",
  month =        may,
  year =         "2003",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Tue Dec 16 08:27:09 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.wkap.nl/journalhome.htm/0920-8542",
  URL =          "http://ipsapp009.kluweronline.com/content/getfile/5189/43/4/abstract.htm;
                 http://ipsapp009.kluweronline.com/content/getfile/5189/43/4/fulltext.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Mantel:2003:UAS,
  author =       "Heiko Mantel and Andrei Sabelfeld",
  title =        "A unifying approach to the security of distributed and
                 multi-threaded programs",
  journal =      j-J-COMP-SECUR,
  volume =       "11",
  number =       "4",
  pages =        "615--676",
  month =        "????",
  year =         "2003",
  CODEN =        "JCSIET",
  DOI =          "https://doi.org/10.3233/JCS-2003-11406",
  ISSN =         "0926-227X (print), 1875-8924 (electronic)",
  ISSN-L =       "0926-227X",
  bibdate =      "Tue May 24 06:22:14 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jcompsecur.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computer Security",
  journal-URL =  "http://content.iospress.com/journals/journal-of-computer-security",
}

@Article{Marowka:2003:EOT,
  author =       "Ami Marowka",
  title =        "Extending {OpenMP} for Task Parallelism",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "13",
  number =       "3",
  pages =        "341--??",
  month =        sep,
  year =         "2003",
  CODEN =        "PPLTEE",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Sat Nov 6 18:06:31 MST 2004",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Mattson:2003:HGO,
  author =       "Timothy G. Mattson",
  title =        "How good is {OpenMP}?",
  journal =      j-SCI-PROG,
  volume =       "11",
  number =       "2",
  pages =        "81--93",
  year =         "2003",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Mon Jan 12 06:28:15 MST 2004",
  bibsource =    "http://www.iospress.nl/site/html/10589244.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@InProceedings{McAuley:2003:CVC,
  author =       "Derek McAuley and Rolf Neugebauer",
  title =        "A case for virtual channel processors",
  crossref =     "ACM:2003:ATA",
  pages =        "237--242",
  year =         "2003",
  DOI =          "https://doi.org/10.1145/944747.944758",
  bibdate =      "Sat Oct 14 14:03:33 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Modern desktop and server computer systems use
                 multiple processors: general purpose CPU(s), graphic
                 processor (GPU), network processors (NP) on Network
                 Interface Cards (NICs), RAID controllers, and signal
                 processors on sound cards and modems. Some of these
                 processors traditionally have been special purpose
                 processors but there is a trend towards replacing some
                 of these with embedded general purpose processors. At
                 the same time main CPUs become more powerful; desktop
                 CPUs start featuring Simultaneous Multi-Threading
                 (SMT); and Symmetric Multi-Processing (SMP) systems are
                 widely used in server systems. However, the structure
                 of operating systems has not really changed to reflect
                 these trends --- different types of processors evolve
                 at different time scales (largely driven by market
                 forces) requiring significant changes to operating
                 systems kernels to reflect the appropriate tradeoffs.In
                 this position paper we propose to re-vitalise the old
                 idea of channel processors by encapsulating operating
                 system I/O subsystems in Virtual Channel Processors
                 (VCPs). VCPs perform I/O operations on behalf of an OS.
                 They provide similar development, performance, and
                 fault isolation as dedicated (embedded) I/O processors
                 do while offering the flexibility to split
                 functionality between the main processor(s) and
                 dedicated processors without affecting the rest of the
                 OS. If part of a VCP is executed on the main processor,
                 we propose to make use of virtual machine technology
                 and SMT/SMP features to isolate its performance from
                 that of the rest of the system and to protect the
                 system from faults within the VCP.",
  acknowledgement = ack-nhfb,
}

@Article{McDowell:2003:ISS,
  author =       "Luke K. McDowell and Susan J. Eggers and Steven D.
                 Gribble",
  title =        "Improving server software support for simultaneous
                 multithreaded processors",
  journal =      j-SIGPLAN,
  volume =       "38",
  number =       "10",
  pages =        "37--48",
  month =        oct,
  year =         "2003",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Dec 22 16:52:42 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Muller:2003:OCB,
  author =       "Matthias S. M{\"u}ller",
  title =        "An {OpenMP} compiler benchmark",
  journal =      j-SCI-PROG,
  volume =       "11",
  number =       "2",
  pages =        "125--131",
  year =         "2003",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Mon Jan 12 06:28:15 MST 2004",
  bibsource =    "http://www.iospress.nl/site/html/10589244.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "http://iospress.metapress.com/content/1058-9244",
}

@InProceedings{Nakajima:2003:PIS,
  author =       "Kengo Nakajima",
  title =        "Parallel Iterative Solvers of {GeoFEM} with Selective
                 Blocking Preconditioning for Nonlinear Contact Problems
                 on the {Earth Simulator}",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10703#1;
                 http://www.sc-conference.org/sc2003/paperpdfs/pap155.pdf",
  abstract =     "An efficient parallel iterative method with selective
                 blocking preconditioning has been developed for
                 symmetric multiprocessor (SMP) cluster architectures
                 with vector processors such as the Earth Simulator.
                 This method is based on a three-level hybrid parallel
                 programming model, which includes message passing for
                 inter-SMP node communication, loop directives by OpenMP
                 for intra-SMP node parallelization and vectorization
                 for each processing element (PE). This method provides
                 robust and smooth convergence and excellent vector and
                 parallel performance in 3D geophysical simulations with
                 contact conditions performed on the Earth Simulator.
                 The selective blocking preconditioning is much more
                 efficient than ILU(1) and ILU(2). Performance for the
                 complicated Southwest Japan model with more than 23 M
                 DOF on 10 SMP nodes (80 PEs) of the Earth Simulator was
                 161.7 GFLOPS, corresponding to 25.3\% of the peak
                 performance for hybrid programming model, and 190.4
                 GFLOPS (29.8\% of the peak performance) for flat MPI,
                 respectively.",
  acknowledgement = ack-nhfb,
}

@Article{Pang:2003:PSR,
  author =       "James C. Pang and Gholamali C. Shoja and Eric G.
                 Manning",
  title =        "Providing soft real-time quality of service guarantees
                 for {Java} threads",
  journal =      j-CCPE,
  volume =       "15",
  number =       "3--5",
  pages =        "521--538",
  month =        mar # "\slash " # apr,
  year =         "2003",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.663",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Jan 13 09:28:08 MST 2004",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "12 Feb 2003",
}

@Article{Park:2003:IMP,
  author =       "Il Park and Babak Falsafi and T. N. Vijaykumar",
  title =        "Implicitly-multithreaded processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "39--51",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Petitpierre:2003:JTC,
  author =       "C. Petitpierre",
  title =        "{Java} Threads Can Be Very Useful Building Blocks",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2604",
  pages =        "204",
  year =         "2003",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 1 06:09:06 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Pinilla:2003:UJT,
  author =       "Ruben Pinilla and Marisa Gil",
  title =        "{ULT}: a {Java} threads model for platform independent
                 execution",
  journal =      j-OPER-SYS-REV,
  volume =       "37",
  number =       "4",
  pages =        "48--62",
  month =        oct,
  year =         "2003",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Pozniansky:2003:EFD,
  author =       "Eli Pozniansky and Assaf Schuster",
  title =        "Efficient on-the-fly data race detection in
                 multithreaded {C++} programs",
  journal =      j-SIGPLAN,
  volume =       "38",
  number =       "10",
  pages =        "179--190",
  month =        oct,
  year =         "2003",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Dec 22 16:52:42 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Prabhu:2003:UTL,
  author =       "Manohar K. Prabhu and Kunle Olukotun",
  title =        "Using thread-level speculation to simplify manual
                 parallelization",
  journal =      j-SIGPLAN,
  volume =       "38",
  number =       "10",
  pages =        "1--12",
  month =        oct,
  year =         "2003",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Dec 22 16:52:42 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Price:2003:CAF,
  author =       "Gregory W. Price and David K. Lowenthal",
  title =        "A comparative analysis of fine-grain threads
                 packages",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "63",
  number =       "11",
  pages =        "1050--1063",
  month =        nov,
  year =         "2003",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Tue Dec 16 16:10:44 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Prvulovic:2003:RUT,
  author =       "Milos Prvulovic and Josep Torrellas",
  title =        "{ReEnact}: using thread-level speculation mechanisms
                 to debug data races in multithreaded codes",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "110--121",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Rajwar:2003:TET,
  author =       "Ravi Rajwar and James Goodman",
  title =        "Transactional Execution: Toward Reliable,
                 High-Performance Multithreading",
  journal =      j-IEEE-MICRO,
  volume =       "23",
  number =       "6",
  pages =        "117--125",
  month =        nov # "\slash " # dec,
  year =         "2003",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2003.1261395",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Sat Jan 31 07:23:55 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/comp/mags/mi/2003/06/m6117abs.htm;
                 http://csdl.computer.org/dl/mags/mi/2003/06/m6117.htm;
                 http://csdl.computer.org/dl/mags/mi/2003/06/m6117.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

@Book{Robbins:2003:USP,
  author =       "Kay A. Robbins and Steven Robbins",
  title =        "{UNIX} Systems programming: communication,
                 concurrency, and threads",
  publisher =    pub-PHPTR,
  address =      pub-PHPTR:adr,
  edition =      "Second",
  pages =        "xvii + 893",
  year =         "2003",
  ISBN =         "0-13-042411-0",
  ISBN-13 =      "978-0-13-042411-2",
  LCCN =         "QA76.76.O63 R6215 2003",
  bibdate =      "Wed Aug 20 21:08:15 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  annote =       "See \cite{Robbins:1996:PUP} for first edition.",
  keywords =     "operating systems (computers); UNIX (computer file)",
}

@Article{Robison:2003:MCN,
  author =       "Arch D. Robison",
  title =        "Memory Consistency and {.NET}",
  journal =      j-DDJ,
  volume =       "28",
  number =       "4",
  pages =        "46, 48--50",
  month =        apr,
  year =         "2003",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Jun 12 05:46:22 MDT 2003",
  bibsource =    "http://www.ddj.com/articles/2003/0304/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/documents/s=7827/ddj0304e/",
  abstract =     "Understanding the basics of memory consistency is
                 essential to writing multithreaded code that works on
                 both uniprocessors and multiprocessors.",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Solihin:2003:CPU,
  author =       "Yan Solihin and Jaejin Lee and Josep Torrellas",
  title =        "Correlation Prefetching with a User-Level Memory
                 Thread",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "14",
  number =       "6",
  pages =        "563--580",
  month =        jun,
  year =         "2003",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2003.1206504",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Wed Dec 24 10:02:07 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/comp/trans/td/2003/06/l0563abs.htm;
                 http://csdl.computer.org/dl/trans/td/2003/06/l0563.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Swanson:2003:ESI,
  author =       "Steven Swanson and Luke K. McDowell and Michael M.
                 Swift and Susan J. Eggers and Henry M. Levy",
  title =        "An evaluation of speculative instruction execution on
                 simultaneous multithreaded processors",
  journal =      j-TOCS,
  volume =       "21",
  number =       "3",
  pages =        "314--340",
  month =        aug,
  year =         "2003",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:13:26 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Thulasiram:2003:PEM,
  author =       "Ruppa K. Thulasiram and Parimala Thulasiraman",
  title =        "Performance Evaluation of a Multithreaded {Fast
                 Fourier Transform} Algorithm for Derivative Pricing",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "26",
  number =       "1",
  pages =        "43--58",
  month =        aug,
  year =         "2003",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Tue Dec 16 08:27:10 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.wkap.nl/journalhome.htm/0920-8542",
  URL =          "http://ipsapp009.kluweronline.com/content/getfile/5189/46/4/abstract.htm;
                 http://ipsapp009.kluweronline.com/content/getfile/5189/46/4/fulltext.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Timmerman:2003:EWC,
  author =       "Martin Timmerman",
  title =        "Examining {Windows CE .NET}",
  journal =      j-DDJ,
  volume =       "28",
  number =       "2",
  pages =        "62, 64",
  month =        feb,
  year =         "2003",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Jun 12 05:46:21 MDT 2003",
  bibsource =    "http://www.ddj.com/articles/2003/0302/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/documents/s=7790/ddj0302h/",
  abstract =     "Martin examines Windows CE .NET's thread handling and
                 advanced interrupt handling capabilities, as well as
                 its synchronization mechanisms and network stack
                 performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Tremblay:2003:IEP,
  author =       "G. Tremblay and C. J. Morrone and J. N. Amaral and G.
                 R. Gao",
  title =        "Implementation of the {EARTH} programming model on
                 {SMP} clusters: a multi-threaded language and runtime
                 system",
  journal =      j-CCPE,
  volume =       "15",
  number =       "9",
  pages =        "821--844",
  day =          "10",
  month =        aug,
  year =         "2003",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.729",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Jan 13 09:28:12 MST 2004",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "14 Jul 2003",
}

@Article{Tseng:2003:DST,
  author =       "Y. Tseng and R. F. DeMara and P. J. Wilder",
  title =        "Distributed-sum termination detection supporting
                 multithreaded execution",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "29",
  number =       "7",
  pages =        "953--968",
  month =        jul,
  year =         "2003",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Wed Dec 24 09:07:26 MST 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Ungerer:2003:SPE,
  author =       "Theo Ungerer and Borut Robi{\v{c}} and Jurij
                 {\v{S}}ilc",
  title =        "A survey of processors with explicit multithreading",
  journal =      j-COMP-SURV,
  volume =       "35",
  number =       "1",
  pages =        "29--63",
  month =        mar,
  year =         "2003",
  CODEN =        "CMSVAN",
  DOI =          "https://doi.org/10.1145/641865.641867",
  ISSN =         "0360-0300 (print), 1557-7341 (electronic)",
  ISSN-L =       "0360-0300",
  bibdate =      "Thu Jun 19 10:18:52 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/surveys/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Hardware multithreading is becoming a generally
                 applied technique in the next generation of
                 microprocessors. Several multithreaded processors are
                 announced by industry or already into production in the
                 areas of high-performance microprocessors, media, and
                 network processors. A multithreaded processor is able
                 to pursue two or more threads of control in parallel
                 within the processor pipeline. The contexts of two or
                 more threads of control are often stored in separate
                 on-chip register sets. Unused instruction slots, which
                 arise from latencies during the pipelined execution of
                 single-threaded programs by a contemporary
                 microprocessor, are filled by instructions of other
                 threads within a multithreaded processor. The execution
                 units are multiplexed between the thread contexts that
                 are loaded in the register sets. Underutilization of a
                 superscalar processor due to missing instruction-level
                 parallelism can be overcome by simultaneous
                 multithreading, where a processor can issue multiple
                 instructions from multiple threads each cycle.
                 Simultaneous multithreaded processors combine the
                 multithreading technique with a wide-issue superscalar
                 processor to utilize a larger part of the issue
                 bandwidth by issuing instructions from different
                 threads simultaneously. Explicit multithreaded
                 processors are multithreaded processors that apply
                 processes or operating system threads in their hardware
                 thread slots. These processors optimize the throughput
                 of multiprogramming workloads rather than single-thread
                 performance. We distinguish these processors from
                 implicit multithreaded processors that utilize
                 thread-level speculation by speculatively executing
                 compiler- or machine-generated threads of control that
                 are part of a single sequential program. This survey
                 paper explains and classifies the explicit
                 multithreading techniques in research and in commercial
                 microprocessors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Computing Surveys",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J204",
  keywords =     "Blocked multithreading; interleaved multithreading;
                 simultaneous multithreading",
}

@Article{vonPraun:2003:SCA,
  author =       "Christoph von Praun and Thomas R. Gross",
  title =        "Static conflict analysis for multi-threaded
                 object-oriented programs",
  journal =      j-SIGPLAN,
  volume =       "38",
  number =       "5",
  pages =        "115--128",
  month =        may,
  year =         "2003",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Oct 11 12:45:00 MDT 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@InProceedings{Watcharawitch:2003:MME,
  author =       "Panit Watcharawitch and Simon W. Moore",
  title =        "{MulTEP}: {MulTithreaded Embedded Processors}",
  crossref =     "Anonymous:2003:CCV",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Fri Jan 09 17:02:42 2004",
  bibsource =    "http://www.coolchips.org/cool6/pdfDocuments/WEB05-Program_COOL6_2003.4.1.pdf;
                 https://www.math.utah.edu/pub/tex/bib/cool-chips.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{White:2003:UTL,
  author =       "Tom White",
  title =        "Using Thread-Local Variables In {Java}",
  journal =      j-DDJ,
  volume =       "28",
  number =       "7",
  pages =        "42, 44--46",
  month =        jul,
  year =         "2003",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Jun 12 05:46:24 MDT 2003",
  bibsource =    "http://www.ddj.com/articles/2003/0307/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/ftp/2003/2003_07/thread.txt;
                 http://www.ddj.com/ftp/2003/2003_07/thread.zip",
  abstract =     "Java's ThreadLocal class provides a powerful,
                 easy-to-use way to write efficient code that is safe
                 for multithreaded access. Additional resources include
                 thread.txt (listings) and thread.zip (source code).",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Yong:2003:AMC,
  author =       "Xie Yong and Hsu Wen-Jing",
  title =        "Aligned Multithreaded Computations and Their
                 Scheduling with {FAB} Performance Guarantees",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "13",
  number =       "3",
  pages =        "353--??",
  month =        sep,
  year =         "2003",
  CODEN =        "PPLTEE",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Thu Jan 06 09:41:03 2005",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Bhowmik:2004:GCF,
  author =       "Anasua Bhowmik and Manoj Franklin",
  title =        "A General Compiler Framework for Speculative
                 Multithreaded Processors",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "15",
  number =       "8",
  pages =        "713--724",
  month =        aug,
  year =         "2004",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2004.26",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Sat Dec 11 16:24:15 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/dl/trans/td/2004/08/l0713.htm;
                 http://csdl.computer.org/dl/trans/td/2004/08/l0713.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Bouchenak:2004:EIE,
  author =       "S. Bouchenak and D. Hagimont and S. Krakowiak and N.
                 De Palma and F. Boyer",
  title =        "Experiences implementing efficient {Java} thread
                 serialization, mobility and persistence",
  journal =      j-SPE,
  volume =       "34",
  number =       "4",
  pages =        "355--393",
  day =          "10",
  month =        apr,
  year =         "2004",
  CODEN =        "SPEXBL",
  DOI =          "https://doi.org/10.1002/spe.569",
  ISSN =         "0038-0644 (print), 1097-024X (electronic)",
  ISSN-L =       "0038-0644",
  bibdate =      "Sat Apr 16 07:26:28 MDT 2005",
  bibsource =    "http://www.interscience.wiley.com/jpages/0038-0644;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Software---Practice and Experience",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
  onlinedate =   "5 Jan 2004",
}

@Article{Bucker:2004:TUC,
  author =       "H. M. B{\"u}cker and B. Lang and H. J. Pflug and A.
                 Vehreschild",
  title =        "Threads in an Undergraduate Course: a {Java} Example
                 Illuminating Different Multithreading Approaches",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "3044",
  pages =        "882--891",
  year =         "2004",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Sep 28 15:27:39 MDT 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
  journal-URL =  "http://link.springer.com/bookseries/558",
}

@Article{Chang:2004:TSP,
  author =       "B. M. Chang and J. D. Choi",
  title =        "Thread-Sensitive Points-to Analysis for Multithreaded
                 {Java} Programs",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "3280",
  pages =        "945--954",
  year =         "2004",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Mon Dec 6 06:44:22 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
  journal-URL =  "http://link.springer.com/bookseries/558",
}

@Article{Chaudhuri:2004:SAN,
  author =       "Mainak Chaudhuri and Mark Heinrich",
  title =        "{SMTp}: {An Architecture} for {Next-generation
                 Scalable Multi-threading}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "124--135",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Flanagan:2004:ADA,
  author =       "Cormac Flanagan and Stephen N. Freund",
  title =        "Atomizer: a dynamic atomicity checker for
                 multithreaded programs",
  journal =      j-SIGPLAN,
  volume =       "39",
  number =       "1",
  pages =        "256--267",
  month =        jan,
  year =         "2004",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Apr 12 09:38:12 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Flanagan:2004:EPA,
  author =       "Cormac Flanagan and Stephen N. Freund and Shaz
                 Qadeer",
  title =        "Exploiting purity for atomicity",
  journal =      j-SIGSOFT,
  volume =       "29",
  number =       "4",
  pages =        "221--231",
  month =        jul,
  year =         "2004",
  CODEN =        "SFENDP",
  DOI =          "https://doi.org/10.1145/1013886.1007543",
  ISSN =         "0163-5948 (print), 1943-5843 (electronic)",
  ISSN-L =       "0163-5948",
  bibdate =      "Wed Aug 1 17:14:35 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigsoft2000.bib",
  abstract =     "The notion that certain procedures are atomic is a
                 fundamental correctness property of many multithreaded
                 software systems. A procedure is atomic if for every
                 execution there is an equivalent serial execution in
                 which the actions performed by any thread while
                 executing the atomic procedure are not interleaved with
                 actions of other threads. Several existing tools verify
                 atomicity by using commutativity of actions to show
                 that every execution reduces to a corresponding serial
                 execution. However, experiments with these tools have
                 highlighted a number of interesting procedures that,
                 while intuitively atomic, are not reducible. In this
                 paper, we exploit the notion of pure code blocks to
                 verify the atomicity of such irreducible procedures. If
                 a pure block terminates normally, then its evaluation
                 does not change the program state, and hence these
                 evaluation steps can be removed from the program trace
                 before reduction. We develop a static analysis for
                 atomicity based on this insight, and we illustrate this
                 analysis on a number of interesting examples that could
                 not be verified using earlier tools based purely on
                 reduction. The techniques developed in this paper may
                 also be applicable in other approaches for verifying
                 atomicity, such as model checking and dynamic
                 analysis.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGSOFT Software Engineering Notes",
  journal-URL =  "https://dl.acm.org/citation.cfm?id=J728",
}

@Article{Georges:2004:JPR,
  author =       "A. Georges and M. Christiaens and M. Ronsse and K. {De
                 Bosschere}",
  title =        "{JaRec}: a portable record\slash replay environment
                 for multi-threaded {Java} applications",
  journal =      j-SPE,
  volume =       "34",
  number =       "6",
  pages =        "523--547",
  month =        may,
  year =         "2004",
  CODEN =        "SPEXBL",
  DOI =          "https://doi.org/10.1002/spe.579",
  ISSN =         "0038-0644 (print), 1097-024X (electronic)",
  ISSN-L =       "0038-0644",
  bibdate =      "Sat Apr 16 07:26:29 MDT 2005",
  bibsource =    "http://www.interscience.wiley.com/jpages/0038-0644;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Software---Practice and Experience",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
  onlinedate =   "24 Feb 2004",
}

@Article{Johnson:2004:MCP,
  author =       "Troy A. Johnson and Rudolf Eigenmann and T. N.
                 Vijaykumar",
  title =        "Min-cut program decomposition for thread-level
                 speculation",
  journal =      j-SIGPLAN,
  volume =       "39",
  number =       "6",
  pages =        "59--70",
  month =        may,
  year =         "2004",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Dec 2 05:49:55 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Johnston:2004:ADP,
  author =       "Wesley M. Johnston and J. R. Paul Hanna and Richard J.
                 Millar",
  title =        "Advances in dataflow programming languages",
  journal =      j-COMP-SURV,
  volume =       "36",
  number =       "1",
  pages =        "1--34",
  month =        mar,
  year =         "2004",
  CODEN =        "CMSVAN",
  DOI =          "https://doi.org/10.1145/1013208.1013209",
  ISSN =         "0360-0300 (print), 1557-7341 (electronic)",
  ISSN-L =       "0360-0300",
  bibdate =      "Thu Jun 19 10:19:47 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/surveys/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Many developments have taken place within dataflow
                 programming languages in the past decade. In
                 particular, there has been a great deal of activity and
                 advancement in the field of dataflow visual programming
                 languages. The motivation for this article is to review
                 the content of these recent developments and how they
                 came about. It is supported by an initial review of
                 dataflow programming in the 1970s and 1980s that led to
                 current topics of research. It then discusses how
                 dataflow programming evolved toward a hybrid von
                 Neumann dataflow formulation, and adopted a more
                 coarse-grained approach. Recent trends toward dataflow
                 visual programming languages are then discussed with
                 reference to key graphical dataflow languages and their
                 development environments. Finally, the article details
                 four key open topics in dataflow programming
                 languages.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Computing Surveys",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J204",
  keywords =     "co-ordination languages; component software; data flow
                 visual programming; Dataflow; graphical programming;
                 multithreading; software engineering",
}

@Article{Kalla:2004:IPC,
  author =       "Ron Kalla and Balaram Sinharoy and Joel M. Tendler",
  title =        "{IBM Power5} Chip: a Dual-Core Multithreaded
                 Processor",
  journal =      j-IEEE-MICRO,
  volume =       "24",
  number =       "2",
  pages =        "40--47",
  month =        mar # "\slash " # apr,
  year =         "2004",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2004.1289290",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Sat Dec 11 17:59:16 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/comp/mags/mi/2004/02/m2040abs.htm;
                 http://csdl.computer.org/dl/mags/mi/2004/02/m2040.htm;
                 http://csdl.computer.org/dl/mags/mi/2004/02/m2040.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

@Article{Kapil:2004:CMP,
  author =       "Sanjiv Kapil and Harlan McGhan and Jesse Lawrendra",
  title =        "A Chip Multithreaded Processor for Network-Facing
                 Workloads",
  journal =      j-IEEE-MICRO,
  volume =       "24",
  number =       "2",
  pages =        "20--30",
  month =        mar # "\slash " # apr,
  year =         "2004",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2004.1289288",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Sat Dec 11 17:59:16 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/comp/mags/mi/2004/02/m2020abs.htm;
                 http://csdl.computer.org/dl/mags/mi/2004/02/m2020.htm;
                 http://csdl.computer.org/dl/mags/mi/2004/02/m2020.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

@Article{Kee:2004:MMM,
  author =       "Yang-Suk Kee and Jin-Soo Kim and Soonhoi Ha",
  title =        "Memory management for multi-threaded software {DSM}
                 systems",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "30",
  number =       "1",
  pages =        "121--138",
  month =        jan,
  year =         "2004",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Sun Nov 7 05:53:52 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/01678191",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Krashinsky:2004:VTAa,
  author =       "Ronny Krashinsky and Christopher Batten and Mark
                 Hampton and Steve Gerding and Brian Pharris and Jared
                 Casper and Krste Asanovic",
  title =        "The Vector-Thread Architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "52--63",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Krashinsky:2004:VTAb,
  author =       "Ronny Krashinsky and Christopher Batten and Mark
                 Hampton and Steve Gerding and Brian Pharris and Jared
                 Casper and Krste Asanovic",
  title =        "The Vector-Thread Architecture",
  journal =      j-IEEE-MICRO,
  volume =       "24",
  number =       "6",
  pages =        "84--90",
  month =        nov # "\slash " # dec,
  year =         "2004",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2004.90",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Wed Apr 20 08:11:28 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/dl/mags/mi/2004/06/m6084.htm;
                 http://csdl.computer.org/dl/mags/mi/2004/06/m6084.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

@Article{Kumar:2004:AST,
  author =       "Nagendra J. Kumar and Siddhartha Shivshankar and
                 Alexander G. Dean",
  title =        "Asynchronous software thread integration for efficient
                 software",
  journal =      j-SIGPLAN,
  volume =       "39",
  number =       "7",
  pages =        "37--46",
  month =        jul,
  year =         "2004",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Dec 2 05:49:55 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Kumar:2004:SIH,
  author =       "Rakesh Kumar and Dean M. Tullsen and Parthasarathy
                 Ranganathan and Norman P. Jouppi and Keith I. Farkas",
  title =        "Single-{ISA} Heterogeneous Multi-Core Architectures
                 for Multithreaded Workload Performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "64--75",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Lemon:2004:MCR,
  author =       "Oliver Lemon and Alexander Gruenstein",
  title =        "Multithreaded context for robust conversational
                 interfaces: {Context-sensitive} speech recognition and
                 interpretation of corrective fragments",
  journal =      j-TOCHI,
  volume =       "11",
  number =       "3",
  pages =        "241--267",
  month =        sep,
  year =         "2004",
  CODEN =        "ATCIF4",
  ISSN =         "1073-0516 (print), 1557-7325 (electronic)",
  ISSN-L =       "1073-0516",
  bibdate =      "Thu Nov 4 08:26:36 MST 2004",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tochi/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tochi.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer-Human Interaction",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J756",
}

@InProceedings{Li:2004:FRT,
  author =       "S. Q. Li and H. Y. Chen and Y. X. Su",
  title =        "A Framework of Reachability Testing for {Java}
                 Multithread Programs",
  booktitle =    "IEEE International Conference on Systems Man and
                 Cybernetics",
  volume =       "3",
  pages =        "2730--2734",
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1062-922X",
  bibdate =      "Thu Mar 24 17:43:34 MST 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
}

@Article{Maris:2004:CCP,
  author =       "Justin T. Maris and Aaron W. Keen and Takashi Ishihara
                 and Ronald A. Olsson",
  title =        "A comparison of concurrent programming and cooperative
                 multithreading under load balancing applications",
  journal =      j-CCPE,
  volume =       "16",
  number =       "4",
  pages =        "345--369",
  day =          "10",
  month =        apr,
  year =         "2004",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.751",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat May 14 11:30:53 MDT 2005",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "19 Jan 2004",
}

@Article{Marowka:2004:OOA,
  author =       "Ami Marowka and Zhenying Liu and Barbara Chapman",
  title =        "{OpenMP}-oriented applications for distributed shared
                 memory architectures",
  journal =      j-CCPE,
  volume =       "16",
  number =       "4",
  pages =        "371--384",
  day =          "10",
  month =        apr,
  year =         "2004",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.752",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat May 14 11:30:53 MDT 2005",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "19 Jan 2004",
}

@Article{Martin:2004:HPA,
  author =       "Mar{\'\i}a J. Mart{\'\i}n and Marta Parada and
                 Ram{\'o}n Doallo",
  title =        "High Performance Air Pollution Simulation Using
                 {OpenMP}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "28",
  number =       "3",
  pages =        "311--321",
  month =        jun,
  year =         "2004",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Dec 4 12:39:13 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.wkap.nl/journalhome.htm/0920-8542",
  URL =          "http://ipsapp008.kluweronline.com/IPS/content/ext/x/J/5189/I/54/A/5/abstract.htm",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Michael:2004:SLF,
  author =       "Maged M. Michael",
  title =        "Scalable lock-free dynamic memory allocation",
  journal =      j-SIGPLAN,
  volume =       "39",
  number =       "6",
  pages =        "35--46",
  month =        may,
  year =         "2004",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/996841.996848",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Dec 2 05:49:55 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Dynamic memory allocators (malloc/free) rely on mutual
                 exclusion locks for protecting the consistency of their
                 shared data structures under multithreading. The use of
                 locking has many disadvantages with respect to
                 performance, availability, robustness, and programming
                 flexibility. A lock-free memory allocator guarantees
                 progress regardless of whether some threads are delayed
                 or even killed and regardless of scheduling policies.
                 This paper presents a completely lock-free memory
                 allocator. It uses only widely-available operating
                 system support and hardware atomic instructions. It
                 offers guaranteed availability even under arbitrary
                 thread termination and crash-failure, and it is immune
                 to deadlock regardless of scheduling policies, and
                 hence it can be used even in interrupt handlers and
                 real-time applications without requiring special
                 scheduler support. Also, by leveraging some high-level
                 structures from Hoard, our allocator is highly
                 scalable, limits space blowup to a constant factor, and
                 is capable of avoiding false sharing. In addition, our
                 allocator allows finer concurrency and much lower
                 latency than Hoard. We use PowerPC shared memory
                 multiprocessor systems to compare the performance of
                 our allocator with the default AIX 5.1 libc malloc, and
                 two widely-used multithread allocators, Hoard and
                 Ptmalloc. Our allocator outperforms the other
                 allocators in virtually all cases and often by
                 substantial margins, under various levels of
                 parallelism and allocation patterns. Furthermore, our
                 allocator also offers the lowest contention-free
                 latency among the allocators by significant margins.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Omma:2004:BMA,
  author =       "M. Omma",
  title =        "On building multithreaded applications",
  journal =      j-IEEE-DISTRIB-SYST-ONLINE,
  volume =       "5",
  number =       "4",
  pages =        "1--3",
  month =        apr,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/MDSO.2004.1301256",
  ISSN =         "1541-4922 (print), 1558-1683 (electronic)",
  ISSN-L =       "1541-4922",
  bibdate =      "Fri Jul 15 17:50:15 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/iel5/8968/28913/01301256.pdf?isnumber=28913&prod=JNL&arnumber=1301256&arSt=+1&ared=+3&arAuthor=Omma%2C+M.;
                 http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=28913&arnumber=1301256&count=5&index=3",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Distributed Systems Online",
}

@Article{Pfeffer:2004:RTG,
  author =       "M. Pfeffer and T. Ungerer and S. Fuhrmann and J.
                 Kreuzinger and U. Brinkschulte",
  title =        "Real-Time Garbage Collection for a Multithreaded
                 {Java} Microcontroller",
  journal =      j-REAL-TIME-SYST,
  volume =       "26",
  number =       "1",
  pages =        "89--106",
  year =         "2004",
  CODEN =        "RESYE9",
  ISSN =         "0922-6443",
  ISSN-L =       "0922-6443",
  bibdate =      "Mon Jan 5 17:25:38 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
  fjournal =     "Real-Time Systems",
}

@Article{Robatmili:2004:TSI,
  author =       "B. Robatmili and N. Yazdani and S. Sardashti and M.
                 Nourani",
  title =        "Thread-Sensitive Instruction Issue for {SMT}
                 Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "5--5",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Simultaneous Multi Threading (SMT) is a processor
                 design method in which concurrent hardware threads
                 share processor resources like functional units and
                 memory. The scheduling complexity and performance of an
                 SMT processor depend on the topology used in the fetch
                 and issue stages. In this paper, we propose a thread
                 sensitive issue policy for a partitioned SMT processor
                 which is based on a thread metric. We propose the
                 number of ready-to-issue instructions of each thread as
                 priority metric. To evaluate our method, we have
                 developed a reconfigurable SMT-simulator on top of the
                 SimpleScalar Toolset. We simulated our modeled
                 processor under several workloads composed of SPEC
                 benchmarks. Experimental results show around 30\%
                 improvement compared to the conventional OLDEST\_FIRST
                 mixed topology issue policy. Additionally, the hardware
                 implementation of our architecture with this metric in
                 issue stage is quite simple.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Clocks; Delay; Frequency; Intrusion detection;
                 Laboratories; Logic; Processor scheduling;
                 Surface-mount technology; Topology",
}

@Article{Roth:2004:MTC,
  author =       "Marcus Roth and Gerrit Voss and Dirk Reiners",
  title =        "Multi-threading and clustering for scene graph
                 systems",
  journal =      j-COMPUTERS-AND-GRAPHICS,
  volume =       "28",
  number =       "1",
  pages =        "63--66",
  month =        feb,
  year =         "2004",
  CODEN =        "COGRD2",
  ISSN =         "0097-8493 (print), 1873-7684 (electronic)",
  ISSN-L =       "0097-8493",
  bibdate =      "Tue Jan 27 12:04:28 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/00978493",
  acknowledgement = ack-nhfb,
  fjournal =     "Computers and Graphics",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00978493",
}

@Article{Sanden:2004:CJT,
  author =       "B. Sanden",
  title =        "Coping with {Java} Threads: {Java} works for many
                 kinds of concurrent software, but it was not designed
                 for safety-critical real-time applications and does not
                 protect the programmer from the pitfalls associated
                 with multithreading",
  journal =      j-COMPUTER,
  volume =       "37",
  number =       "4",
  pages =        "20--27",
  year =         "2004",
  CODEN =        "CPTRB4",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Mon May 17 14:50:36 MDT 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
}

@Article{Shin:2004:NAD,
  author =       "Chulho Shin and Seong-Won Lee and Jean-Luc Gaudiot",
  title =        "The Need for Adaptive Dynamic Thread Scheduling in
                 Simultaneous Multithreading",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "14",
  number =       "3/4",
  pages =        "327--??",
  month =        sep # "\slash " # dec,
  year =         "2004",
  CODEN =        "PPLTEE",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  bibdate =      "Thu Jul 7 07:41:25 MDT 2005",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Thulasiraman:2004:FGL,
  author =       "Parimala Thulasiraman and Ashfaq A. Khokhar and Gerd
                 Heber and Guang R. Gao",
  title =        "A fine-grain load-adaptive algorithm of the {$2$D}
                 discrete wavelet transform for multithreaded
                 architectures",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "64",
  number =       "1",
  pages =        "68--78",
  month =        jan,
  year =         "2004",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Sat Dec 4 15:15:08 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Tolmach:2004:IFL,
  author =       "Andrew Tolmach and Sergio Antoy and Marius Nita",
  title =        "Implementing functional logic languages using multiple
                 threads and stores",
  journal =      j-SIGPLAN,
  volume =       "39",
  number =       "9",
  pages =        "90--102",
  month =        sep,
  year =         "2004",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Dec 2 05:49:56 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Vrenios:2004:PPC,
  author =       "A. Vrenios",
  title =        "{Parallel Programming in C with MPI and OpenMP} [Book
                 Review]",
  journal =      j-IEEE-DISTRIB-SYST-ONLINE,
  volume =       "5",
  number =       "1",
  pages =        "7.1--7.3",
  month =        "????",
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1541-4922 (print), 1558-1683 (electronic)",
  ISSN-L =       "1541-4922",
  bibdate =      "Fri Jul 15 17:50:13 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/iel5/8968/28452/01270716.pdf?isnumber=28452&prod=JNL&arnumber=1270716&arSt=+7.1&ared=+7.3&arAuthor=Vrenios%2C+A.;
                 http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=28452&arnumber=1270716&count=8&index=5",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Distributed Systems Online",
}

@Article{Wang:2004:HTVa,
  author =       "Perry H. Wang and Jamison D. Collins and Hong Wang and
                 Dongkeun Kim and Bill Greene and Kai-Ming Chan and
                 Aamir B. Yunus and Terry Sych and Stephen F. Moore and
                 John P. Shen",
  title =        "Helper threads via virtual multithreading on an
                 experimental {Itanium-2} processor-based platform",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "5",
  pages =        "144--155",
  month =        dec,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:24 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Wang:2004:HTVb,
  author =       "Perry H. Wang and Jamison D. Collins and Hong Wang and
                 Dongkeun Kim and Bill Greene and Kai-Ming Chan and
                 Aamir B. Yunus and Terry Sych and Stephen F. Moore and
                 John P. Shen",
  title =        "Helper threads via virtual multithreading on an
                 experimental {Itanium-2} processor-based platform",
  journal =      j-SIGPLAN,
  volume =       "39",
  number =       "11",
  pages =        "144--155",
  month =        nov,
  year =         "2004",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Apr 12 09:38:13 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Wang:2004:HTVc,
  author =       "Perry H. Wang and Jamison D. Collins and Hong Wang and
                 Dongkeun Kim and Bill Greene and Kai-Ming Chan and
                 Aamir B. Yunus and Terry Sych and Stephen F. Moore and
                 John P. Shen",
  title =        "Helper threads via virtual multithreading on an
                 experimental {Itanium-2} processor-based platform",
  journal =      j-OPER-SYS-REV,
  volume =       "38",
  number =       "5",
  pages =        "144--155",
  month =        dec,
  year =         "2004",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
}

@Article{Wang:2004:HTVd,
  author =       "Perry H. Wang and Jamison D. Collins and Hong Wang and
                 Dongkeun Kim and Bill Greene and Kai-Ming Chan and
                 Aamir B. Yunus and Terry Sych and Stephen F. Moore and
                 John P. Shen",
  title =        "Helper Threads via Virtual Multithreading",
  journal =      j-IEEE-MICRO,
  volume =       "24",
  number =       "6",
  pages =        "74--82",
  month =        nov # "\slash " # dec,
  year =         "2004",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2004.75",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Wed Apr 20 08:11:28 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/dl/mags/mi/2004/06/m6074.htm;
                 http://csdl.computer.org/dl/mags/mi/2004/06/m6074.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

@Article{Zhuang:2004:BRA,
  author =       "Xiaotong Zhuang and Santosh Pande",
  title =        "Balancing register allocation across threads for a
                 multithreaded network processor",
  journal =      j-SIGPLAN,
  volume =       "39",
  number =       "6",
  pages =        "289--300",
  month =        may,
  year =         "2004",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Dec 2 05:49:55 MST 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Abraham:2005:ABP,
  author =       "Erika {\'A}brah{\'a}m and Frank S. de Boer and
                 Willem-Paul de Roever and Martin Steffen",
  title =        "An assertion-based proof system for multithreaded
                 {Java}",
  journal =      j-THEOR-COMP-SCI,
  volume =       "331",
  number =       "2--3",
  pages =        "251--290",
  day =          "25",
  month =        feb,
  year =         "2005",
  CODEN =        "TCSCDI",
  ISSN =         "0304-3975 (print), 1879-2294 (electronic)",
  ISSN-L =       "0304-3975",
  bibdate =      "Fri Jul 8 14:05:15 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/03043975",
  acknowledgement = ack-nhfb,
  fjournal =     "Theoretical Computer Science",
  journal-URL =  "http://www.sciencedirect.com/science/journal/03043975",
}

@Article{Anonymous:2005:ECS,
  author =       "Anonymous",
  title =        "Errata: {{\em Characterization of Simultaneous
                 Multithreading (SMT) Efficiency in POWER5}}",
  journal =      j-IBM-JRD,
  volume =       "49",
  number =       "6",
  pages =        "1003--??",
  month =        nov,
  year =         "2005",
  CODEN =        "IBMJAE",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Fri Feb 9 21:39:23 MST 2007",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.research.ibm.com/journal/",
  note =         "See \cite{Mathis:2005:CSM}.",
  URL =          "http://www.research.ibm.com/journal/rd/496/errata.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Journal of Research and Development",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
  ordernumber =  "G322-0245-00",
}

@Article{Barabash:2005:PIM,
  author =       "Katherine Barabash and Ori Ben-Yitzhak and Irit Goft
                 and Elliot K. Kolodner and Victor Leikehman and Yoav
                 Ossia and Avi Owshanko and Erez Petrank",
  title =        "A parallel, incremental, mostly concurrent garbage
                 collector for servers",
  journal =      j-TOPLAS,
  volume =       "27",
  number =       "6",
  pages =        "1097--1146",
  month =        nov,
  year =         "2005",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/1108970.1108972",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Wed Jan 11 05:23:15 MST 2006",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multithreaded applications with multigigabyte heaps
                 running on modern servers provide new challenges for
                 garbage collection (GC). The challenges for
                 ``server-oriented'' GC include: ensuring short pause
                 times on a multigigabyte heap while minimizing
                 throughput penalty, good scaling on multiprocessor
                 hardware, and keeping the number of expensive
                 multicycle fence instructions required by weak ordering
                 to a minimum. We designed and implemented a collector
                 facing these demands building on the mostly concurrent
                 garbage collector proposed by Boehm et al. [1991]. Our
                 collector incorporates new ideas into the original
                 collector. We make it parallel and incremental; we
                 employ concurrent low-priority background GC threads to
                 take advantage of processor idle time; we propose novel
                 algorithmic improvements to the basic mostly concurrent
                 algorithm improving its efficiency and shortening its
                 pause times; and finally, we use advanced techniques,
                 such as a low-overhead work packet mechanism to enable
                 full parallelism among the incremental and concurrent
                 collecting threads and ensure load balancing. We
                 compared the new collector to the mature,
                 well-optimized, parallel, stop-the-world mark-sweep
                 collector already in the IBM JVM. When allowed to run
                 aggressively, using 72\% of the CPU utilization during
                 a short concurrent phase, our collector prototype
                 reduces the maximum pause time from 161 ms to 46 ms
                 while only losing 11.5\% throughput when running the
                 SPECjbb2000 benchmark on a 600-MB heap on an 8-way
                 PowerPC 1.1-GHz processors. When the collector is
                 limited to a nonintrusive operation using only 29\% of
                 the CPU utilization, the maximum pause time obtained is
                 79 ms and the loss in throughput is 15.4\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

@Article{Basharahil:2005:DSA,
  author =       "Ramzi Basharahil and Brian Wims and Cheng-Zhong Xu and
                 Song Fu",
  title =        "Distributed Shared Arrays: An Integration of Message
                 Passing and Multithreading on {SMP} Clusters",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "31",
  number =       "2",
  pages =        "161--184",
  month =        feb,
  year =         "2005",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-005-0041-5",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Jul 6 10:36:19 MDT 2005",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=31&issue=2;
                 https://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=31&issue=2&spage=161",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Boehm:2005:TCI,
  author =       "Hans-J. Boehm",
  title =        "Threads cannot be implemented as a library",
  journal =      j-SIGPLAN,
  volume =       "40",
  number =       "6",
  pages =        "261--268",
  month =        jun,
  year =         "2005",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1065010.1065042",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 21 17:04:05 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In many environments, multi-threaded code is written
                 in a language that was originally designed without
                 thread support (e.g. C), to which a library of
                 threading primitives was subsequently added. There
                 appears to be a general understanding that this is not
                 the right approach. We provide specific arguments that
                 a pure library approach, in which the compiler is
                 designed independently of threading issues, cannot
                 guarantee correctness of the resulting code. We first
                 review why the approach almost works, and then examine
                 some of the surprising behavior it may entail. We
                 further illustrate that there are very simple cases in
                 which a pure library-based approach seems incapable of
                 expressing an efficient parallel algorithm. Our
                 discussion takes place in the context of C with
                 Pthreads, since it is commonly used, reasonably well
                 specified, and does not attempt to ensure type-safety,
                 which would entail even stronger constraints. The
                 issues we raise are not specific to that context.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "This is an important paper: it shows that current
                 languages cannot be reliable for threaded programming
                 without language changes that prevent compiler
                 optimizations from foiling synchronization methods and
                 memory barriers. The article's author and others are
                 collaborating on a proposal for changes to the C++
                 language to remedy this, but that still leaves threads
                 unreliable in C code, even with POSIX threads.",
}

@Article{Boroday:2005:DAJ,
  author =       "S. Boroday and A. Petrenko and J. Singh and H.
                 Hallal",
  title =        "Dynamic analysis of {Java} applications for
                 multithreaded antipatterns",
  journal =      j-SIGSOFT,
  volume =       "30",
  number =       "4",
  pages =        "1--7",
  month =        jul,
  year =         "2005",
  CODEN =        "SFENDP",
  DOI =          "https://doi.org/10.1145/1082983.1083247",
  ISSN =         "0163-5948 (print), 1943-5843 (electronic)",
  ISSN-L =       "0163-5948",
  bibdate =      "Wed Aug 1 17:14:51 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigsoft2000.bib",
  abstract =     "Formal verification is not always applicable to large
                 industrial software systems due to scalability issues
                 and difficulties in formal model and requirements
                 specification. The scalability and model derivation
                 problems could be alleviated by runtime trace analysis,
                 which combines both testing and formal verification. We
                 implement and compare an ad-hoc custom approach and a
                 formal approach to detect common bug patterns in
                 multithreaded Java software. We use the tracing
                 platform of the Eclipse IDE and state-of-the-art model
                 checker Spin.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGSOFT Software Engineering Notes",
  journal-URL =  "https://dl.acm.org/citation.cfm?id=J728",
}

@Article{Brinkschulte:2005:ICA,
  author =       "U. Brinkschulte and M. Pacher",
  title =        "Implementing Control Algorithms Within a Multithreaded
                 {Java} Microcontroller",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "3432",
  pages =        "33--49",
  year =         "2005",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 26 10:50:23 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Constantinou:2005:PIS,
  author =       "Theofanis Constantinou and Yiannakis Sazeides and
                 Pierre Michaud and Damien Fetis and Andre Seznec",
  title =        "Performance implications of single thread migration on
                 a chip multi-core",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "4",
  pages =        "80--91",
  month =        nov,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:08 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Flanagan:2005:MVM,
  author =       "Cormac Flanagan and Stephen N. Freund and Shaz Qadeer
                 and Sanjit A. Seshia",
  title =        "Modular verification of multithreaded programs",
  journal =      j-THEOR-COMP-SCI,
  volume =       "338",
  number =       "1--3",
  pages =        "153--183",
  day =          "10",
  month =        jun,
  year =         "2005",
  CODEN =        "TCSCDI",
  ISSN =         "0304-3975 (print), 1879-2294 (electronic)",
  ISSN-L =       "0304-3975",
  bibdate =      "Fri Jul 8 14:05:16 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/03043975",
  acknowledgement = ack-nhfb,
  fjournal =     "Theoretical Computer Science",
  journal-URL =  "http://www.sciencedirect.com/science/journal/03043975",
}

@TechReport{Garcia:2005:HJA,
  author =       "P. Garcia and H. F. Korth",
  title =        "Hash-join algorithms on modern multithreaded computer
                 architectures",
  type =         "Report",
  number =       "LUCSE-05-001",
  institution =  "Lehigh University",
  address =      "Bethlehem, PA, USA",
  month =        "????",
  year =         "2005",
  bibdate =      "Mon Dec 10 07:05:38 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Giampapa:2005:BGA,
  author =       "M. E. Giampapa and R. Bellofatto and M. A. Blumrich
                 and D. Chen and M. B. Dombrowa and A. Gara and R. A.
                 Haring and P. Heidelberger and D. Hoenicke and G. V.
                 Kopcsay and B. J. Nathanson and B. D. Steinmacher-Burow
                 and M. Ohmacht and V. Salapura and P. Vranas",
  title =        "{Blue Gene/L} advanced diagnostics environment",
  journal =      j-IBM-JRD,
  volume =       "49",
  number =       "2/3",
  pages =        "319--331",
  month =        "????",
  year =         "2005",
  CODEN =        "IBMJAE",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Wed Jun 1 08:14:41 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.research.ibm.com/journal/",
  URL =          "http://www.research.ibm.com/journal/rd/492/giampapa.pdf",
  abstract =     "This paper describes the Blue Gene/L advanced
                 diagnostics environment (ADE) used throughout all
                 aspects of the Blue Gene/L project, including design,
                 logic verification, bringup, diagnostics, and
                 manufacturing test. The Blue Gene/L ADE consists of a
                 lightweight multithreaded coherence-managed kernel,
                 runtime libraries, device drivers, system programming
                 interfaces, compilers, and host-based development
                 tools. It provides complete and flexible access to all
                 features of the Blue Gene/L hardware. Prior to the
                 existence of hardware, ADE was used on Very high-speed
                 integrated circuit Hardware Description Language (VHDL)
                 models, not only for logic verification, but also for
                 performance measurements, code-path analysis, and
                 evaluation of architectural tradeoffs. During early
                 hardware bring-up, the ability to run in a
                 cycle-reproducible manner on both hardware and VHDL
                 proved invaluable in fault isolation and analysis.
                 However, ADE is also capable of supporting
                 high-performance applications and parallel test cases,
                 thereby permitting us to stress the hardware to the
                 limits of its capabilities. This paper also provides
                 insights into system-level and device-level programming
                 of Blue Gene/L to assist developers of high-performance
                 applications to more fully exploit the performance of
                 the machine.",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Journal of Research and Development",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
  ordernumber =  "G322-0240",
}

@Article{Gil:2005:TCS,
  author =       "Marisa Gil and Ruben Pinilla",
  title =        "Thread coloring: a scheduler proposal from user to
                 hardware threads",
  journal =      j-OPER-SYS-REV,
  volume =       "39",
  number =       "2",
  pages =        "54--70",
  month =        apr,
  year =         "2005",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Gustafsson:2005:TP,
  author =       "Andreas Gustafsson",
  title =        "Threads without the pain",
  journal =      j-QUEUE,
  volume =       "3",
  number =       "9",
  pages =        "42--47",
  month =        nov,
  year =         "2005",
  CODEN =        "AQCUAE",
  ISSN =         "1542-7730 (print), 1542-7749 (electronic)",
  ISSN-L =       "1542-7730",
  bibdate =      "Sat Dec 17 07:37:28 MST 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Queue: Tomorrow's Computing Today",
}

@Article{Keller:2005:TBV,
  author =       "J{\"o}rg Keller and Andreas Gr{\"a}vinghoff",
  title =        "Thread-Based Virtual Duplex Systems in Embedded
                 Environments",
  journal =      j-IEEE-MICRO,
  volume =       "25",
  number =       "2",
  pages =        "60--69",
  month =        mar # "\slash " # apr,
  year =         "2005",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2005.39",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Wed Apr 20 08:11:29 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/comp/mags/mi/2005/02/m2060abs.htm;
                 http://csdl.computer.org/dl/mags/mi/2005/02/m2060.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

@Article{Kongetira:2005:NWM,
  author =       "Poonacha Kongetira and Kathirgamar Aingaran and Kunle
                 Olukotun",
  title =        "{Niagara}: a 32-Way Multithreaded {Sparc} Processor",
  journal =      j-IEEE-MICRO,
  volume =       "25",
  number =       "2",
  pages =        "21--29",
  month =        mar # "\slash " # apr,
  year =         "2005",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2005.35",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Wed Apr 20 08:11:29 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/comp/mags/mi/2005/02/m2021abs.htm;
                 http://csdl.computer.org/dl/mags/mi/2005/02/m2021.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

@Article{Li:2005:OSA,
  author =       "Xiaoye S. Li",
  title =        "An overview of {SuperLU}: {Algorithms},
                 implementation, and user interface",
  journal =      j-TOMS,
  volume =       "31",
  number =       "3",
  pages =        "302--325",
  month =        sep,
  year =         "2005",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/1089014.1089017",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Wed Oct 5 07:43:35 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We give an overview of the algorithms, design
                 philosophy, and implementation techniques in the
                 software SuperLU, for solving sparse unsymmetric linear
                 systems. In particular, we highlight the differences
                 between the sequential SuperLU (including its
                 multithreaded extension) and parallel SuperLU_DIST.
                 These include the numerical pivoting strategy, the
                 ordering strategy for preserving sparsity, the ordering
                 in which the updating tasks are performed, the
                 numerical kernel, and the parallelization strategy.
                 Because of the scalability concern, the parallel code
                 is drastically different from the sequential one. We
                 describe the user interfaces of the libraries, and
                 illustrate how to use the libraries most efficiently
                 depending on some matrix characteristics. Finally, we
                 give some examples of how the solver has been used in
                 large-scale scientific applications, and the
                 performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Loepere:2005:STM,
  author =       "Keith Loepere",
  title =        "Stackable thread mechanisms",
  journal =      j-OPER-SYS-REV,
  volume =       "39",
  number =       "4",
  pages =        "4--17",
  month =        oct,
  year =         "2005",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Mathis:2005:CSM,
  author =       "H. M. Mathis and A. E. Mericas and J. D. McCalpin and
                 R. J. Eickemeyer and S. R. Kunkel",
  title =        "Characterization of simultaneous multithreading
                 ({SMT}) efficiency in {POWER5}",
  journal =      j-IBM-JRD,
  volume =       "49",
  number =       "4/5",
  pages =        "555--564",
  month =        "????",
  year =         "2005",
  CODEN =        "IBMJAE",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Wed Oct 5 07:12:31 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.research.ibm.com/journal/",
  URL =          "http://www.research.ibm.com/journal/rd/494/mathis.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Journal of Research and Development",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
}

@Article{McNairy:2005:MDC,
  author =       "Cameron McNairy and Rohit Bhatia",
  title =        "{Montecito}: a Dual-Core, Dual-Thread {Itanium}
                 Processor",
  journal =      j-IEEE-MICRO,
  volume =       "25",
  number =       "2",
  pages =        "10--20",
  month =        mar # "\slash " # apr,
  year =         "2005",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2005.34",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Wed Apr 20 08:11:29 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/comp/mags/mi/2005/02/m2010abs.htm;
                 http://csdl.computer.org/dl/mags/mi/2005/02/m2010.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

@Article{Mudigonda:2005:MMA,
  author =       "Jayaram Mudigonda and Harrick M. Vin and Raj
                 Yavatkar",
  title =        "Managing memory access latency in packet processing",
  journal =      j-SIGMETRICS,
  volume =       "33",
  number =       "1",
  pages =        "396--397",
  month =        jun,
  year =         "2005",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1064212.1064272",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Fri Jun 27 09:21:27 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In this study, we refute the popular belief [1,2] that
                 packet processing does not benefit from data-caching.
                 We show that a small data-cache of 8KB can bring down
                  the packet processing time by as much as 50--90\%,
                  while reducing the off-chip memory bandwidth usage by
                  about 60--95\%. We also show that, unlike general-purpose
                 computing, packet processing, due to its
                 memory-intensive nature, cannot rely exclusively on
                 data-caching to eliminate the memory bottleneck
                 completely.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
  keywords =     "data-caches; multithreading; network processors",
}

@Article{Petric:2005:EEP,
  author =       "Vlad Petric and Amir Roth",
  title =        "Energy-Effectiveness of Pre-Execution and Energy-Aware
                 {P}-Thread Selection",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "322--333",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Ruan:2005:EIS,
  author =       "Yaoping Ruan and Vivek S. Pai and Erich Nahum and John
                 M. Tracey",
  title =        "Evaluating the impact of simultaneous multithreading
                 on network servers using real hardware",
  journal =      j-SIGMETRICS,
  volume =       "33",
  number =       "1",
  pages =        "315--326",
  month =        jun,
  year =         "2005",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1071690.1064254",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Fri Jun 27 09:21:27 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper examines the performance of simultaneous
                 multithreading (SMT) for network servers using actual
                 hardware, multiple network server applications, and
                 several workloads. Using three versions of the Intel
                 Xeon processor with Hyper-Threading, we perform
                 macroscopic analysis as well as microarchitectural
                 measurements to understand the origins of the
                 performance bottlenecks for SMT processors in these
                 environments. The results of our evaluation suggest
                 that the current SMT support in the Xeon is application
                 and workload sensitive, and may not yield significant
                 benefits for network servers. In general, we find that
                 enabling SMT on real hardware usually produces only
                 slight performance gains, and can sometimes lead to
                 performance loss. In the uniprocessor case, previous
                 studies appear to have neglected the OS overhead in
                 switching from a uniprocessor kernel to an SMT-enabled
                 kernel. The performance loss associated with such
                 support is comparable to the gains provided by SMT. In
                 the 2-way multiprocessor case, the higher number of
                 memory references from SMT often causes the memory
                 system to become the bottleneck, offsetting any
                 processor utilization gains. This effect is compounded
                 by the growing gap between processor speeds and memory
                 latency. In trying to understand the large gains shown
                 by simulation studies, we find that while the general
                 trends for microarchitectural behavior agree with real
                 hardware, differences in sizing assumptions and
                 performance models yield much more optimistic benefits
                 for SMT than we observe.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
  keywords =     "network server; simultaneous multithreading (SMT)",
}

@Article{Rufai:2005:MPO,
  author =       "Raimi Rufai and Muslim Bozyigit and Jaralla Alghamdi
                 and Moataz Ahmed",
  title =        "Multithreaded Parallelism with {OpenMP}",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "15",
  number =       "4",
  pages =        "367--378",
  month =        dec,
  year =         "2005",
  CODEN =        "PPLTEE",
  DOI =          "https://doi.org/10.1142/S0129626405002283",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Thu Sep 2 09:08:11 MDT 2010",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Sendag:2005:IIS,
  author =       "Resit Sendag and Ying Chen and David J. Lilja",
  title =        "The Impact of Incorrectly Speculated Memory Operations
                 in a Multithreaded Architecture",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "16",
  number =       "3",
  pages =        "271--285",
  month =        mar,
  year =         "2005",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2005.36",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Nov 10 08:30:29 MST 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Shinjo:2005:AEP,
  author =       "Y. Shinjo and C. Pu",
  title =        "Achieving efficiency and portability in systems
                 software: a case study on {POSIX}-compliant
                 multithreaded programs",
  journal =      j-IEEE-TRANS-SOFTW-ENG,
  volume =       "31",
  number =       "9",
  pages =        "785--800",
  month =        sep,
  year =         "2005",
  CODEN =        "IESEDJ",
  DOI =          "https://doi.org/10.1109/TSE.2005.98",
  ISSN =         "0098-5589 (print), 1939-3520 (electronic)",
  ISSN-L =       "0098-5589",
  bibdate =      "Thu Feb 1 11:00:42 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/unix.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1514446",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Software Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32",
}

@Article{Stark:2005:FSV,
  author =       "Robert F. St{\"a}rk",
  title =        "Formal specification and verification of the {C\#}
                 thread model",
  journal =      j-THEOR-COMP-SCI,
  volume =       "343",
  number =       "3",
  pages =        "482--508",
  day =          "17",
  month =        oct,
  year =         "2005",
  CODEN =        "TCSCDI",
  ISSN =         "0304-3975 (print), 1879-2294 (electronic)",
  ISSN-L =       "0304-3975",
  bibdate =      "Tue Mar 29 06:48:50 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/03043975",
  abstract =     "We present a high-level Abstract State Machine (ASM)
                 model of C\# threads and the .NET memory model. We
                 focus on purely managed, fully portable threading
                 features of C\#. The sequential model interleaves the
                 computation steps of the currently running threads and
                 is suitable for uniprocessors. The parallel model
                 addresses problems of true concurrency on
                 multi-processor systems. The models provide a sound
                 basis for the development of multi-threaded
                 applications in C\#. The thread and memory models
                 complete the abstract operational semantics of C\# in
                 [B{\"o}rger et al. Theoret. Comput. Sci., to appear].
                 The main invariants of the thread model concerning
                 locks, monitors and mutual exclusion are formally
                 verified in the AsmTP system, an interactive proof
                 assistant based on ASM logic.",
  acknowledgement = ack-nhfb,
  fjournal =     "Theoretical Computer Science",
  journal-URL =  "http://www.sciencedirect.com/science/journal/03043975",
}

@Article{Steinke:2005:NPF,
  author =       "Robert Steinke and Micah Clark and Elihu McMahon",
  title =        "A new pattern for flexible worker threads with
                 in-place consumption message queues",
  journal =      j-OPER-SYS-REV,
  volume =       "39",
  number =       "2",
  pages =        "71--73",
  month =        apr,
  year =         "2005",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Sundell:2005:FLF,
  author =       "H{\aa}kan Sundell and Philippas Tsigas",
  title =        "Fast and lock-free concurrent priority queues for
                 multi-thread systems",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "65",
  number =       "5",
  pages =        "609--627",
  month =        may,
  year =         "2005",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Jul 11 20:32:33 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Tian:2005:PCT,
  author =       "Xinmin Tian and Milind Girkar and Aart Bik and Hideki
                 Saito",
  title =        "Practical Compiler Techniques on Efficient
                 Multithreaded Code Generation for {OpenMP} Programs",
  journal =      j-COMP-J,
  volume =       "48",
  number =       "5",
  pages =        "588--601",
  month =        sep,
  year =         "2005",
  CODEN =        "CMPJA6",
  DOI =          "https://doi.org/10.1093/comjnl/bxh109",
  ISSN =         "0010-4620 (print), 1460-2067 (electronic)",
  ISSN-L =       "0010-4620",
  bibdate =      "Tue Nov 8 05:58:50 MST 2005",
  bibsource =    "http://comjnl.oxfordjournals.org/content/vol48/issue5/index.dtl;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://comjnl.oxfordjournals.org/cgi/content/abstract/48/5/588;
                 http://comjnl.oxfordjournals.org/cgi/reprint/48/5/588",
  acknowledgement = ack-nhfb,
  fjournal =     "The Computer Journal",
  journal-URL =  "http://comjnl.oxfordjournals.org/",
}

@Article{Vachharajani:2005:CMP,
  author =       "Neil Vachharajani and Matthew Iyer and Chinmay Ashok
                 and Manish Vachharajani and David I. August and Daniel
                 Connors",
  title =        "Chip multi-processor scalability for single-threaded
                 applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "4",
  pages =        "44--53",
  month =        nov,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:08 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Abadi:2006:TSL,
  author =       "Martin Abadi and Cormac Flanagan and Stephen N.
                 Freund",
  title =        "Types for safe locking: {Static} race detection for
                 {Java}",
  journal =      j-TOPLAS,
  volume =       "28",
  number =       "2",
  pages =        "207--255",
  month =        mar,
  year =         "2006",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/1119479.1119480",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Fri Mar 10 18:46:58 MST 2006",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This article presents a static race-detection analysis
                 for multithreaded shared-memory programs, focusing on
                 the Java programming language. The analysis is based on
                 a type system that captures many common synchronization
                 patterns. It supports classes with internal
                 synchronization, classes that require client-side
                 synchronization, and thread-local classes. In order to
                 demonstrate the effectiveness of the type system, we
                 have implemented it in a checker and applied it to over
                 40,000 lines of hand-annotated Java code. We found a
                 number of race conditions in the standard Java
                 libraries and other test programs. The checker required
                 fewer than 20 additional type annotations per 1,000
                 lines of code. This article also describes two
                 improvements that facilitate checking much larger
                 programs: an algorithm for annotation inference and a
                 user interface that clarifies warnings generated by the
                 checker. These extensions have enabled us to use the
                 checker for identifying race conditions in large-scale
                 software systems with up to 500,000 lines of code.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

@TechReport{Aciicmez:2006:PSB,
  author =       "Onur Acii{\c{c}}mez and {\c{C}}etin Kaya Ko{\c{c}} and
                 Jean-Pierre Seifert",
  title =        "On the Power of Simple Branch Prediction Analysis",
  type =         "Technical report",
  institution =  "School of EECS, Oregon State University",
  address =      "Corvallis, OR 97331, USA",
  month =        oct,
  year =         "2006",
  bibdate =      "Mon Nov 20 14:57:23 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://eprint.iacr.org/2006/351;
                 http://eprint.iacr.org/2006/351.pdf",
  abstract =     "Very recently, a new software side-channel attack,
                 called Branch Prediction Analysis (BPA) attack, has
                 been discovered and also demonstrated to be practically
                 feasible on popular commodity PC platforms. While the
                 above recent attack still had the flavor of a classical
                 timing attack against RSA, where one uses many
                 execution-time measurements under the same key in order
                 to statistically amplify some small but key-dependent
                 timing differences, we dramatically improve upon the
                 former result. We prove that a carefully written
                 spy-process running simultaneously with an RSA-process,
                 is able to collect during one \emph{single} RSA signing
                 execution almost all of the secret key bits. We call
                 such an attack, analyzing the CPU's Branch Predictor
                 states through spying on a single quasi-parallel
                 computation process, a \emph{Simple Branch Prediction
                 Analysis (SBPA)} attack --- sharply differentiating it
                 from those one relying on statistical methods and
                 requiring many computation measurements under the same
                 key. The successful extraction of almost all secret key
                 bits by our SBPA attack against an openSSL RSA
                 implementation proves that the often recommended
                 blinding or so called randomization techniques to
                 protect RSA against side-channel attacks are, in the
                 context of SBPA attacks, totally useless. Additional to
                 that very crucial security implication, targeted at
                 such implementations which are assumed to be at least
                 statistically secure, our successful SBPA attack also
                 bears another equally critical security implication.
                 Namely, in the context of simple side-channel attacks,
                 it is widely believed that equally balancing the
                 operations after branches is a secure countermeasure
                 against such simple attacks. Unfortunately, this is not
                 true, as even such ``balanced branch'' implementations
                 can be completely broken by our SBPA attacks. Moreover,
                 despite sophisticated hardware-assisted partitioning
                 methods such as memory protection, sandboxing or even
                 virtualization, SBPA attacks empower an unprivileged
                 process to successfully attack other processes running
                 in parallel on the same processor. Thus, we conclude
                 that SBPA attacks are much more dangerous than
                 previously anticipated, as they obviously do not belong
                 to the same category as pure timing attacks.",
  acknowledgement = ack-nhfb,
  keywords =     "implementation / Branch Prediction; Modular
                 Exponentiation; RSA; Side Channel Analysis;
                 Simultaneous Multithreading; Trusted Computing",
}

@Article{Adl-Tabatabai:2006:CRS,
  author =       "Ali-Reza Adl-Tabatabai and Brian T. Lewis and Vijay
                 Menon and Brian R. Murphy and Bratin Saha and Tatiana
                 Shpeisman",
  title =        "Compiler and runtime support for efficient software
                 transactional memory",
  journal =      j-SIGPLAN,
  volume =       "41",
  number =       "6",
  pages =        "26--37",
  month =        jun,
  year =         "2006",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1133981.1133985",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:42:48 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Programmers have traditionally used locks to
                 synchronize concurrent access to shared data.
                 Lock-based synchronization, however, has well-known
                 pitfalls: using locks for fine-grain synchronization
                 and composing code that already uses locks are both
                 difficult and prone to deadlock. Transactional memory
                 provides an alternate concurrency control mechanism
                 that avoids these pitfalls and significantly eases
                 concurrent programming. Transactional memory language
                 constructs have recently been proposed as extensions to
                 existing languages or included in new concurrent
                 language specifications, opening the door for new
                 compiler optimizations that target the overheads of
                 transactional memory. This paper presents compiler and
                 runtime optimizations for transactional memory language
                 constructs. We present a high-performance software
                 transactional memory system (STM) integrated into a
                 managed runtime environment. Our system efficiently
                 implements nested transactions that support both
                 composition of transactions and partial roll back. Our
                 JIT compiler is the first to optimize the overheads of
                 STM, and we show novel techniques for enabling JIT
                 optimizations on STM operations. We measure the
                 performance of our optimizations on a 16-way SMP
                 running multi-threaded transactional workloads. Our
                 results show that these techniques enable transactional
                 memory's performance to compete with that of well-tuned
                 synchronization.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "code generation; compiler optimizations; locking;
                 synchronization; transactional memory; virtual
                 machines",
}

@Article{Agerwala:2006:SRC,
  author =       "T. Agerwala and M. Gupta",
  title =        "Systems research challenges: a scale-out perspective",
  journal =      j-IBM-JRD,
  volume =       "50",
  number =       "2/3",
  pages =        "173--??",
  month =        mar # " \slash " # may,
  year =         "2006",
  CODEN =        "IBMJAE",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Fri Feb 9 20:16:31 MST 2007",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.research.ibm.com/journal/",
  URL =          "http://www.research.ibm.com/journal/rd/502/agerwala.html",
  abstract =     "A scale-out system is a collection of interconnected,
                 modular, low-cost computers that work as a single
                 entity to cooperatively provide applications, systems
                 resources, and data to users. The dominant programming
                 model for such systems consists of message passing at
                 the systems level and multithreading at the element
                 level. Scale-out computers have traditionally been
                 developed and deployed to provide levels of performance
                 (throughput and parallel processing) beyond what was
                 achievable by large shared-memory computers that
                 utilized the fastest processors and the most expensive
                 memory systems. Today, exploiting scale-out at all
                 levels in systems is becoming imperative in order to
                 overcome a fundamental discontinuity in the development
                 of microprocessor technology caused by power
                 dissipation. The pervasive use of greater levels of
                 scale-out, on the other hand, creates its own
                 challenges in architecture, programming, systems
                 management, and reliability. This position paper
                 identifies some of the important research problems that
                 must be addressed in order to deal with the technology
                 disruption and fully realize the opportunity offered by
                 scale-out. Our examples are based on parallelism, but
                 the challenges we identify apply to scale-out more
                 generally.",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Journal of Research and Development",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
  ordernumber =  "G322-0247-00",
}

@Article{Bacon:2006:BFL,
  author =       "D. F. Bacon and X. Shen",
  title =        "Braids and fibers: Language constructs with
                 architectural support for adaptive responses to memory
                 latencies",
  journal =      j-IBM-JRD,
  volume =       "50",
  number =       "2/3",
  pages =        "209--??",
  month =        mar # " \slash " # may,
  year =         "2006",
  CODEN =        "IBMJAE",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Fri Feb 9 20:16:31 MST 2007",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.research.ibm.com/journal/",
  URL =          "http://www.research.ibm.com/journal/rd/502/bacon.html",
  abstract =     "As processor speeds continue to increase at a much
                 higher rate than memory speeds, memory latencies may
                 soon approach a thousand processor cycles. As a result,
                 the flat memory model that was made practical by deeply
                 pipelined superscalar processors with multilevel caches
                 will no longer be tenable. The most common approach to
                 this problem is multithreading; however, multithreading
                 requires either abundant independent applications or
                 well-parallelized monolithic applications, and neither
                 is easy to come by. We present high-level programming
                 constructs called braids and fibers. The programming
                 constructs facilitate the creation of programs that are
                 partially ordered, in which the partial orders can be
                 used to support adaptive responses to memory access
                 latencies. Braiding is simpler than parallelizing,
                 while yielding many of the same benefits. We show how
                 the programming constructs can be effectively supported
                 with simple instruction set architecture extensions and
                 microarchitectural enhancements. We have developed
                 braided versions of a number of important algorithms.
                 The braided code is easy to understand at the source
                 level and can be translated into highly efficient
                 instructions using our architecture extensions.",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Journal of Research and Development",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
  ordernumber =  "G322-0247-00",
}

@Article{Basile:2006:ARM,
  author =       "Claudio Basile and Zbigniew Kalbarczyk and Ravishankar
                 K. Iyer",
  title =        "Active Replication of Multithreaded Applications",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "17",
  number =       "5",
  pages =        "448--465",
  month =        may,
  year =         "2006",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2006.56",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Jul 3 14:26:49 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/comp/trans/td/2006/05/l0448s.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Blundell:2006:AGT,
  author =       "Colin Blundell and Dimitra Giannakopoulou and Corina
                 S. P{\u{a}}s{\u{a}}reanu",
  title =        "Assume-guarantee testing",
  journal =      j-SIGSOFT,
  volume =       "31",
  number =       "2",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2006",
  CODEN =        "SFENDP",
  DOI =          "https://doi.org/10.1145/1108768.1123060",
  ISSN =         "0163-5948 (print), 1943-5843 (electronic)",
  ISSN-L =       "0163-5948",
  bibdate =      "Wed Aug 1 17:15:15 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigsoft2000.bib",
  abstract =     "Verification techniques for component-based systems
                 should ideally be able to predict properties of the
                 assembled system through analysis of individual
                 components before assembly. This work introduces such a
                 modular technique in the context of testing.
                 Assume-guarantee testing relies on the (automated)
                 decomposition of key system-level requirements into
                 local component requirements at design time. Developers
                 can verify the local requirements by checking
                 components in isolation; failed checks may indicate
                 violations of system requirements, while valid traces
                 from different components compose via the
                 assume-guarantee proof rule to potentially provide
                 system coverage. These local requirements also form the
                 foundation of a technique for efficient predictive
                 testing of assembled systems: given a correct system
                 run, this technique can predict violations by
                 alternative system runs without constructing those
                 runs. We discuss the application of our approach to
                 testing a multi-threaded NASA application, where we
                 treat threads as components.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM SIGSOFT Software Engineering Notes",
  journal-URL =  "https://dl.acm.org/citation.cfm?id=J728",
}

@Article{Blundell:2006:STM,
  author =       "C. Blundell and E. C. Lewis and M. M. K. Martin",
  title =        "Subtleties of transactional memory atomicity
                 semantics",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "2",
  pages =        "17--17",
  month =        feb,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.18",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Transactional memory has great potential for
                 simplifying multithreaded programming by allowing
                 programmers to specify regions of the program that must
                 appear to execute atomically. Transactional memory
                 implementations then optimistically execute these
                 transactions concurrently to obtain high performance.
                 This work shows that the same atomic guarantees that
                 give transactions their power also have unexpected and
                 potentially serious negative effects on programs that
                 were written assuming narrower scopes of atomicity. We
                 make four contributions: (1) we show that a direct
                 translation of lock-based critical sections into
                 transactions can introduce deadlock into otherwise
                 correct programs, (2) we introduce the terms strong
                 atomicity and weak atomicity to describe the
                 interaction of transactional and non-transactional
                 code, (3) we show that code that is correct under weak
                 atomicity can deadlock under strong atomicity, and (4)
                 we demonstrate that sequentially composing
                 transactional code can also introduce deadlocks. These
                 observations invalidate the intuition that transactions
                 are strictly safer than lock-based critical sections,
                 that strong atomicity is strictly safer than weak
                 atomicity, and that transactions are always
                 composable.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer languages; Computer Systems Organization;
                 Concurrent distributed and parallel languages;
                 deadlock; direct translation; Hardware; Information
                 science; Interference; Interleaved codes; Language
                 Classifications; Law; lock-based critical sections;
                 Multi-core/single-chip multiprocessors;
                 multi-threading; Multiple Data Stream Architectures
                 (Multiprocessors); multithreaded programming;
                 nontransactional code; operating systems (computers);
                 Parallel Architectures; Processor Architectures;
                 program verification; Programming Languages;
                 Programming profession; sequentially composing
                 transactional code; Software performance;
                 Software/Software Engineering; strong atomicity; System
                 recovery; Transaction databases; transaction
                 processing; transactional memory atomicity semantics;
                 weak atomicity",
}

@Article{Bracy:2006:DAC,
  author =       "A. Bracy and K. Doshi and Q. Jacobson",
  title =        "Disintermediated Active Communication",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "2",
  pages =        "15--15",
  month =        feb,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.15",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Disintermediated active communication (DAC) is a new
                 paradigm of communication in which a sending thread
                 actively engages a receiving thread when sending it a
                 message via shared memory. DAC is different than
                 existing approaches that use passive communication
                 through shared-memory --- based on intermittently
                 checking for messages --- or that use preemptive
                 communication but must rely on intermediaries such as
                 the operating system or dedicated interrupt channels.
                 An implementation of DAC builds on existing cache
                 coherency support and exploits light-weight user-level
                 interrupts. Inter-thread communication occurs via
                 monitored memory locations where the receiver thread
                 responds to invalidations of monitored addresses with a
                 light-weight user-level software-defined handler.
                 Address monitoring is supported by cache line
                 user-bits, or CLUbits. CLUbits reside in the cache next
                 to the coherence state, are private per thread, and
                 maintain user-defined per-cache-line state. A light
                 weight software library can demultiplex asynchronous
                 notifications and handle exceptional cases. In
                 DAC-based programs threads coordinate with one another
                 by explicit signaling and implicit resource monitoring.
                 With the simple and direct communication primitives of
                 DAC, multi-threaded workloads synchronize at a finer
                 granularity and more efficiently utilize the hardware
                 of upcoming multi-core designs. This paper introduces
                 DAC, presents several signaling models for DAC-based
                 programs, and describes a simple memory-based framework
                 that supports DAC by leveraging existing
                 cache-coherency models. Our framework is general enough
                 to support uses beyond DAC.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "address monitoring; cache coherency; cache line
                 user-bits; cache storage; CLUbits; Computer aided
                 instruction; Concurrent computing; disintermediated
                 active communication; Hardware; High performance
                 computing; interrupts; interthread communication;
                 memory locations; Monitoring; multi-threading;
                 multicore designs; Operating systems; Processor
                 scheduling; Programming profession; resource
                 monitoring; shared memory; shared memory systems;
                 signaling models; software libraries; Software
                 libraries; software library; storage allocation;
                 user-level interrupts",
}

@Article{Brzuszek:2006:MTS,
  author =       "Marcin Brzuszek and Andrzej Daniluk",
  title =        "Multithreaded transactions in scientific computing:
                 New versions of a computer program for kinematical
                 calculations of {RHEED} intensity oscillations",
  journal =      j-COMP-PHYS-COMM,
  volume =       "175",
  number =       "10",
  pages =        "678--681",
  day =          "15",
  month =        nov,
  year =         "2006",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2006.06.013",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Mon Feb 13 23:42:10 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465506002979",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Cerin:2006:MSS,
  author =       "Christophe C{\'e}rin and Jean-Luc Gaudiot and Michel
                 Koskas",
  title =        "A Multithreaded {SQL} Service",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "16",
  number =       "2",
  pages =        "245--259",
  month =        jun,
  year =         "2006",
  CODEN =        "PPLTEE",
  DOI =          "https://doi.org/10.1142/S0129626406002605",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  bibdate =      "Thu Sep 2 09:08:11 MDT 2010",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Chakraborty:2006:CSE,
  author =       "Koushik Chakraborty and Philip M. Wells and Gurindar
                 S. Sohi",
  title =        "Computation spreading: employing hardware migration to
                 specialize {CMP} cores on-the-fly",
  journal =      j-SIGPLAN,
  volume =       "41",
  number =       "11",
  pages =        "283--292",
  month =        nov,
  year =         "2006",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1168919.1168893",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:49:40 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In canonical parallel processing, the operating system
                 (OS) assigns a processing core to a single thread from
                 a multithreaded server application. Since different
                 threads from the same application often carry out
                 similar computation, albeit at different times, we
                 observe extensive code reuse among different
                 processors, causing redundancy (e.g., in our server
                 workloads, 45-65\% of all instruction blocks are
                 accessed by all processors). Moreover, largely
                 independent fragments of computation compete for the
                 same private resources causing destructive
                 interference. Together, this redundancy and
                 interference lead to poor utilization of private
                 microarchitecture resources such as caches and branch
                 predictors. We present Computation Spreading (CSP),
                 which employs hardware migration to distribute a
                 thread's dissimilar fragments of computation across the
                 multiple processing cores of a chip multiprocessor
                 (CMP), while grouping similar computation fragments
                 from different threads together. This paper focuses on
                 a specific example of CSP for OS intensive server
                 applications: separating application level (user)
                 computation from the OS calls it makes. When performing
                 CSP, each core becomes temporally specialized to
                 execute certain computation fragments, and the same
                 core is repeatedly used for such fragments. We examine
                 two specific thread assignment policies for CSP, and
                 show that these policies, across four server workloads,
                 are able to reduce instruction misses in private L2
                 caches by 27-58\%, private L2 load misses by 0-19\%,
                 and branch mispredictions by 9-25\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "cache locality; dynamic specialization",
}

@Article{Chuang:2006:UPB,
  author =       "Weihaw Chuang and Satish Narayanasamy and Ganesh
                 Venkatesh and Jack Sampson and Michael {Van Biesbrouck}
                 and Gilles Pokam and Brad Calder and Osvaldo Colavin",
  title =        "Unbounded page-based transactional memory",
  journal =      j-SIGPLAN,
  volume =       "41",
  number =       "11",
  pages =        "347--358",
  month =        nov,
  year =         "2006",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1168918.1168901",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:49:40 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Exploiting thread level parallelism is paramount in
                 the multicore era. Transactions enable programmers to
                 expose such parallelism by greatly simplifying the
                 multi-threaded programming model. Virtualized
                 transactions (unbounded in space and time) are
                 desirable, as they can increase the scope of
                 transactions' use, and thereby further simplify a
                 programmer's job. However, hardware support is
                 essential to support efficient execution of unbounded
                 transactions. In this paper, we introduce Page-based
                 Transactional Memory to support unbounded transactions.
                 We combine transaction bookkeeping with the virtual
                 memory system to support fast transaction conflict
                 detection, commit, abort, and to maintain transactions'
                 speculative data.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "concurrency; parallel programming; transactional
                 memory; transactions; virtual memory",
}

@Article{Ergin:2006:ENV,
  author =       "O. Ergin and O. Unsal and X. Vera and A. Gonzalez",
  title =        "Exploiting Narrow Values for Soft Error Tolerance",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "2",
  pages =        "12--12",
  month =        feb,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.12",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Soft errors are an important challenge in contemporary
                 microprocessors. Particle hits on the components of a
                 processor are expected to create an increasing number
                 of transient errors with each new microprocessor
                 generation. In this paper we propose simple mechanisms
                 that effectively reduce the vulnerability to soft
                 errors in a processor. Our designs are generally
                 motivated by the fact that many of the produced and
                 consumed values in the processors are narrow and their
                 upper order bits are meaningless. Soft errors caused by
                 any particle strike to these higher order bits can be
                 avoided by simply identifying these narrow values.
                 Alternatively soft errors can be detected or corrected
                 on the narrow values by replicating the vulnerable
                 portion of the value inside the storage space provided
                 for the upper order bits of these operands. We offer a
                 variety of schemes that make use of narrow values and
                 analyze their efficiency in reducing soft error
                 vulnerability of level-1 data cache of the processor.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache storage; Cache storage; contemporary
                 microprocessors; data cache; Data Cache; Error
                 correction; error correction; Error Correction; error
                 correction; error detection; Hardware; Impurities;
                 Manufacturing; microprocessor chips; Microprocessors;
                 Multithreading; Narrow Values; narrow values; Neutrons;
                 particle strike; Process design; radiation effects;
                 Random access memory; soft error tolerance; Soft
                 Errors; system recovery; transient errors; transients",
}

@Article{Factor:2006:PID,
  author =       "Michael Factor and Assaf Schuster and Konstantin
                 Shagin",
  title =        "A Platform-Independent Distributed Runtime for
                 Standard Multithreaded {Java}",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "34",
  number =       "2",
  pages =        "113--142",
  month =        apr,
  year =         "2006",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-006-0007-0",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 9 16:05:55 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=34&issue=2;
                 https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=34&issue=2&spage=113",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  keywords =     "bytecode instrumentation; distributed computing;
                 distributed shared memory; Java",
}

@Article{Gomez:2006:SCM,
  author =       "Juan Carlos Gomez and Vernon Rego and V. S. Sunderam",
  title =        "Scheduling communication in multithreaded programs:
                 experimental results",
  journal =      j-CCPE,
  volume =       "18",
  number =       "1",
  pages =        "1--28",
  month =        jan,
  year =         "2006",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.904",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:00 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "13 Sep 2005",
}

@Article{Gomez:2006:STC,
  author =       "Juan Carlos Gomez and Jorge R. Ramos and Vernon Rego",
  title =        "Signals, timers, and continuations for multithreaded
                 user-level protocols",
  journal =      j-SPE,
  volume =       "36",
  number =       "5",
  pages =        "449--471",
  day =          "25",
  month =        apr,
  year =         "2006",
  CODEN =        "SPEXBL",
  DOI =          "https://doi.org/10.1002/spe.700",
  ISSN =         "0038-0644 (print), 1097-024X (electronic)",
  ISSN-L =       "0038-0644",
  bibdate =      "Wed Oct 17 18:33:12 MDT 2007",
  bibsource =    "http://www.interscience.wiley.com/jpages/0038-0644;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Software---Practice and Experience",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
  onlinedate =   "19 Jan 2006",
}

%%% International Journal of Parallel Programming 34(4) article on the SAC
%%% functional array language; keywords are supplied by the source.
@Article{Grelck:2006:SFA,
  author =       "Clemens Grelck and Sven-Bodo Scholz",
  title =        "{SAC} --- a Functional Array Language for Efficient
                 Multi-threaded Execution",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "34",
  number =       "4",
  pages =        "383--427",
  month =        aug,
  year =         "2006",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-006-0018-x",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 9 16:06:07 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=34&issue=4;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=34&issue=4&spage=383",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  keywords =     "Compiler optimisation; data parallel programming;
                 multi-threading; Single Assignment C",
}

%%% NOTE(review): inserted missing spaces after sentence-ending periods in the
%%% quoted abstract (``implementation.The'' and ``effectiveness.We'').
@Article{Kaiser:2006:CJC,
  author =       "Claude Kaiser and Jean-Fran{\c{c}}ois Pradat-Peyre and
                 Sami {\'E}vangelista and Pierre Rousseau",
  title =        "Comparing {Java}, {C\#} and {Ada} monitors queuing
                 policies: a case study and its {Ada} refinement",
  journal =      j-SIGADA-LETTERS,
  volume =       "26",
  number =       "2",
  pages =        "23--37",
  month =        aug,
  year =         "2006",
  CODEN =        "AALEE5",
  DOI =          "https://doi.org/10.1145/1165678.1165681",
  ISSN =         "1094-3641 (print), 1557-9476 (electronic)",
  ISSN-L =       "1094-3641",
  bibdate =      "Tue Jun 17 09:16:14 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Learning concurrency paradigms is necessary but it is
                 not sufficient since the choice of run-time semantics
                 may introduce subtle programming errors. It is the aim
                 of this paper to exemplify the importance of process
                 queuing and awaking policies resulting from possible
                 choices of the monitor concept implementation. The
                 first part of the paper compares the behaviour of
                 concurrent processes sharing a unique waiting queue for
                 condition synchronization when implemented in Java or
                 in Ada. A particular solution of the dining
                 philosophers paradigm will be used to show how the
                 difference in the monitor semantics may lead or not to
                 deadlock. This comparison provides insight for deriving
                 a correct Java implementation. The second part of the
                 paper shows how the implementation can be refined when
                 using Ada entry families and requeue with requeue once
                 restriction. The result is elegant, safe and fair, and
                 deterministic. This paper ends with quantitative
                 comparisons of concurrency complexity and of
                 concurrency effectiveness. We conclude that Java and
                 C\# multithreading need defensive concurrent
                 programming while Ada allows more latitude for
                 developing correct concurrent programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGAda Ada Letters",
}

%%% ACM TOPLAS 28(5) article; abstract quoted from the ACM source.
@Article{Kim:2006:ERI,
  author =       "Seon Wook Kim and Chong-Liang Ooi and Rudolf Eigenmann
                 and Babak Falsafi and T. N. Vijaykumar",
  title =        "Exploiting reference idempotency to reduce speculative
                 storage overflow",
  journal =      j-TOPLAS,
  volume =       "28",
  number =       "5",
  pages =        "942--965",
  month =        sep,
  year =         "2006",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/1152649.1152653",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Wed Sep 6 07:13:55 MDT 2006",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Recent proposals for multithreaded architectures
                 employ speculative execution to allow threads with
                 unknown dependences to execute speculatively in
                 parallel. The architectures use hardware speculative
                 storage to buffer speculative data, track data
                 dependences and correct incorrect executions through
                 roll-backs. Because all memory references access the
                 speculative storage, current proposals implement
                 speculative storage using small memory structures to
                 achieve fast access. The limited capacity of the
                 speculative storage causes considerable performance
                 loss due to speculative storage overflow whenever a
                 thread's speculative state exceeds the speculative
                 storage capacity. Larger threads exacerbate the
                 overflow problem but are preferable to smaller threads,
                 as larger threads uncover more parallelism. In this
                 article, we discover a new program property called
                 memory reference idempotency. Idempotent references are
                 guaranteed to be eventually corrected, though the
                 references may be temporarily incorrect in the process
                 of speculation. Therefore, idempotent references, even
                 from nonparallelizable program sections, need not be
                 tracked in the speculative storage, and instead can
                 directly access nonspeculative storage (i.e.,
                 conventional memory hierarchy). Thus, we reduce the
                 demand for speculative storage space in large threads.
                 We define a formal framework for reference idempotency
                 and present a novel compiler-assisted speculative
                 execution model. We prove the necessary and sufficient
                 conditions for reference idempotency using our model.
                 We present a compiler algorithm to label idempotent
                 memory references for the hardware. Experimental
                 results show that for our benchmarks, over 60\% of the
                 references in nonparallelizable program sections are
                 idempotent.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

%%% NOTE(review): added the ISSN-L field (equal to the print ISSN), which every
%%% sibling entry in this file carries --- verify against the ISSN registry.
@Article{Kyriacou:2006:CCO,
  author =       "Costas Kyriacou and Paraskevas Evripidou and Pedro
                 Trancoso",
  title =        "{CacheFlow}: Cache Optimizations for Data Driven
                 Multithreading",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "16",
  number =       "2",
  pages =        "229--244",
  month =        jun,
  year =         "2006",
  CODEN =        "PPLTEE",
  DOI =          "https://doi.org/10.1142/S0129626406002599",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Thu Sep 2 09:08:11 MDT 2010",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

%%% IEEE TPDS 17(10) article by the same authors as Kyriacou:2006:CCO.
@Article{Kyriacou:2006:DDM,
  author =       "Costas Kyriacou and Paraskevas Evripidou and Pedro
                 Trancoso",
  title =        "Data-Driven Multithreading Using Conventional
                 Microprocessors",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "17",
  number =       "10",
  pages =        "1176--1188",
  month =        oct,
  year =         "2006",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2006.136",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Jul 3 14:26:50 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

%%% NOTE(review): replaced the stray period after ``Sciences'' in the
%%% institution field with a comma (department, then university).
@TechReport{Lee:2006:PT,
  author =       "Edward A. Lee",
  title =        "The Problem with Threads",
  type =         "Technical Report",
  number =       "UCB/EECS-2006-1",
  institution =  "Electrical Engineering and Computer Sciences,
                 University of California at Berkeley",
  address =      "Berkeley, CA, USA",
  day =          "10",
  month =        jan,
  year =         "2006",
  bibdate =      "Thu Oct 23 15:07:59 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.eecs.berkeley.edu/Pubs/TechRpts/2006/EECS-2006-1.html",
  abstract =     "Threads are a seemingly straightforward adaptation of
                 the dominant sequential model of computation to
                 concurrent systems. Languages require little or no
                 syntactic changes to support threads, and operating
                 systems and architectures have evolved to efficiently
                 support them. Many technologists are pushing for
                 increased use of multithreading in software in order to
                 take advantage of the predicted increases in
                 parallelism in computer architectures. In this paper, I
                 argue that this is not a good idea. Although threads
                 seem to be a small step from sequential computation, in
                 fact, they represent a huge step. They discard the most
                 essential and appealing properties of sequential
                 computation: understandability, predictability, and
                 determinism. Threads, as a model of computation, are
                 wildly nondeterministic, and the job of the programmer
                 becomes one of pruning that nondeterminism. Although
                 many research techniques improve the model by offering
                 more effective pruning, I argue that this is
                 approaching the problem backwards. Rather than pruning
                 nondeterminism, we should build from essentially
                 deterministic, composable components. Nondeterminism
                 should be explicitly and judiciously introduced where
                 needed, rather than removed where not needed. The
                 consequences of this principle are profound. I argue
                 for the development of concurrent coordination
                 languages based on sound, composable formalisms. I
                 believe that such languages will yield much more
                 reliable, and more concurrent programs.",
  acknowledgement = ack-nhfb,
}

%%% IEEE Transactions on Computers 55(9) article; the URL is the IEEE Xplore
%%% stamp page for the paper.
@Article{Lee:2006:TBR,
  author =       "S.-W. Lee and J.-L. Gaudiot",
  title =        "Throttling-Based Resource Management in High
                 Performance Multithreaded Architectures",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "55",
  number =       "9",
  pages =        "1142--1152",
  month =        sep,
  year =         "2006",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2006.154",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Mon Jul 4 15:35:56 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1668042",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

%%% SIGARCH CAN 34(5) printing; same title, authors, and pages as Li:2006:MEMb
%%% and Li:2006:MEMc.  NOTE(review): no DOI recorded here although Li:2006:MEMc
%%% carries one --- verify whether this printing has its own DOI.
@Article{Li:2006:MEMa,
  author =       "Xin Li and Marian Boldt and Reinhard von Hanxleden",
  title =        "Mapping {Esterel} onto a multi-threaded embedded
                 processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "5",
  pages =        "303--314",
  month =        dec,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

%%% SIGOPS OSR 40(5) printing; same title, authors, and pages as Li:2006:MEMa
%%% and Li:2006:MEMc.
@Article{Li:2006:MEMb,
  author =       "Xin Li and Marian Boldt and Reinhard von Hanxleden",
  title =        "Mapping {Esterel} onto a multi-threaded embedded
                 processor",
  journal =      j-OPER-SYS-REV,
  volume =       "40",
  number =       "5",
  pages =        "303--314",
  month =        dec,
  year =         "2006",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
}

%%% SIGPLAN Notices 41(11) printing of the same paper as Li:2006:MEMa and
%%% Li:2006:MEMb; this printing carries the DOI, abstract, and keywords.
@Article{Li:2006:MEMc,
  author =       "Xin Li and Marian Boldt and Reinhard von Hanxleden",
  title =        "Mapping {Esterel} onto a multi-threaded embedded
                 processor",
  journal =      j-SIGPLAN,
  volume =       "41",
  number =       "11",
  pages =        "303--314",
  month =        nov,
  year =         "2006",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1168857.1168896",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:49:40 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The synchronous language Esterel is well-suited for
                 programming control-dominated reactive systems at the
                 system level. It provides non-traditional control
                 structures, in particular concurrency and various forms
                 of preemption, which allow to concisely express
                 reactive behavior. As these control structures cannot
                 be mapped easily onto traditional, sequential
                 processors, an alternative approach that has emerged
                 recently makes use of special-purpose reactive
                 processors. However, the designs proposed so far have
                 limitations regarding completeness of the language
                 support, and did not really take advantage of
                 compile-time knowledge to optimize resource usage. This
                 paper presents a reactive processor, the Kiel Esterel
                 Processor 3a (KEP3a), and its compiler. The KEP3a
                 improves on earlier designs in several areas; most
                 notable are the support for exception handling and the
                 provision of context-dependent preemption handling
                 instructions. The KEP3a compiler presented here is to
                 our knowledge the first for multi-threaded reactive
                 processors. The translation of Esterel's preemption
                 constructs onto KEP3a assembler is straightforward;
                 however, a challenge is the correct and efficient
                 representation of Esterel's concurrency. The compiler
                 generates code that respects data and control
                 dependencies using the KEP3a priority-based scheduling
                 mechanism. We present a priority assignment approach
                 that makes use of a novel concurrent control flow graph
                 and has a complexity that in practice tends to be
                 linear in the size of the program. Unlike earlier
                 Esterel compilation schemes, this approach avoids
                 unnecessary context switches by considering each
                 thread's actual execution state at run time.
                 Furthermore, it avoids code replication present in
                 other approaches.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "concurrency; Esterel; low-power processing;
                 multi-threading; reactive systems",
}

%%% IEEE TPDS 17(6) article on spin detection hardware.
@Article{Li:2006:SDH,
  author =       "Tong Li and Alvin R. Lebeck and Daniel J. Sorin",
  title =        "Spin Detection Hardware for Improved Management of
                 Multithreaded Systems",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "17",
  number =       "6",
  pages =        "508--521",
  month =        jun,
  year =         "2006",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2006.78",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Jul 3 14:26:49 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

%%% SIGPLAN Notices 41(5) article; abstract and keywords are quoted from the
%%% ACM source (including its original wording).
@Article{Moon:2006:TMS,
  author =       "Sewon Moon and Byeong-Mo Chang",
  title =        "A thread monitoring system for multithreaded {Java}
                 programs",
  journal =      j-SIGPLAN,
  volume =       "41",
  number =       "5",
  pages =        "21--29",
  month =        may,
  year =         "2006",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1149982.1149985",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:42:34 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "To assist developing robust multithreaded software, we
                 develop a thread monitoring system for multithreaded
                 Java programs, which can trace or monitor running
                 threads and synchronization. We design a monitoring
                 system which has options to select interesting threads
                 and synchronized actions. Using this tool, programmers
                 can monitor only interesting threads and
                 synchronization in more details by selecting options,
                 and can detect a deadlock. It also provides profile
                 information after execution, which summarizes behavior
                 of running threads and synchronized actions during
                 execution. We implement the system based on code
                 inlining, and presents some experimental results.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "Java; monitoring; synchronization; thread",
}

%%% NOTE(review): corrected the source's ``A. Kolodnyt'' to ``A. Kolodny'' and
%%% accented ``Ayguad{\'e}'' (matching this file's use of TeX accents); the
%%% keywords are reproduced from the IEEE source, mixed capitalization and all.
@Article{Morad:2006:PPE,
  author =       "T. Y. Morad and U. C. Weiser and A. Kolodny and M.
                 Valero and E. Ayguad{\'e}",
  title =        "Performance, power efficiency and scalability of
                 asymmetric cluster chip multiprocessors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "14--17",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper evaluates asymmetric cluster chip
                 multiprocessor (ACCMP) architectures as a mechanism to
                 achieve the highest performance for a given power
                 budget. ACCMPs execute serial phases of multithreaded
                 programs on large high-performance cores whereas
                 parallel phases are executed on a mix of large and many
                 small simple cores. Theoretical analysis reveals a
                 performance upper bound for symmetric multiprocessors,
                 which is surpassed by asymmetric configurations at
                 certain power ranges. Our emulations show that
                 asymmetric multiprocessors can reduce power consumption
                 by more than two thirds with similar performance
                 compared to symmetric multiprocessors",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ACCMP; Application software; asymmetric cluster chip
                 multiprocessors; Chip Multiprocessors; Emulation;
                 Frequency; microprocessor chips; multi-threading;
                 multiprocessing systems; multithreaded program;
                 Optimized production technology; Parallel processing;
                 parallel processing; power consumption reduction; power
                 efficiency; Power Efficiency; Power system modeling;
                 Queueing analysis; Scalability; Upper bound; Voltage",
}

%%% SIGPLAN Notices 41(6) article on static race detection for Java.
@Article{Naik:2006:ESR,
  author =       "Mayur Naik and Alex Aiken and John Whaley",
  title =        "Effective static race detection for {Java}",
  journal =      j-SIGPLAN,
  volume =       "41",
  number =       "6",
  pages =        "308--319",
  month =        jun,
  year =         "2006",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1133255.1134018",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:42:48 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We present a novel technique for static race detection
                 in Java programs, comprised of a series of stages that
                 employ a combination of static analyses to successively
                 reduce the pairs of memory accesses potentially
                 involved in a race. We have implemented our technique
                 and applied it to a suite of multi-threaded Java
                 programs. Our experiments show that it is precise,
                 scalable, and useful, reporting tens to hundreds of
                 serious and previously unknown concurrency bugs in
                 large, widely-used programs with few false alarms.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "concurrency; Java; multi-threading; static race
                 detection; synchronization",
}

%%% ACM TOPLAS 28(6) article on interprocedural slicing of multithreaded
%%% programs.
@Article{Nanda:2006:ISM,
  author =       "Mangala Gowri Nanda and S. Ramesh",
  title =        "Interprocedural slicing of multithreaded programs with
                 applications to {Java}",
  journal =      j-TOPLAS,
  volume =       "28",
  number =       "6",
  pages =        "1088--1144",
  month =        nov,
  year =         "2006",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/1186632.1186636",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Sat Apr 14 11:13:21 MDT 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

%%% SIGPLAN Notices 41(11) article on recording shared-memory dependencies
%%% (strata) for deterministic replay debugging.
@Article{Narayanasamy:2006:RSM,
  author =       "Satish Narayanasamy and Cristiano Pereira and Brad
                 Calder",
  title =        "Recording shared memory dependencies using strata",
  journal =      j-SIGPLAN,
  volume =       "41",
  number =       "11",
  pages =        "229--240",
  month =        nov,
  year =         "2006",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1168857.1168886",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:49:40 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Significant time is spent by companies trying to
                 reproduce and fix bugs. BugNet and FDR are recent
                 architecture proposals that provide architecture
                 support for deterministic replay debugging. They focus
                 on continuously recording information about the
                 program's execution, which can be communicated back to
                 the developer. Using that information, the developer
                 can deterministically replay the program's execution to
                 reproduce and fix the bugs. In this paper, we propose
                 using Strata to efficiently capture the shared memory
                 dependencies. A stratum creates a time layer across all
                 the logs for the running threads, which separates all
                 the memory operations executed before and after the
                 stratum. A strata log allows us to determine all the
                 shared memory dependencies during replay and thereby
                 supports deterministic replay debugging for
                 multi-threaded programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "debugging; dependencies; logging; replay; shared
                 memory; strata",
}

%%% IEEE Computer Architecture Letters 5(1) article.  NOTE(review): the
%%% keyword ``Bridges'' duplicates an author surname and looks like an
%%% artifact of the source's keyword list --- verify before relying on it.
@Article{Ottoni:2006:SPC,
  author =       "G. Ottoni and R. Rangan and A. Stoler and M. J.
                 Bridges and D. I. August",
  title =        "From sequential programs to concurrent threads",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "6--9",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Chip multiprocessors are of increasing importance due
                 to difficulties in achieving higher clock frequencies
                 in uniprocessors, but their success depends on finding
                 useful work for the processor cores. This paper
                 addresses this challenge by presenting a simple
                 compiler approach that extracts non-speculative
                 thread-level parallelism from sequential codes. We
                 present initial results from this technique targeting a
                 validated dual-core processor model, achieving speedups
                 ranging from 9-48\% with an average of 25\% for
                 important benchmark loops over their single-threaded
                 versions. We also identify important next steps found
                 during our pursuit of higher degrees of automatic
                 threading",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "automatic threading; Bridges; Clocks; Computer
                 science; concurrency control; concurrent threads;
                 Frequency; Hardware; Microprocessors; multi-threading;
                 nonspeculative thread-level parallelism; Parallel
                 processing; Pipeline processing; program compiler;
                 program compilers; Program processors; sequential
                 programs",
}

%%% SIGARCH CAN 34(5) printing; same paper as Parashar:2006:SSBb and
%%% Parashar:2006:SSBc.  NOTE(review): no DOI recorded here although
%%% Parashar:2006:SSBc carries one --- verify whether this printing has its own.
@Article{Parashar:2006:SSBa,
  author =       "Angshuman Parashar and Anand Sivasubramaniam and
                 Sudhanva Gurumurthi",
  title =        "{SlicK}: slice-based locality exploitation for
                 efficient redundant multithreading",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "5",
  pages =        "95--105",
  month =        dec,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

%%% SIGOPS OSR 40(5) printing; same paper as Parashar:2006:SSBa and
%%% Parashar:2006:SSBc.
@Article{Parashar:2006:SSBb,
  author =       "Angshuman Parashar and Anand Sivasubramaniam and
                 Sudhanva Gurumurthi",
  title =        "{SlicK}: slice-based locality exploitation for
                 efficient redundant multithreading",
  journal =      j-OPER-SYS-REV,
  volume =       "40",
  number =       "5",
  pages =        "95--105",
  month =        dec,
  year =         "2006",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
}

@Article{Parashar:2006:SSBc,
  author =       "Angshuman Parashar and Anand Sivasubramaniam and
                 Sudhanva Gurumurthi",
  title =        "{SlicK}: slice-based locality exploitation for
                 efficient redundant multithreading",
  journal =      j-SIGPLAN,
  volume =       "41",
  number =       "11",
  pages =        "95--105",
  month =        nov,
  year =         "2006",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1168857.1168870",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:49:40 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Transient faults are expected to be a major
                 consideration in future microprocessors. Recent
                 proposals for transient fault detection in processor
                 cores have revolved around the idea of redundant
                 threading, which involves redundant execution of a
                 program across multiple execution contexts. This paper
                 presents a new approach to redundant threading by
                 bringing together the concepts of slice-level execution
                 and value and control-flow locality into a novel
                  partial redundant threading mechanism called SlicK. The
                 purpose of redundant execution is to check the
                 integrity of the outputs propagating out of the core
                 (typically through stores). SlicK implements redundancy
                 at the granularity of backward-slices of these output
                 instructions and exploits value and control-flow
                 locality to avoid redundantly executing slices that
                 lead to predictable outputs, thereby avoiding redundant
                 execution of a significant fraction of instructions
                 while maintaining extremely low vulnerabilities for
                 critical processor structures. We propose the
                 microarchitecture of a backward-slice extractor called
                 SliceEM that is able to identify backward slices
                 without interrupting the instruction flow, and show how
                 this extractor and a set of predictors can be
                 integrated into a redundant threading mechanism to form
                 SlicK. Detailed simulations with SPEC CPU2000
                 benchmarks show that SlicK can provide around 10.2\%
                 performance improvement over a well known redundant
                 threading mechanism, buying back over 50\% of the loss
                 suffered due to redundant execution. SlicK can keep the
                 Architectural Vulnerability Factors of processor
                 structures to typically 0\%-2\%. More importantly,
                 SlicK's slice-based mechanisms provide future
                 opportunities for exploring interesting points in the
                 performance-reliability design space based on market
                 segment needs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "backward slice extraction; microarchitecture;
                 redundant threading; transient faults",
}

@Article{Pickett:2006:SSF,
  author =       "Christopher J. F. Pickett and Clark Verbrugge",
  title =        "{SableSpMT}: a software framework for analysing
                 speculative multithreading in {Java}",
  journal =      j-SIGSOFT,
  volume =       "31",
  number =       "1",
  pages =        "59--66",
  month =        jan,
  year =         "2006",
  CODEN =        "SFENDP",
  DOI =          "https://doi.org/10.1145/1108768.1108809",
  ISSN =         "0163-5948 (print), 1943-5843 (electronic)",
  ISSN-L =       "0163-5948",
  bibdate =      "Wed Aug 1 17:15:12 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigsoft2000.bib",
  abstract =     "Speculative multithreading (SpMT) is a promising
                 optimisation technique for achieving faster execution
                 of sequential programs on multiprocessor hardware.
                 Analysis of and data acquisition from such systems is
                 however difficult and complex, and is typically limited
                 to a specific hardware design and simulation
                 environment. We have implemented a flexible,
                 software-based speculative multithreading architecture
                 within the context of a full-featured Java virtual
                 machine. We consider the entire Java language and
                 provide a complete set of support features for
                 speculative execution, including return value
                 prediction. Using our system we are able to generate
                 extensive dynamic analysis information, analyse the
                 effects of runtime feedback, and determine the impact
                 of incorporating static, offline information. Our
                 approach allows for accurate analysis of Java SpMT on
                 existing, commodity multiprocessor hardware, and
                 provides a vehicle for further experimentation with
                 speculative approaches and optimisations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGSOFT Software Engineering Notes",
  journal-URL =  "https://dl.acm.org/citation.cfm?id=J728",
}

@Article{Pratikakis:2006:LCS,
  author =       "Polyvios Pratikakis and Jeffrey S. Foster and Michael
                 Hicks",
  title =        "{LOCKSMITH}: context-sensitive correlation analysis
                 for race detection",
  journal =      j-SIGPLAN,
  volume =       "41",
  number =       "6",
  pages =        "320--331",
  month =        jun,
  year =         "2006",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1133255.1134019",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:42:48 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "One common technique for preventing data races in
                 multi-threaded programs is to ensure that all accesses
                 to shared locations are consistently protected by a
                 lock. We present a tool called LOCKSMITH for detecting
                 data races in C programs by looking for violations of
                 this pattern. We call the relationship between locks
                 and the locations they protect consistent correlation,
                 and the core of our technique is a novel
                 constraint-based analysis that infers consistent
                 correlation context-sensitively, using the results to
                 check that locations are properly guarded by locks. We
                 present the core of our algorithm for a simple formal
                  language {$\lambda_>$} which we have proven sound, and
                 discuss how we scale it up to an algorithm that aims to
                 be sound for all of C. We develop several techniques to
                 improve the precision and performance of the analysis,
                 including a sharing analysis for inferring thread
                 locality; existential quantification for modeling locks
                 in data structures; and heuristics for modeling unsafe
                 features of C such as type casts. When applied to
                 several benchmarks, including multi-threaded servers
                 and Linux device drivers, LOCKSMITH found several races
                  while producing a modest number of false alarms.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "context-sensitivity; correlation; locksmith;
                 multi-threaded programming; race detection; type
                 inference",
}

@Article{Reddy:2006:UPB,
  author =       "Vimal K. Reddy and Eric Rotenberg and Sailashri
                 Parthasarathy",
  title =        "Understanding prediction-based partial redundant
                  threading for low-overhead, high-coverage fault
                 tolerance",
  journal =      j-SIGPLAN,
  volume =       "41",
  number =       "11",
  pages =        "83--94",
  month =        nov,
  year =         "2006",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1168917.1168869",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:49:40 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Redundant threading architectures duplicate all
                 instructions to detect and possibly recover from
                 transient faults. Several lighter weight Partial
                 Redundant Threading (PRT) architectures have been
                 proposed recently. (i) Opportunistic Fault Tolerance
                 duplicates instructions only during periods of poor
                 single-thread performance. (ii) ReStore does not
                 explicitly duplicate instructions and instead exploits
                 mispredictions among highly confident branch
                 predictions as symptoms of faults. (iii) Slipstream
                 creates a reduced alternate thread by replacing many
                 instructions with highly confident predictions. We
                 explore PRT as a possible direction for achieving the
                 fault tolerance of full duplication with the
                 performance of single-thread execution. Opportunistic
                 and ReStore yield partial coverage since they are
                 restricted to using only partial duplication or only
                 confident predictions, respectively. Previous analysis
                 of Slipstream fault tolerance was cursory and concluded
                 that only duplicated instructions are covered. In this
                 paper, we attempt to better understand Slipstream's
                 fault tolerance, conjecturing that the mixture of
                 partial duplication and confident predictions actually
                 closely approximates the coverage of full duplication.
                 A thorough dissection of prediction scenarios confirms
                 that faults in nearly 100\% of instructions are
                 detectable. Fewer than 0.1\% of faulty instructions are
                 not detectable due to coincident faults and
                 mispredictions. Next we show that the current recovery
                 implementation fails to leverage excellent detection
                 capability, since recovery sometimes initiates
                 belatedly, after already retiring a detected faulty
                 instruction. We propose and evaluate a suite of simple
                 microarchitectural alterations to recovery and
                 checking. Using the best alterations, Slipstream can
                 recover from faults in 99\% of instructions, compared
                 to only 78\% of instructions without alterations. Both
                 results are much higher than predicted by past
                 research, which claims coverage for only duplicated
                 instructions, or 65\% of instructions. On an 8-issue
                 SMT processor, Slipstream performs within 1.3\% of
                 single-thread execution whereas full duplication slows
                 performance by 14\%. A key byproduct of this paper is a
                 novel analysis framework in which every dynamic
                 instruction is considered to be hypothetically faulty,
                 thus not requiring explicit fault injection. Fault
                 coverage is measured in terms of the fraction of
                 candidate faulty instructions that are directly or
                  indirectly detectable before retirement.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "branch prediction; chip multiprocessor (CMP);
                 redundant multithreading; simultaneous multithreading
                 (SMT); slipstream processor; time redundancy; transient
                 faults; value prediction",
}

@Article{Ro:2006:DEH,
  author =       "Won W. Ro and Stephen P. Crago and Alvin M. Despain
                 and Jean-Luc Gaudiot",
  title =        "Design and evaluation of a hierarchical decoupled
                 architecture",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "38",
  number =       "3",
  pages =        "237--259",
  month =        dec,
  year =         "2006",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-006-8321-2",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Jul 9 17:32:29 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=38&issue=3;
                 https://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=38&issue=3&spage=237",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
  keywords =     "Data prefetching; Decoupled architectures; Instruction
                 level parallelism; Memory latency hiding;
                 Multithreading; Parallel architecture; Speculative
                 execution",
}

@Article{Russell:2006:ESRa,
  author =       "Kenneth Russell and David Detlefs",
  title =        "Eliminating synchronization-related atomic operations
                 with biased locking and bulk rebiasing",
  journal =      j-SIGPLAN,
  volume =       "41",
  number =       "10",
  pages =        "263--272",
  month =        oct,
  year =         "2006",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1167515.1167496",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:47:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The Java{\TM} programming language contains built-in
                 synchronization primitives for use in constructing
                 multithreaded programs. Efficient implementation of
                 these synchronization primitives is necessary in order
                 to achieve high performance. Recent research [9, 12,
                 10, 3, 7] has focused on the run-time elimination of
                 the atomic operations required to implement object
                 monitor synchronization primitives. This paper
                 describes a novel technique called store-free biased
                 locking which eliminates all synchronization-related
                 atomic operations on uncontended object monitors. The
                 technique supports the bulk transfer of object
                 ownership from one thread to another, and the selective
                 disabling of the optimization where unprofitable, using
                 epoch-based bulk rebiasing and revocation. It has been
                 implemented in the production version of the Java
                  HotSpot{\TM} VM and has yielded significant performance
                 improvements on a range of benchmarks and applications.
                 The technique is applicable to any virtual
                 machine-based programming language implementation with
                 mostly block-structured locking primitives.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "atomic; bias; Java; lock; monitor; optimization;
                 rebias; reservation; revoke; synchronization",
}

@Article{Sen:2006:OEP,
  author =       "Koushik Sen and Grigore Rosu and Gul Agha",
  title =        "Online efficient predictive safety analysis of
                 multithreaded programs",
  journal =      j-INT-J-SOFTW-TOOLS-TECHNOL-TRANSFER,
  volume =       "8",
  number =       "3",
  pages =        "248--260",
  month =        jun,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1007/s10009-005-0192-y",
  ISSN =         "1433-2779 (print), 1433-2787 (electronic)",
  ISSN-L =       "1433-2779",
  bibdate =      "Wed Jul 9 18:12:21 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=1433-2779&volume=8&issue=3;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=1433-2779&volume=8&issue=3&spage=248",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal on Software Tools for Technology
                 Transfer: STTT",
  keywords =     "JMPaX; Multithreaded analysis; Predictive analysis;
                 Runtime monitoring; Vector clock",
}

@Article{Shin:2006:ADT,
  author =       "Chulho Shin and Seong-Won Lee and Jean-Luc Gaudiot",
  title =        "Adaptive dynamic thread scheduling for simultaneous
                 multithreaded architectures with a detector thread",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "66",
  number =       "10",
  pages =        "1304--1321",
  month =        oct,
  year =         "2006",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Jul 11 20:32:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Smith:2006:ITP,
  author =       "Geoffrey Smith",
  title =        "Improved typings for probabilistic noninterference in
                 a multi-threaded language",
  journal =      j-J-COMP-SECUR,
  volume =       "14",
  number =       "6",
  pages =        "591--623",
  month =        "????",
  year =         "2006",
  CODEN =        "JCSIET",
  DOI =          "https://doi.org/10.3233/JCS-2006-14605",
  ISSN =         "0926-227X (print), 1875-8924 (electronic)",
  ISSN-L =       "0926-227X",
  bibdate =      "Tue May 24 06:23:23 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jcompsecur.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computer Security",
  journal-URL =  "http://content.iospress.com/journals/journal-of-computer-security",
}

@Article{Trancoso:2006:CCM,
  author =       "Pedro Trancoso and Paraskevas Evripidou and Kyriakos
                 Stavrou and Costas Kyriacou",
  title =        "A Case for Chip Multiprocessors Based on the
                 Data-Driven Multithreading Model",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "34",
  number =       "3",
  pages =        "213--235",
  month =        jun,
  year =         "2006",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-006-0016-z",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 9 16:05:59 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=34&issue=3;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=34&issue=3&spage=213",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  keywords =     "Chip multiprocessor; data-driven execution;
                 multithreading; parallel processing",
}

@Article{Vasconcelos:2006:TCM,
  author =       "Vasco T. Vasconcelos and Simon J. Gay and Ant{\'o}nio
                 Ravara",
  title =        "Type checking a multithreaded functional language with
                 session types",
  journal =      j-THEOR-COMP-SCI,
  volume =       "368",
  number =       "1--2",
  pages =        "64--87",
  day =          "5",
  month =        dec,
  year =         "2006",
  CODEN =        "TCSCDI",
  ISSN =         "0304-3975 (print), 1879-2294 (electronic)",
  ISSN-L =       "0304-3975",
  bibdate =      "Tue Mar 29 08:55:29 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/03043975",
  acknowledgement = ack-nhfb,
  fjournal =     "Theoretical Computer Science",
  journal-URL =  "http://www.sciencedirect.com/science/journal/03043975",
}

@Article{Wang:2006:RAA,
  author =       "L. Wang and S. D. Stoller",
  title =        "Runtime analysis of atomicity for multithreaded
                 programs",
  journal =      j-IEEE-TRANS-SOFTW-ENG,
  volume =       "32",
  number =       "2",
  pages =        "93--110",
  month =        feb,
  year =         "2006",
  CODEN =        "IESEDJ",
  DOI =          "https://doi.org/10.1109/TSE.2006.1599419",
  ISSN =         "0098-5589 (print), 1939-3520 (electronic)",
  ISSN-L =       "0098-5589",
  bibdate =      "Thu Feb 1 11:00:42 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1599419",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Software Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32",
}

@Article{Xu:2006:RTR,
  author =       "Min Xu and Mark D. Hill and Rastislav Bodik",
  title =        "A regulated transitive reduction {(RTR)} for longer
                 memory race recording",
  journal =      j-SIGPLAN,
  volume =       "41",
  number =       "11",
  pages =        "49--60",
  month =        nov,
  year =         "2006",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1168919.1168865",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:49:40 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multithreaded deterministic replay has important
                 applications in cyclic debugging, fault tolerance and
                 intrusion analysis. Memory race recording is a key
                 technology for multithreaded deterministic replay. In
                 this paper, we considerably improve our previous
                 always-on Flight Data Recorder (FDR) in four ways:\par
                 \begin{itemize} \item Longer recording by reducing the
                 log size growth rate to approximately one byte per
                 thousand dynamic instructions. \item Lower hardware
                 cost by reducing the cost to 24 KB per processor core.
                 \item Simpler design by modifying only the cache
                 coherence protocol, but not the cache. \item Broader
                 applicability by supporting both Sequential Consistency
                 (SC) and Total Store Order (TSO) memory consistency
                 models (existing recorders support only SC).
                 \end{itemize} These improvements stem from several
                 ideas: (1) a Regulated Transitive Reduction (RTR)
                 recording algorithm that creates stricter and
                 vectorizable dependencies to reduce the log growth
                 rate; (2) a Set/LRU timestamp approximation method that
                 better approximates timestamps of uncached memory
                 locations to reduce the hardware cost; (3) an
                 order-value-hybrid recording method that explicitly
                 logs the value of potential SC-violating load
                 instructions to support multiprocessor systems with
                 TSO.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "determinism; multithreading; race recording",
}

@Article{Ziarek:2006:SMC,
  author =       "Lukasz Ziarek and Philip Schatz and Suresh
                 Jagannathan",
  title =        "Stabilizers: a modular checkpointing abstraction for
                 concurrent functional programs",
  journal =      j-SIGPLAN,
  volume =       "41",
  number =       "9",
  pages =        "136--147",
  month =        sep,
  year =         "2006",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1160074.1159822",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:46:22 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Transient faults that arise in large-scale software
                 systems can often be repaired by re-executing the code
                 in which they occur. Ascribing a meaningful semantics
                 for safe re-execution in multi-threaded code is not
                 obvious, however. For a thread to correctly re-execute
                 a region of code, it must ensure that all other threads
                 which have witnessed its unwanted effects within that
                 region are also reverted to a meaningful earlier state.
                 If not done properly, data inconsistencies and other
                  undesirable behavior may result. However, automatically
                 determining what constitutes a consistent global
                 checkpoint is not straightforward since thread
                 interactions are a dynamic property of the program. In
                 this paper, we present a safe and efficient
                 checkpointing mechanism for Concurrent ML (CML) that
                 can be used to recover from transient faults. We
                 introduce a new linguistic abstraction called
                 stabilizers that permits the specification of
                 per-thread monitors and the restoration of globally
                 consistent checkpoints. Safe global states are computed
                 through lightweight monitoring of communication events
                 among threads (e.g. message-passing operations or
                 updates to shared variables). Our experimental results
                 on several realistic, multithreaded, server-style CML
                 applications, including a web server and a windowing
                 toolkit, show that the overheads to use stabilizers are
                 small, and lead us to conclude that they are a viable
                 mechanism for defining safe checkpoints in concurrent
                 functional programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "checkpointing; concurrent ML; concurrent programming;
                 error recovery; exception handling; transactions",
}

@Article{Benaya:2007:UTA,
  author =       "Tamar Benaya and Ela Zur",
  title =        "Understanding threads in an advanced {Java} course",
  journal =      j-SIGCSE,
  volume =       "39",
  number =       "3",
  pages =        "323--323",
  month =        sep,
  year =         "2007",
  CODEN =        "SIGSD3",
  DOI =          "https://doi.org/10.1145/1269900.1268890",
  ISSN =         "0097-8418 (print), 2331-3927 (electronic)",
  ISSN-L =       "0097-8418",
  bibdate =      "Sat Nov 17 16:57:36 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigcse2000.bib",
  note =         "Proceedings of the 12th Annual SIGCSE Conference on
                 Innovation and Technology in Computer Science Education
                 (ITiCSE'07).",
  abstract =     "This poster describes difficulties in understanding
                 threads in an Advanced Java course given at the
                 Computer Science department of the Open University of
                 Israel (OUI). We present a typical question which
                 focuses on several aspects of multi-threaded
                 programming given in an exam. We discuss the students'
                 answers and point to typical misunderstandings of the
                 topic.",
  acknowledgement = ack-nhfb,
  fjournal =     "SIGCSE Bulletin (ACM Special Interest Group on
                 Computer Science Education)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J688",
}

@Article{Benner:2007:SLS,
  author =       "Peter Benner and Maribel Castillo and Rafael Mayo and
                 Enrique S. Quintana-Ort{\'\i} and Gregorio
                 Quintana-Ort{\'\i}",
  title =        "Stabilizing large-scale generalized systems on
                 parallel computers using multithreading and
                 message-passing",
  journal =      j-CCPE,
  volume =       "19",
  number =       "4",
  pages =        "531--542",
  day =          "25",
  month =        mar,
  year =         "2007",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1148",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:11 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "12 Dec 2006",
}

@Article{Bergstra:2007:SCE,
  author =       "J. A. Bergstra and C. A. Middelburg",
  title =        "Synchronous cooperation for explicit multi-threading",
  journal =      j-ACTA-INFO,
  volume =       "44",
  number =       "7--8",
  pages =        "525--569",
  month =        dec,
  year =         "2007",
  CODEN =        "AINFA2",
  DOI =          "https://doi.org/10.1007/s00236-007-0057-9",
  ISSN =         "0001-5903 (print), 1432-0525 (electronic)",
  ISSN-L =       "0001-5903",
  bibdate =      "Wed Jul 9 21:28:19 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0001-5903&volume=44&issue=7;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0001-5903&volume=44&issue=7&spage=525",
  acknowledgement = ack-nhfb,
  fjournal =     "Acta Informatica",
  journal-URL =  "http://www.springerlink.com/content/0001-5903",
}

%%% Review: restored the paired em-dash in the abstract --- the phrase
%%% ``--- but not data-for blocks'' had the second dash of
%%% ``but not data --- for'' collapsed into the following word.
@Article{Blundell:2007:MFC,
  author =       "Colin Blundell and Joe Devietti and E. Christopher
                 Lewis and Milo M. K. Martin",
  title =        "Making the fast case common and the uncommon case
                 simple in unbounded transactional memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "24--34",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1273440.1250667",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Hardware transactional memory has great potential to
                 simplify the creation of correct and efficient
                 multithreaded programs, allowing programmers to exploit
                 more effectively the soon-to-be-ubiquitous multi-core
                 designs. Several recent proposals have extended the
                 original bounded transactional memory to unbounded
                 transactional memory, a crucial step toward
                 transactions becoming a general-purpose primitive.
                 Unfortunately, supporting the concurrent execution of
                 an unbounded number of unbounded transactions is
                 challenging, and as a result, many proposed
                 implementations are complex.\par

                 This paper explores a different approach. First, we
                 introduce the permissions-only cache to extend the
                 bound at which transactions overflow to allow the fast,
                 bounded case to be used as frequently as possible.
                 Second, we propose OneTM to simplify the implementation
                 of unbounded transactional memory by bounding the
                 concurrency of transactions that overflow the cache.
                 These mechanisms work synergistically to provide a
                 simple and fast unbounded transactional memory
                 system.\par

                 The permissions-only cache efficiently maintains the
                 coherence permissions --- but not data --- for blocks read
                 or written transactionally that have been evicted from
                 the processor's caches. By holding coherence
                 permissions for these blocks, the regular cache
                 coherence protocol can be used to detect transactional
                 conflicts using only a few bits of on-chip storage per
                 overflowed cache block. OneTM allows only one
                 overflowed transaction at a time, relying on the
                 permissions-only cache to ensure that overflow is
                 infrequent. We present two implementations. In
                 OneTM-Serialized, an overflowed transaction simply
                 stalls all other threads in the application.\par

                 In OneTM-Concurrent, non-overflowed transactions and
                 non-transactional code can execute concurrently with
                 the overflowed transaction, providing more concurrency
                 while retaining OneTM's core simplifying assumption.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  keywords =     "concurrency; parallel programming; transactional
                 memory; transactions",
}

%%% ;login: 32(1), February 2007.  NOTE(review): no DOI is recorded and
%%% pages use the file's ``??--??'' unknown-pages placeholder; the URL
%%% field points at the issue's article page --- TODO fill in page
%%% numbers if they become known.
@Article{Boehm:2007:MCC,
  author =       "Hans Boehm and Bill Pugh and Doug Lea",
  title =        "Multithreading in {C} and {C++}",
  journal =      j-LOGIN,
  volume =       "32",
  number =       "1",
  pages =        "??--??",
  month =        feb,
  year =         "2007",
  CODEN =        "LOGNEM",
  ISSN =         "1044-6397",
  ISSN-L =       "1044-6397",
  bibdate =      "Fri Dec 7 11:34:27 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/usenix2000.bib;
                 https://www.usenix.org/publications/login",
  URL =          "https://www.usenix.org/publications/login/february-2007-volume-32-number-1/multithreading-c-and-c",
  acknowledgement = ack-nhfb,
  fjournal =     ";login: the USENIX Association newsletter",
}

%%% Review: joined ``counter example traces'' to the standard
%%% verification term ``counterexample traces'' in the abstract.
@Article{Burckhardt:2007:CCC,
  author =       "Sebastian Burckhardt and Rajeev Alur and Milo M. K.
                 Martin",
  title =        "{CheckFence}: checking consistency of concurrent data
                 types on relaxed memory models",
  journal =      j-SIGPLAN,
  volume =       "42",
  number =       "6",
  pages =        "12--21",
  month =        jun,
  year =         "2007",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1250734.1250737",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:55:30 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Concurrency libraries can facilitate the development
                 of multi-threaded programs by providing concurrent
                 implementations of familiar data types such as queues
                 or sets. There exist many optimized algorithms that can
                 achieve superior performance on multiprocessors by
                 allowing concurrent data accesses without using locks.
                 Unfortunately, such algorithms can harbor subtle
                 concurrency bugs. Moreover, they require memory
                 ordering fences to function correctly on relaxed memory
                 models.\par

                 To address these difficulties, we propose a
                 verification approach that can exhaustively check all
                 concurrent executions of a given test program on a
                 relaxed memory model and can verify that they are
                 observationally equivalent to a sequential execution.
                 Our CheckFence prototype automatically translates the C
                 implementation code and the test program into a SAT
                 formula, hands the latter to a standard SAT solver, and
                 constructs counterexample traces if there exist
                 incorrect executions. Applying CheckFence to five
                 previously published algorithms, we were able to (1)
                 find several bugs (some not previously known), and (2)
                 determine how to place memory ordering fences for
                 relaxed memory models.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "concurrent data structures; lock-free synchronization;
                 memory models; multi-threading; sequential consistency;
                 shared-memory multiprocessors; software model
                 checking",
}

%%% Review: removed a stray space in the abstract
%%% (``containment-checking -based'' -> ``containment-checking-based'').
@Article{Das:2007:FVT,
  author =       "Dipankar Das and P. P. Chakrabarti and Rajeev Kumar",
  title =        "Functional verification of task partitioning for
                 multiprocessor embedded systems",
  journal =      j-TODAES,
  volume =       "12",
  number =       "4",
  pages =        "44:1--44:??",
  month =        sep,
  year =         "2007",
  CODEN =        "ATASFO",
  DOI =          "https://doi.org/10.1145/1278349.1278357",
  ISSN =         "1084-4309 (print), 1557-7309 (electronic)",
  ISSN-L =       "1084-4309",
  bibdate =      "Thu Jun 12 18:09:35 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/todaes/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "With the advent of multiprocessor embedded platforms,
                 application partitioning and mapping have gained
                 primacy as a design step. The output of this design
                 step is a multithreaded partitioned application where
                 each thread is mapped to a processing element
                 (processor or ASIC) in the multiprocessor platform.
                 This partitioned application must be verified to be
                 consistent with the native unpartitioned application.
                 This verification task is called application (or task)
                 partitioning verification. \par

                 This work proposes a code-block-level
                 containment-checking-based methodology for application
                 partitioning verification. We use a UML-based
                 code-block-level modeling language which is rich enough
                 to model most designs. We formulate the application
                 partitioning verification problem as a special case of
                 the containment checking problem, which we call the
                 complete containment checking problem. We propose a
                 state space reduction technique specific to the
                 containment checking, reachability analysis, and
                 deadlock detection problems. We propose novel data
                 structures and token propagation methodologies which
                 enhance the efficiency of containment checking. We
                 present an efficient containment checking algorithm for
                 the application partitioning verification problem. We
                 develop a containment checking tool called TraceMatch
                 and present experimental results. We present a
                 comparison of the state space reduction achieved by
                 TraceMatch with that achieved by formal analysis and
                 verification tools like Spin, PEP, PROD, and LoLA.",
  acknowledgement = ack-nhfb,
  articleno =    "44",
  fjournal =     "ACM Transactions on Design Automation of Electronic
                 Systems (TODAES)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J776",
  keywords =     "Containment checking; multiprocessor embedded systems;
                 state space reduction; UML activity diagrams",
}

%%% ACM TACO 4(2), June 2007; article-number journal, so pages are
%%% recorded as 12:1--12:?? with articleno = 12.  CODEN uses the file's
%%% ``????'' unknown-value placeholder.
@Article{Dou:2007:CCM,
  author =       "Jialin Dou and Marcelo Cintra",
  title =        "A compiler cost model for speculative
                 parallelization",
  journal =      j-TACO,
  volume =       "4",
  number =       "2",
  pages =        "12:1--12:??",
  month =        jun,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1250727.1250732",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 16 11:40:54 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Speculative parallelization is a technique that allows
                 code sections that cannot be fully analyzed by the
                 compiler to be aggressively executed in parallel.
                 However, while speculative parallelization can
                 potentially deliver significant speedups, several
                 overheads associated with this technique can limit
                 these speedups in practice. This paper proposes a novel
                 compiler static cost model of speculative multithreaded
                 execution that can be used to predict the resulting
                 performance. This model attempts to predict the
                 expected speedups, or slowdowns, of the candidate
                 speculative sections based on the estimation of the
                 combined runtime effects of various overheads, and
                 taking into account the scheduling restrictions of most
                 speculative execution environments. The model is based
                 on estimating the likely execution duration of threads
                 and considers all the possible permutations of these
                 threads. This model also produces a quantitative
                 estimate of the speedup, which is different from prior
                 heuristics that only qualitatively estimate the
                 benefits of speculative multithreaded execution. In
                 previous work, a limited version of the framework was
                 evaluated on a number of loops from a collection of
                 SPEC benchmarks that suffer mainly from load imbalance
                 and thread dispatch and commit overheads. In this work,
                 an extended framework is also evaluated on loops that
                 may suffer from data-dependence violations.
                 Experimental results show that prediction accuracy is
                 lower when loops with violations are included.
                 Nevertheless, accuracy is still very high for a static
                 model: the framework can identify, on average, 45\% of
                 the loops that cause slowdowns and, on average, 96\% of
                 the loops that lead to speedups; it predicts the
                 speedups or slowdowns with an error of less than 20\%
                 for an average of 28\% of the loops across the
                 benchmarks and with an error of less than 50\% for an
                 average of 80\% of the loops. Overall, the framework
                 often outperforms, by as much as 25\%, a naive approach
                 that attempts to speculatively parallelize all the
                 loops considered, and is able to curb the large
                 slowdowns caused in many cases by this naive
                 approach.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "speculative multithreading; speculative
                 parallelization; thread-level speculation",
}

%%% ACM SIGPLAN Notices 42(6), June 2007: the Goldilocks race- and
%%% transaction-aware Java runtime.  Abstract included verbatim;
%%% paragraph breaks use \par plus a blank line per house style.
@Article{Elmas:2007:GRT,
  author =       "Tayfun Elmas and Shaz Qadeer and Serdar Tasiran",
  title =        "{Goldilocks}: a race and transaction-aware {Java}
                 runtime",
  journal =      j-SIGPLAN,
  volume =       "42",
  number =       "6",
  pages =        "245--255",
  month =        jun,
  year =         "2007",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1273442.1250762",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:55:30 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Data races often result in unexpected and erroneous
                 behavior. In addition to causing data corruption and
                 leading programs to crash, the presence of data races
                 complicates the semantics of an execution which might
                 no longer be sequentially consistent. Motivated by
                 these observations, we have designed and implemented a
                 Java runtime system that monitors program executions
                 and throws a DataRaceException when a data race is
                 about to occur. Analogous to other runtime exceptions,
                 the DataRaceException provides two key benefits. First,
                 accesses causing race conditions are interrupted and
                 handled before they cause errors that may be difficult
                 to diagnose later. Second, if no DataRaceException is
                 thrown in an execution, it is guaranteed to be
                 sequentially consistent. This strong guarantee helps to
                 rule out many concurrency-related possibilities as the
                 cause of erroneous behavior. When a DataRaceException
                 is caught, the operation, thread, or program causing it
                 can be terminated gracefully. Alternatively, the
                 DataRaceException can serve as a conflict-detection
                 mechanism in optimistic uses of concurrency.\par

                 We start with the definition of data-race-free
                 executions in the Java memory model. We generalize this
                 definition to executions that use transactions in
                 addition to locks and volatile variables for
                 synchronization. We present a precise and efficient
                 algorithm for dynamically verifying that an execution
                 is free of data races. This algorithm generalizes the
                 Goldilocks algorithm for data-race detection by
                 handling transactions and providing the ability to
                 distinguish between read and write accesses. We have
                 implemented our algorithm and the DataRaceException in
                 the Kaffe Java Virtual Machine. We have evaluated our
                 system on a variety of publicly available Java
                 benchmarks and a few microbenchmarks that combine
                 lock-based and transaction-based synchronization. Our
                 experiments indicate that our implementation has
                 reasonable overhead. Therefore, we believe that in
                 addition to being a debugging tool, the
                 DataRaceException may be a viable mechanism to enforce
                 the safety of executions of multithreaded Java
                 programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "data-race detection; Java runtime; runtime monitoring;
                 software transactions",
}

%%% IEEE Micro 27(6).  The month field concatenates the nov and dec
%%% @string macros with {\slash} so the issue renders as ``Nov./Dec.''.
@Article{Emer:2007:STV,
  author =       "Joel Emer and Mark D. Hill and Yale N. Patt and Joshua
                 J. Yi and Derek Chiou and Resit Sendag",
  title =        "Single-Threaded vs. Multithreaded: Where Should We
                 Focus?",
  journal =      j-IEEE-MICRO,
  volume =       "27",
  number =       "6",
  pages =        "14--24",
  month =        nov # "\slash " # dec,
  year =         "2007",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2007.109",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Wed Jul 2 21:58:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

%%% ACM SIGPLAN Notices 42(1), January 2007: automatic lock allocation
%%% for programs annotated with atomic sections.
@Article{Emmi:2007:LA,
  author =       "Michael Emmi and Jeffrey S. Fischer and Ranjit Jhala
                 and Rupak Majumdar",
  title =        "Lock allocation",
  journal =      j-SIGPLAN,
  volume =       "42",
  number =       "1",
  pages =        "291--296",
  month =        jan,
  year =         "2007",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1190216.1190260",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:53:14 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We introduce lock allocation, an automatic technique
                 that takes a multi-threaded program annotated with
                 atomic sections (that must be executed atomically), and
                 infers a lock assignment from global variables to locks
                 and a lock instrumentation that determines where each
                 lock should be acquired and released such that the
                 resulting instrumented program is guaranteed to
                 preserve atomicity and deadlock freedom (provided all
                 shared state is accessed only within atomic sections).
                 Our algorithm works in the presence of pointers and
                 procedures, and sets up the lock allocation problem as
                 a 0-1 ILP which minimizes the conflict cost between
                 atomic sections while simultaneously minimizing the
                 number of locks. We have implemented our algorithm for
                 both C with pthreads and Java, and have applied it to
                 infer locks in 15K lines of AOLserver code. Our
                 automatic allocation produces the same results as hand
                 annotations for most of this code, while solving the
                 optimization instances within a second for most
                 programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "atomicity; ILP; lock inference",
}

%%% Concurrency and Computation: Practice and Experience 19(3).  The
%%% fjournal value embeds the discretionary hyphen \- in ``Prac\-tice''
%%% per this file's house style for CCPE (cf. the other CCPE entries).
@Article{Eytani:2007:TFB,
  author =       "Yaniv Eytani and Klaus Havelund and Scott D. Stoller
                 and Shmuel Ur",
  title =        "Towards a framework and a benchmark for testing tools
                 for multi-threaded programs",
  journal =      j-CCPE,
  volume =       "19",
  number =       "3",
  pages =        "267--279",
  day =          "10",
  month =        mar,
  year =         "2007",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1068",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:10 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "1 Aug 2006",
}

%%% ACM TACO 4(3); article-number journal (pages 15:1--15:?? with
%%% articleno = 15).  CODEN uses the ``????'' unknown-value placeholder.
@Article{Gabor:2007:FES,
  author =       "Ron Gabor and Shlomo Weiss and Avi Mendelson",
  title =        "Fairness enforcement in switch on event
                 multithreading",
  journal =      j-TACO,
  volume =       "4",
  number =       "3",
  pages =        "15:1--15:??",
  month =        sep,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1275937.1275939",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 16 11:41:20 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The need to reduce power and complexity will increase
                 the interest in Switch On Event multithreading
                 (coarse-grained multithreading). Switch On Event
                 multithreading is a low-power and low-complexity
                 mechanism to improve processor throughput by switching
                 threads on execution stalls. Fairness may, however,
                 become a problem in a multithreaded processor. Unless
                 fairness is properly handled, some threads may starve
                 while others consume all of the processor cycles.
                 Heuristics that were devised in order to improve
                 fairness in simultaneous multithreading are not
                 applicable to Switch On Event multithreading. This
                 paper defines the fairness metric using the ratio of
                 the individual threads' speedups and shows how it can
                 be enforced in Switch On Event multithreading. Fairness
                 is controlled by forcing additional thread switch
                 points. These switch points are determined dynamically
                 by runtime estimation of the single threaded
                 performance of each of the individual threads. We
                 analyze the impact of the fairness enforcement
                 mechanism on aggregate IPC and weighted speedup. We
                 present simulation results of the performance of Switch
                 On Event multithreading. Switch On Event multithreading
                 achieves an average aggregate IPC increase of 26\% over
                 single thread and 12\% weighted speedup when no
                 fairness is enforced. In this case, a sixth of our runs
                 resulted in poor fairness in which one thread ran
                 extremely slowly (10 to 100 times slower than its
                 single-thread performance), while the other thread's
                 performance was hardly affected. By using the proposed
                 mechanism, we can guarantee fairness at different
                 levels of strictness and, in most cases, even improve
                 the weighted speedup.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "coarse-grained multithreading; fairness;
                 multithreading; performance; SOE; Switch on Event
                 multithreading; throughput; weighted speedup",
}

%%% VLDB Journal 16(1), January 2007.  NOTE(review): unlike most
%%% entries in this section, no DOI field is recorded --- TODO locate
%%% the publisher's DOI for this article and add it.
@Article{Ghoting:2007:CCF,
  author =       "Amol Ghoting and Gregory Buehrer and Srinivasan
                 Parthasarathy and Daehyun Kim and Anthony Nguyen and
                 Yen-Kuang Chen and Pradeep Dubey",
  title =        "Cache-conscious frequent pattern mining on modern and
                 emerging processors",
  journal =      j-VLDB-J,
  volume =       "16",
  number =       "1",
  pages =        "77--96",
  month =        jan,
  year =         "2007",
  CODEN =        "VLDBFR",
  ISSN =         "1066-8888 (print), 0949-877X (electronic)",
  ISSN-L =       "1066-8888",
  bibdate =      "Mon Jun 23 10:51:22 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Algorithms are typically designed to exploit the
                 current state of the art in processor technology.
                 However, as processor technology evolves, said
                 algorithms are often unable to derive the maximum
                 achievable performance on these modern architectures.
                 In this paper, we examine the performance of frequent
                 pattern mining algorithms on a modern processor. A
                 detailed performance study reveals that even the best
                 frequent pattern mining implementations, with highly
                 efficient memory managers, still grossly under-utilize
                 a modern processor. The primary performance bottlenecks
                 are {\em poor data locality\/} and {\em low instruction
                 level parallelism (ILP)}. We propose a {\em
                 cache-conscious prefix tree\/} to address this problem.
                 The resulting tree improves spatial locality and also
                 enhances the benefits from hardware cache line
                 prefetching. Furthermore, the design of this data
                 structure allows the use of {\em path tiling}, a novel
                 tiling strategy, to improve temporal locality. The
                 result is an overall speedup of up to 3.2 when compared
                 with state of the art implementations. We then show how
                 these algorithms can be improved further by realizing a
                 non-naive thread-based decomposition that targets {\em
                 simultaneously multi-threaded processors (SMT)}. A key
                 aspect of this decomposition is to ensure cache re-use
                 between threads that are co-scheduled at a fine
                 granularity. This optimization affords an additional
                 speedup of 50\%, resulting in an overall speedup of up
                 to 4.8. The proposed optimizations also provide
                 performance improvements on SMPs, and will most likely
                 be beneficial on emerging processors.",
  acknowledgement = ack-nhfb,
  fjournal =     "VLDB Journal: Very Large Data Bases",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J869",
  keywords =     "architecture-conscious algorithms; association rule
                 mining; cache-conscious data mining; frequent itemset
                 mining; frequent pattern mining",
}

%%% SIGCSE Bulletin 39(3); per the note field this is an ITiCSE'07
%%% proceedings paper published in the Bulletin.
@Article{Goldwasser:2007:INP,
  author =       "Michael H. Goldwasser and David Letscher",
  title =        "Introducing network programming into a {CS1} course",
  journal =      j-SIGCSE,
  volume =       "39",
  number =       "3",
  pages =        "19--22",
  month =        sep,
  year =         "2007",
  CODEN =        "SIGSD3",
  DOI =          "https://doi.org/10.1145/1269900.1268793",
  ISSN =         "0097-8418 (print), 2331-3927 (electronic)",
  ISSN-L =       "0097-8418",
  bibdate =      "Sat Nov 17 16:57:36 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/python.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigcse2000.bib",
  note =         "Proceedings of the 12th Annual SIGCSE Conference on
                 Innovation and Technology in Computer Science Education
                 (ITiCSE'07).",
  abstract =     "Incorporating advanced programming concepts into an
                 introductory programming course has to be done
                 carefully to avoid overwhelming the students. We
                 describe our experiences doing network programming in a
                 CS1 course taught in Python. The simplicity of the
                 built-in libraries allowed a fair amount of networking
                 to be introduced in a week-long module of the course.
                 In this short time we had the students writing both
                 multithreaded clients and servers.",
  acknowledgement = ack-nhfb,
  fjournal =     "SIGCSE Bulletin (ACM Special Interest Group on
                 Computer Science Education)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J688",
}

%%% ACM SIGPLAN Notices 42(6), June 2007: thread-modular shape analysis
%%% for multithreaded heap-manipulating programs.
@Article{Gotsman:2007:TMS,
  author =       "Alexey Gotsman and Josh Berdine and Byron Cook and
                 Mooly Sagiv",
  title =        "Thread-modular shape analysis",
  journal =      j-SIGPLAN,
  volume =       "42",
  number =       "6",
  pages =        "266--277",
  month =        jun,
  year =         "2007",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1273442.1250765",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:55:30 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We present the first shape analysis for multithreaded
                 programs that avoids the explicit enumeration of
                 execution-interleavings. Our approach is to
                 automatically infer a resource invariant associated
                 with each lock that describes the part of the heap
                 protected by the lock. This allows us to use a
                 sequential shape analysis on each thread. We show that
                 resource invariants of a certain class can be
                 characterized as least fixed points and computed via
                 repeated applications of shape analysis only on each
                 individual thread. Based on this approach, we have
                 implemented a thread-modular shape analysis tool and
                 applied it to concurrent heap-manipulating code from
                 Windows device drivers.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "abstract interpretation; concurrent programming; shape
                 analysis; static analysis",
}

@Article{Gravvanis:2007:PPA,
  author =       "George A. Gravvanis and Victor N. Epitropou and
                 Konstantinos M. Giannoutakis",
  title =        "On the performance of parallel approximate inverse
                 preconditioning using {Java} multithreading
                 techniques",
  journal =      j-APPL-MATH-COMP,
  volume =       "190",
  number =       "1",
  pages =        "255--270",
  day =          "1",
  month =        jul,
  year =         "2007",
  CODEN =        "AMHCBQ",
  ISSN =         "0096-3003 (print), 1873-5649 (electronic)",
  ISSN-L =       "0096-3003",
  bibdate =      "Sat Jul 12 09:03:06 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/00963003",
  acknowledgement = ack-nhfb,
  fjournal =     "Applied Mathematics and Computation",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00963003",
}

@Article{Hur:2007:MSM,
  author =       "Ibrahim Hur and Calvin Lin",
  title =        "Memory scheduling for modern microprocessors",
  journal =      j-TOCS,
  volume =       "25",
  number =       "4",
  pages =        "10:1--10:??",
  month =        dec,
  year =         "2007",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1314299.1314301",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jun 16 17:52:15 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The need to carefully schedule memory operations has
                 increased as memory performance has become increasingly
                 important to overall system performance. This article
                 describes the adaptive history-based (AHB) scheduler,
                 which uses the history of recently scheduled operations
                 to provide three conceptual benefits: (1) it allows the
                 scheduler to better reason about the delays associated
                 with its scheduling decisions, (2) it provides a
                 mechanism for combining multiple constraints, which is
                 important for increasingly complex DRAM structures, and
                 (3) it allows the scheduler to select operations so
                 that they match the program's mixture of Reads and
                 Writes, thereby avoiding certain bottlenecks within the
                 memory controller.\par

                 We have previously evaluated this scheduler in the
                 context of the IBM Power5. When compared with the state
                 of the art, this scheduler improves performance by
                 15.6\%, 9.9\%, and 7.6\% for the Stream, NAS, and
                 commercial benchmarks, respectively. This article
                 expands our understanding of the AHB scheduler in a
                 variety of ways. Looking backwards, we describe the
                 scheduler in the context of prior work that focused
                 exclusively on avoiding bank conflicts, and we show
                 that the AHB scheduler is superior for the IBM Power5,
                 which we argue will be representative of future
                 microprocessor memory controllers. Looking forwards, we
                 evaluate this scheduler in the context of future
                 systems by varying a number of microarchitectural
                 features and hardware parameters. For example, we show
                 that the benefit of this scheduler increases as we move
                 to multithreaded environments.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "adaptive history-based scheduling; memory scheduling;
                 memory system performance",
}

@InBook{Kollias:2007:APC,
  author =       "Giorgos Kollias and Efstratios Gallopoulos",
  title =        "Asynchronous {PageRank} computation in an interactive
                 multithreading environment",
  volume =       "07071",
  publisher =    "International Begegnungs- und Forschungszentrum
                 f{\"u}r Informatik",
  address =      "Wadern, Germany",
  pages =        "????",
  year =         "2007",
  ISBN =         "????",
  ISBN-13 =      "????",
  bibdate =      "Fri Feb 19 15:32:30 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pagerank.bib",
  series =       "Dagstuhl seminar proceedings",
  URL =          "http://drops.dagstuhl.de/opus/volltexte/2007/1065/pdf/07071.KolliasGiorgios.Paper.1065",
  acknowledgement = ack-nhfb,
}

@Article{Kumar:2007:ESI,
  author =       "Nagendra J. Kumar and Vasanth Asokan and Siddhartha
                 Shivshankar and Alexander G. Dean",
  title =        "Efficient software implementation of embedded
                 communication protocol controllers using asynchronous
                 software thread integration with time- and
                 space-efficient procedure calls",
  journal =      j-TECS,
  volume =       "6",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1210268.1210270",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Jun 12 15:20:58 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The overhead of context switching limits efficient
                 scheduling of multiple concurrent threads on a
                 uniprocessor when real-time requirements exist. A
                 software-implemented protocol controller may be
                 crippled by this problem. The available idle time may
                 be too short to recover through context switching, so
                 only the primary thread can execute during message
                 activity, slowing the secondary threads and potentially
                 missing deadlines. Asynchronous software thread
                 integration (ASTI) uses coroutine calls and
                 integration, letting threads make independent progress
                 efficiently, and reducing the needed context switches.
                 We demonstrate the methods with a software
                 implementation of an automotive communication protocol
                 (J1850) and several secondary threads.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
  keywords =     "asynchronous software thread integration; fine-grain
                 concurrency; hardware to software migration; J1850;
                 software-implemented communication protocol
                 controllers",
}

@Article{Laudon:2007:CWM,
  author =       "James Laudon and Lawrence Spracklen",
  title =        "The Coming Wave of Multithreaded Chip
                 Multiprocessors",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "35",
  number =       "3",
  pages =        "299--330",
  month =        jun,
  year =         "2007",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-007-0033-6",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 9 16:06:21 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=35&issue=3;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=35&issue=3&spage=299",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  keywords =     "Chip multiprocessing; multithreading; parallel
                 programming; performance",
}

@Article{Le:2007:IPM,
  author =       "H. Q. Le and W. J. Starke and J. S. Fields and F. P.
                 O'Connell and D. Q. Nguyen and B. J. Ronchetti and W.
                 M. Sauer and E. M. Schwarz and M. T. Vaden",
  title =        "{IBM POWER6} microarchitecture",
  journal =      j-IBM-JRD,
  volume =       "51",
  number =       "6",
  pages =        "639--??",
  month =        nov,
  year =         "2007",
  CODEN =        "IBMJAE",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Mon Jul 7 21:49:07 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.research.ibm.com/journal/",
  URL =          "http://www.research.ibm.com/journal/rd/516/le.html",
  abstract =     "This paper describes the implementation of the IBM
                 POWER6 microprocessor, a two-way simultaneous
                 multithreaded (SMT) dual-core chip whose key features
                 include binary compatibility with IBM POWER5
                 microprocessor-based systems; increased functional
                 capabilities, such as decimal floating-point and vector
                 multimedia extensions; significant reliability,
                 availability, and serviceability enhancements; and
                 robust scalability with up to 64 physical processors.
                 Based on a new industry-leading high-frequency core
                 architecture with enhanced SMT and driven by a
                 high-throughput symmetric multiprocessing (SMP) cache
                 and memory subsystem, the POWER6 chip achieves a
                 significant performance boost compared with its
                 predecessor, the POWER5 chip. Key extensions to the
                 coherence protocol enable POWER6 microprocessor-based
                 systems to achieve better SMP scalability while
                 enabling reductions in system packaging complexity and
                 cost.",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Journal of Research and Development",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
}

@Article{Leadbitter:2007:NM,
  author =       "P. Leadbitter and D. Page and N. P. Smart",
  title =        "Nondeterministic Multithreading",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "56",
  number =       "7",
  pages =        "992--998",
  month =        jul,
  year =         "2007",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2007.1049",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Mon Jul 4 15:03:40 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4216296",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Li:2007:CET,
  author =       "Peng Li and Steve Zdancewic",
  title =        "Combining events and threads for scalable network
                 services: implementation and evaluation of monadic,
                 application-level concurrency primitives",
  journal =      j-SIGPLAN,
  volume =       "42",
  number =       "6",
  pages =        "189--199",
  month =        jun,
  year =         "2007",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1273442.1250756",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:55:30 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper proposes to combine two seemingly opposed
                 programming models for building massively concurrent
                 network services: the event-driven model and the
                 multithreaded model. The result is a hybrid design that
                 offers the best of both worlds--the ease of use and
                 expressiveness of threads and the flexibility and
                 performance of events.\par

                 This paper shows how the hybrid model can be
                 implemented entirely at the application level using
                 concurrency monads in Haskell, which provides type-safe
                 abstractions for both events and threads. This approach
                 simplifies the development of massively concurrent
                 software in a way that scales to real-world network
                 services. The Haskell implementation supports
                 exceptions, symmetrical multiprocessing, software
                 transactional memory, asynchronous I/O mechanisms and
                 application-level network protocol stacks. Experimental
                 results demonstrate that this monad-based approach has
                 good performance: the threads are extremely lightweight
                 (scaling to ten million threads), and the I/O
                 performance compares favorably to that of Linux NPTL.
                 tens of thousands of simultaneous, mostly-idle client
                 connections. Such massively-concurrent programs are
                 difficult to implement, especially when other
                 requirements, such as high performance and strong
                 security, must also be met.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "concurrency; event; Haskell; implementation; monad;
                 networking; programming; scalability; thread",
}

@Article{Madan:2007:PEA,
  author =       "Niti Madan and Rajeev Balasubramonian",
  title =        "Power Efficient Approaches to Redundant
                 Multithreading",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "18",
  number =       "8",
  pages =        "1066--1079",
  month =        aug,
  year =         "2007",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2007.1090",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Jul 3 14:26:53 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Mahesri:2007:HSS,
  author =       "Aqeel Mahesri and Nicholas J. Wang and Sanjay J.
                 Patel",
  title =        "Hardware support for software controlled
                 multithreading",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "3--12",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241606",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Chip multi-processors have emerged as one of the most
                 effective uses of the huge number of transistors
                 available today and in the future, but questions remain
                 as to the best way to leverage CMPs to accelerate
                 single threaded applications. Previous approaches rely
                 on significant speculation to accomplish this goal. Our
                 proposal, NXA, is less speculative than previous
                 proposals, relying heavily on software to guarantee
                 thread correctness, though still allowing parallelism
                 in the presence of ambiguous dependences. It divides a
                 single thread of execution into multiple using the
                 master-worker paradigm where some set of master threads
                 execute code that spawns tasks for other, worker
                 threads. The master threads generally consist of
                 performance critical instructions that can prefetch
                 data, compute critical control decisions, or compute
                 performance critical dataflow slices. This prevents
                 non-critical instructions from competing with critical
                 instructions for processor resources, allowing the
                 critical thread (and thus the workload) to complete
                 faster. Empirical results from performance simulation
                 show a 20\% improvement in performance on a 2-way CMP
                 machine, demonstrating that software controlled
                 multithreading can indeed provide a benefit in the
                 presence of hardware support.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "{DASCMP'06}",
}

@Article{Marowka:2007:PCD,
  author =       "Ami Marowka",
  title =        "Parallel computing on any desktop",
  journal =      j-CACM,
  volume =       "50",
  number =       "9",
  pages =        "74--78",
  month =        sep,
  year =         "2007",
  CODEN =        "CACMA2",
  DOI =          "https://doi.org/10.1145/1284621.1284622",
  ISSN =         "0001-0782 (print), 1557-7317 (electronic)",
  ISSN-L =       "0001-0782",
  bibdate =      "Mon Jun 16 18:32:57 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Parallelization lets applications exploit the high
                 throughput of new multicore processors, and the OpenMP
                 parallel programming model helps developers create
                 multithreaded applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "Communications of the ACM",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J79",
}

@Article{Minh:2007:EHT,
  author =       "Chi Cao Minh and Martin Trautmann and JaeWoong Chung
                 and Austen McDonald and Nathan Bronson and Jared Casper
                 and Christos Kozyrakis and Kunle Olukotun",
  title =        "An effective hybrid transactional memory system with
                 strong isolation guarantees",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "69--80",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250673",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We propose signature-accelerated transactional memory
                 (SigTM), a hybrid TM system that reduces the overhead
                 of software transactions. SigTM uses hardware
                 signatures to track the read-set and write-set for
                 pending transactions and perform conflict detection
                 between concurrent threads. All other transactional
                 functionality, including data versioning, is
                 implemented in software. Unlike previously proposed
                 hybrid TM systems, SigTM requires no modifications to
                 the hardware caches, which reduces hardware cost and
                 simplifies support for nested transactions and
                 multithreaded processor cores. SigTM is also the first
                 hybrid TM system to provide strong isolation guarantees
                 between transactional blocks and non-transactional
                 accesses without additional read and write barriers in
                 non-transactional code.\par

                 Using a set of parallel programs that make frequent use
                 of coarse-grain transactions, we show that SigTM
                 accelerates software transactions by 30\% to 280\%. For
                 certain workloads, SigTM can match the performance of a
                 full-featured hardware TM system, while for workloads
                 with large read-sets it can be up to two times slower.
                 Overall, we show that SigTM combines the performance
                 characteristics and strong isolation guarantees of
                 hardware TM implementations with the low cost and
                 flexibility of software TM systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  keywords =     "multi-core architectures; parallel programming; strong
                 isolation; transactional memory",
}

@Article{Morandini:2007:UDS,
  author =       "Marco Morandini and Paolo Mantegazza",
  title =        "Using dense storage to solve small sparse linear
                 systems",
  journal =      j-TOMS,
  volume =       "33",
  number =       "1",
  pages =        "5:1--5:12",
  month =        mar,
  year =         "2007",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/1206040.1206045",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Sat Apr 14 09:48:58 MDT 2007",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toms.bib",
  abstract =     "A data structure is used to build a linear solver
                 specialized for relatively small sparse systems. The
                 proposed solver, optimized for run-time performance at
                 the expense of memory footprint, outperforms widely
                 used direct and sparse solvers for systems with between
                 100 and 3000 equations. A multithreaded version of the
                 solver is shown to give some speedups for problems with
                 medium fill-in, while it does not give any benefit for
                 very sparse problems.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Musuvathi:2007:ICB,
  author =       "Madanlal Musuvathi and Shaz Qadeer",
  title =        "Iterative context bounding for systematic testing of
                 multithreaded programs",
  journal =      j-SIGPLAN,
  volume =       "42",
  number =       "6",
  pages =        "446--455",
  month =        jun,
  year =         "2007",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1273442.1250785",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:55:30 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multithreaded programs are difficult to get right
                 because of unexpected interaction between concurrently
                 executing threads. Traditional testing methods are
                 inadequate for catching subtle concurrency errors which
                 manifest themselves late in the development cycle and
                 post-deployment. Model checking or systematic
                 exploration of program behavior is a promising
                 alternative to traditional testing methods. However, it
                 is difficult to perform systematic search on large
                 programs as the number of possible program behaviors
                 grows exponentially with the program size. Confronted
                 with this state-explosion problem, traditional model
                 checkers perform iterative depth-bounded search.
                 Although effective for message-passing software,
                 iterative depth-bounding is inadequate for
                 multithreaded software.\par

                 This paper proposes iterative context-bounding, a new
                 search algorithm that systematically explores the
                 executions of a multithreaded program in an order that
                 prioritizes executions with fewer context switches. We
                 distinguish between preempting and nonpreempting
                 context switches, and show that bounding the number of
                 preempting context switches to a small number
                 significantly alleviates the state explosion, without
                 limiting the depth of explored executions. We show both
                 theoretically and empirically that context-bounded
                 search is an effective method for exploring the
                 behaviors of multithreaded programs. We have
                 implemented our algorithm in two model checkers and
                 applied it to a number of real-world multithreaded
                 programs. Our implementation uncovered 9 previously
                 unknown bugs in our benchmarks, each of which was
                 exposed by an execution with at most 2 preempting
                 context switches. Our initial experience with the
                 technique is encouraging and demonstrates that
                 iterative context-bounding is a significant improvement
                 over existing techniques for testing multithreaded
                 programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "concurrency; context-bounding; model checking;
                 multithreading; partial-order reduction; shared-memory
                 programs; software testing",
}

@Article{Naik:2007:CMA,
  author =       "Mayur Naik and Alex Aiken",
  title =        "Conditional must not aliasing for static race
                 detection",
  journal =      j-SIGPLAN,
  volume =       "42",
  number =       "1",
  pages =        "327--338",
  month =        jan,
  year =         "2007",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1190216.1190265",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:53:14 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Race detection algorithms for multi-threaded programs
                 using the common lock-based synchronization idiom must
                 correlate locks with the memory locations they guard.
                 The heart of a proof of race freedom is showing that if
                 two locks are distinct, then the memory locations they
                 guard are also distinct. This is an example of a
                 general property we call conditional must not aliasing:
                 Under the assumption that two objects are not aliased,
                 prove that two other objects are not aliased. This
                 paper introduces and gives an algorithm for conditional
                 must not alias analysis and discusses experimental
                 results for sound race detection of Java programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "concurrency; Java; multi-threading; static race
                 detection; synchronization",
}

@Article{Narayanasamy:2007:ACB,
  author =       "Satish Narayanasamy and Zhenghao Wang and Jordan
                 Tigani and Andrew Edwards and Brad Calder",
  title =        "Automatically classifying benign and harmful data
                 races all using replay analysis",
  journal =      j-SIGPLAN,
  volume =       "42",
  number =       "6",
  pages =        "22--31",
  month =        jun,
  year =         "2007",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1250734.1250738",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:55:30 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Many concurrency bugs in multi-threaded programs are
                 due to data races. There have been many efforts to
                 develop static and dynamic mechanisms to automatically
                 find the data races. Most of the prior work has focused
                 on finding the data races and eliminating the false
                 positives.\par

                 In this paper, we instead focus on a dynamic analysis
                 technique to automatically classify the data races into
                 two categories --- the data races that are potentially
                 benign and the data races that are potentially harmful.
                 A harmful data race is a real bug that needs to be
                 fixed. This classification is needed to focus the
                 triaging effort on those data races that are
                 potentially harmful. Without prioritizing the data
                 races we have found that there are too many data races
                 to triage. Our second focus is to automatically provide
                 to the developer a reproducible scenario of the data
                 race, which allows the developer to understand the
                 different effects of a harmful data race on a program's
                 execution.\par

                 To achieve the above, we record a multi-threaded
                 program's execution in a replay log. The replay log is
                 used to replay the multi-threaded program, and during
                 replay we find the data races using a happens-before
                 based algorithm. To automatically classify if a data
                 race that we find is potentially benign or potentially
                 harmful, we replay the execution twice for a given data
                 race --- one for each possible order between the
                 conflicting memory operations. If the two replays for
                 the two orders produce the same result, then we
                 classify the data race to be potentially benign. We
                 discuss our experiences in using our replay based
                 dynamic data race checker on several Microsoft
                 applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "benign data races; concurrency bugs; replay",
}

@Article{Ostler:2007:IHT,
  author =       "Chris Ostler and Karam S. Chatha and Vijay Ramamurthi
                 and Krishnan Srinivasan",
  title =        "{ILP} and heuristic techniques for system-level design
                 on network processor architectures",
  journal =      j-TODAES,
  volume =       "12",
  number =       "4",
  pages =        "48:1--48:??",
  month =        sep,
  year =         "2007",
  CODEN =        "ATASFO",
  DOI =          "https://doi.org/10.1145/1278349.1278361",
  ISSN =         "1084-4309 (print), 1557-7309 (electronic)",
  ISSN-L =       "1084-4309",
  bibdate =      "Thu Jun 12 18:09:35 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/todaes/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Network processors incorporate several architectural
                 features, including symmetric multiprocessing (SMP),
                 block multithreading, and multiple memory elements, to
                 support the high-performance requirements of current
                 day applications. This article presents automated
                 system-level design techniques for application
                 development on such architectures. We propose integer
                 linear programming formulations and heuristic
                 techniques for process allocation and data mapping on
                 SMP and block-multithreading-based network processors.
                 The techniques incorporate process transformations and
                 multithreading-aware data mapping to maximize the
                 throughput of the application. The article presents
                 experimental results that evaluate the techniques by
                 implementing network processing applications on the
                 Intel IXP 2400 architecture.",
  acknowledgement = ack-nhfb,
  articleno =    "48",
  fjournal =     "ACM Transactions on Design Automation of Electronic
                 Systems (TODAES)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J776",
  keywords =     "block multithreading; multiprocessor",
}

@Article{Park:2007:MEP,
  author =       "Soyeon Park and Weihang Jiang and Yuanyuan Zhou and
                 Sarita Adve",
  title =        "Managing energy-performance tradeoffs for
                 multithreaded applications on multiprocessor
                 architectures",
  journal =      j-SIGMETRICS,
  volume =       "35",
  number =       "1",
  pages =        "169--180",
  month =        jun,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1254882.1254902",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Fri Jun 27 09:42:48 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In modern computers, non-performance metrics such as
                 energy consumption have become increasingly important,
                 requiring tradeoff with performance. A recent work has
                 proposed performance-guaranteed energy management, but
                 it is designed specifically for sequential applications
                 and cannot be used to a large class of multithreaded
                 applications running on high end computers and data
                 servers.\par

                 To address the above problem, this paper makes the
                 first attempt to provide performance-guaranteed energy
                 management for multithreaded applications on
                 multiprocessor architectures. We first conduct a
                 comprehensive study on the effects of energy adaptation
                 on thread synchronizations and show that a
                 multithreaded application suffers from not only local
                 slowdowns due to energy adaptation, but also
                 significant slowdowns propagated from other threads
                 because of synchronization. Based on these findings, we
                 design three Synchronization-Aware (SA) algorithms, LWT
                 (Lock Waiting Time-based), CSL (Critical Section
                 Length-based) and ODP (Operation Delay
                 Propagation-based) algorithms, to estimate the energy
                 adaptation-induced slowdowns on each thread. The local
                 slowdowns are then combined across multiple threads via
                 three aggregation methods (MAX, AVG and SUM) to
                 estimate the overall application slowdown.\par

                 We evaluate our methods using a large multithreaded
                 commercial application, IBM DB2 with
                 industrial-strength online transaction processing
                 (OLTP) workloads, and six SPLASH parallel scientific
                 applications. Our experimental results show that LWT
                 combined with the MAX aggregation method not only
                 controls the performance slowdown within the specified
                 limits but also conserves the most energy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
  keywords =     "energy and performance tradeoffs; low power design;
                 memory energy management; multithreaded applications",
}

@Article{Permandla:2007:TSP,
  author =       "Pratibha Permandla and Michael Roberson and
                 Chandrasekhar Boyapati",
  title =        "A type system for preventing data races and deadlocks
                 in the {Java Virtual Machine} language: 1",
  journal =      j-SIGPLAN,
  volume =       "42",
  number =       "7",
  pages =        "10--10",
  month =        jul,
  year =         "2007",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1254766.1254768",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:57:50 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In previous work on SafeJava we presented a type
                 system extension to the Java source language that
                 statically prevents data races and deadlocks in
                 multithreaded programs. SafeJava is expressive enough
                 to support common programming patterns, its type
                 checking is fast and scalable, and it requires little
                 programming overhead. SafeJava thus offers a promising
                 approach for making multithreaded programs more
                 reliable. This paper presents a corresponding type
                 system extension for the Java virtual machine language
                 (JVML). We call the resulting language SafeJVML.
                 Well-typed SafeJVML programs are guaranteed to be free
                 of data races and deadlocks. Designing a corresponding
                 type system for JVML is important because most Java
                 code is shipped in the JVML format. Designing a
                 corresponding type system for JVML is nontrivial
                 because of important differences between Java and JVML.
                 In particular, the absence of block structure in JVML
                 programs and the fact that they do not use named local
                 variables the way Java programs do make the type
                 systems for Java and JVML significantly different. For
                 example, verifying absence of races and deadlocks in
                 JVML programs requires performing an alias analysis,
                 something that was not necessary for verifying absence
                 of races and deadlocks in Java programs. This paper
                 presents static and dynamic semantics for Safe JVML. It
                 also includes a proof that the SafeJVML type system is
                 sound and that it prevents data races and deadlocks. To
                 the best of our knowledge, this is the first type
                 system for JVML that statically ensures absence of
                 synchronization errors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "data races; deadlocks; ownership types; SafeJava",
}

@Article{Pozniansky:2007:MEF,
  author =       "Eli Pozniansky and Assaf Schuster",
  title =        "{MultiRace}: efficient on-the-fly data race detection
                 in multithreaded {C++} programs",
  journal =      j-CCPE,
  volume =       "19",
  number =       "3",
  pages =        "327--340",
  day =          "10",
  month =        mar,
  year =         "2007",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1064",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:10 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "1 Aug 2006",
}

@Article{Rosu:2007:ITO,
  author =       "Grigore Ro{\c{s}}u and Koushik Sen",
  title =        "An instrumentation technique for online analysis of
                 multithreaded programs",
  journal =      j-CCPE,
  volume =       "19",
  number =       "3",
  pages =        "311--325",
  day =          "10",
  month =        mar,
  year =         "2007",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1066",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:10 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "1 Aug 2006",
}

@Article{Sharkey:2007:EOA,
  author =       "Joseph J. Sharkey and Dmitry V. Ponomarev",
  title =        "Exploiting Operand Availability for Efficient
                 Simultaneous Multithreading",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "56",
  number =       "2",
  pages =        "208--223",
  month =        feb,
  year =         "2007",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2007.28",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Mon Jul 4 15:03:37 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4042681",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Shi:2007:CCP,
  author =       "Xudong Shi and Feiqi Su and Jih-kwon Peir and Ye Xia
                 and Zhen Yang",
  title =        "{CMP} cache performance projection: accessibility vs.
                 capacity",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "13--20",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241607",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Efficient utilizing on-chip storage space on
                 Chip-Multiprocessors (CMPs) has become an important
                 research topic. Tradeoffs between data accessibility
                 and effective on-chip capacity have been studied
                 extensively. It requires costly simulations to
                 understand a wide-spectrum of the design space. In this
                 paper, we first develop an abstract model for
                 understanding the performance impact with respect to
                 data replication. To overcome the lack of real-time
                 interactions among multiple cores in the abstract
                 model, we propose a global stack simulation strategy to
                 study the performance of a variety of cache
                 organizations on CMPs. The global stack logically
                 incorporates a shared stack and per-core private stacks
                 to collect shared/private reuse (stack) distances for
                 every memory reference in a single simulation pass.
                 With the collected reuse distances, performance in
                 terms of hits/misses and average memory access times
                 can be calculated for various cache organizations. We
                 verify the stack results against individual
                 execution-driven simulations that consider realistic
                 cache parameters and delays using a set of commercial
                 multithreaded workloads. The results show that stack
                 simulations can accurately model the performance of
                 various cache organizations. The single-pass stack
                 simulation results demonstrate that the effectiveness
                 of various techniques for optimizing the CMP on-chip
                 storage is closely related to the working sets of the
                 workloads as well as to the total cache sizes.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  keywords =     "CMP caches; data replication; performance modeling and
                 projection; stack simulation",
  remark =       "{DASCMP'06}",
}

@Article{Smaragdakis:2007:TIC,
  author =       "Yannis Smaragdakis and Anthony Kay and Reimer Behrends
                 and Michal Young",
  title =        "Transactions with isolation and cooperation",
  journal =      j-SIGPLAN,
  volume =       "42",
  number =       "10",
  pages =        "191--210",
  month =        oct,
  year =         "2007",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1297027.1297042",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 11:00:28 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We present the TIC (Transactions with Isolation and
                 Cooperation) model for concurrent programming. TIC adds
                 to standard transactional memory the ability for a
                 transaction to observe the effects of other threads at
                 selected points. This allows transactions to cooperate,
                 as well as to invoke nonrepeatable or irreversible
                 operations, such as I/O. Cooperating transactions run
                 the danger of exposing intermediate state and of having
                 other threads change the transaction's state. The TIC
                 model protects against unanticipated interference by
                 having the type system keep track of all operations
                 that may (transitively) violate the atomicity of a
                 transaction and require the programmer to establish
                 consistency at appropriate points. The result is a
                 programming model that is both general and simple. We
                 have used the TIC model to re-engineer existing
                 lock-based applications including a substantial
                 multi-threaded web mail server and a memory allocator
                 with coarse-grained locking. Our experience confirms
                 the features of the TIC model: It is convenient for the
                 programmer, while maintaining the benefits of
                 transactional memory.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "nested transactions; open-nesting; punctuation; TIC;
                 transactional memory",
}

@Book{Sweetman:2007:SMR,
  author =       "Dominic Sweetman",
  title =        "See {MIPS} Run",
  publisher =    pub-MORGAN-KAUFMANN,
  address =      pub-MORGAN-KAUFMANN:adr,
  edition =      "Second",
  pages =        "xix + 492",
  year =         "2007",
  ISBN =         "0-12-088421-6",
  ISBN-13 =      "978-0-12-088421-6",
  LCCN =         "QA76.9.A73 S88 2007",
  bibdate =      "Thu Jun 20 10:21:55 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/linux.bib;
                 https://www.math.utah.edu/pub/tex/bib/master.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/unix.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Embedded computer systems --- Programming; MIPS
                 (Computer architecture); RISC microprocessors",
  libnote =      "Not yet in my library.",
  tableofcontents = "1: RISCs and MIPS architectures / 1 \\
                 2: MIPS architecture / 29 \\
                 3: Coprocessor 0: MIPS processor control / 53 \\
                 4: How caches work on MIPS processors / 79 \\
                 5: Exceptions, interrupts, and initialization / 105 \\
                 6: Low-level memory management and the TLB / 131 \\
                 7: Floating-point support / 151 \\
                 8: Complete guide to the MIPS instruction set / 183 \\
                 9: Reading MIPS assembly language / 263 \\
                 10: Porting software to the MIPS architecture / 279 \\
                 11: MIPS software standards (ABIs) / 311 \\
                 12: Debugging MIPS designs - debug and profiling
                 features / 339 \\
                 13: GNU/Linux from eight miles high / 363 \\
                 14: How hardware and software work together / 371 \\
                 15: MIPS specific issues in the Linux kernel / 399 \\
                 16: Linux application code, PIC, and libraries / 409
                 \\
                 Appendix A: MIPS multithreading / 415 \\
                 Appendix B: Other optional extensions to the MIPS
                 instruction set",
}

@Article{Tam:2007:TCS,
  author =       "David Tam and Reza Azimi and Michael Stumm",
  title =        "Thread clustering: sharing-aware scheduling on
                 {SMP--CMP--SMT} multiprocessors",
  journal =      j-OPER-SYS-REV,
  volume =       "41",
  number =       "3",
  pages =        "47--58",
  month =        jun,
  year =         "2007",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/1272996.1273004",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Fri Jun 20 17:16:31 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The major chip manufacturers have all introduced chip
                 multiprocessing (CMP) and simultaneous multithreading
                 (SMT) technology into their processing units. As a
                 result, even low-end computing systems and game
                 consoles have become shared memory multiprocessors with
                 L1 and L2 cache sharing within a chip. Mid- and
                 large-scale systems will have multiple processing chips
                 and hence consist of an SMP-CMP-SMT configuration with
                 non-uniform data sharing overheads. Current operating
                 system schedulers are not aware of these new cache
                 organizations, and as a result, distribute threads
                 across processors in a way that causes many
                 unnecessary, long-latency cross-chip cache
                 accesses.\par

                 In this paper we describe the design and implementation
                 of a scheme to schedule threads based on sharing
                 patterns detected online using features of standard
                 performance monitoring units (PMUs) available in
                 today's processing units. The primary advantage of
                 using the PMU infrastructure is that it is fine-grained
                 (down to the cache line) and has relatively low
                 overhead. We have implemented our scheme in Linux
                 running on an 8-way Power5 SMP-CMP-SMT
                 multi-processor. For commercial multithreaded server
                 workloads (VolanoMark, SPECjbb, and RUBiS), we are able
                 to demonstrate reductions in cross-chip cache accesses
                 of up to 70\%. These reductions lead to
                 application-reported performance improvements of up to
                 7\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
  keywords =     "affinity scheduling; cache behavior; cache locality;
                 CMP; detecting sharing; hardware performance counters;
                 hardware performance monitors; multithreading;
                 performance monitoring unit; resource allocation;
                 shared caches; sharing; simultaneous multithreading;
                 single-chip multiprocessors; SMP; SMT; thread
                 migration; thread placement; thread scheduling",
}

@Article{Walcott:2007:DPA,
  author =       "Kristen R. Walcott and Greg Humphreys and Sudhanva
                 Gurumurthi",
  title =        "Dynamic prediction of architectural vulnerability from
                 microarchitectural state",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "516--527",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250726",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Transient faults due to particle strikes are a key
                 challenge in microprocessor design. Driven by
                 exponentially increasing transistor counts, per-chip
                 faults are a growing burden. To protect against soft
                 errors, redundancy techniques such as redundant
                 multithreading (RMT) are often used. However, these
                 techniques assume that the probability that a
                 structural fault will result in a soft error (i.e., the
                 Architectural Vulnerability Factor (AVF)) is 100
                 percent, unnecessarily draining processor resources.
                 Due to the high cost of redundancy, there have been
                 efforts to throttle RMT at runtime. To date, these
                 methods have not incorporated an AVF model and
                 therefore tend to be ad hoc. Unfortunately, computing
                 the AVF of complex microprocessor structures (e.g., the
                 ISQ) can be quite involved.\par

                 To provide probabilistic guarantees about fault
                 tolerance, we have created a rigorous characterization
                 of AVF behavior that can be easily implemented in
                 hardware. We experimentally demonstrate AVF variability
                 within and across the SPEC2000 benchmarks and identify
                 strong correlations between structural AVF values and a
                 small set of processor metrics. Using these simple
                 indicators as predictors, we create a proof-of-concept
                 RMT implementation that demonstrates that AVF
                 prediction can be used to maintain a low fault
                 tolerance level without significant performance
                 impact.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  keywords =     "architecture vulnerability factor; microarchitecture;
                 performance; redundant multithreading; reliability",
}

@Article{Wang:2007:EAP,
  author =       "Perry H. Wang and Jamison D. Collins and Gautham N.
                 Chinya and Hong Jiang and Xinmin Tian and Milind Girkar
                 and Nick Y. Yang and Guei-Yuan Lueh and Hong Wang",
  title =        "{EXOCHI}: architecture and programming environment for
                 a heterogeneous multi-core multithreaded system",
  journal =      j-SIGPLAN,
  volume =       "42",
  number =       "6",
  pages =        "156--166",
  month =        jun,
  year =         "2007",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1250734.1250753",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:55:30 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Future mainstream microprocessors will likely
                 integrate specialized accelerators, such as GPUs, onto
                 a single die to achieve better performance and power
                 efficiency. However, it remains a keen challenge to
                 program such a heterogeneous multicore platform, since
                 these specialized accelerators feature ISAs and
                 functionality that are significantly different from the
                 general purpose CPU cores. In this paper, we present
                 EXOCHI: (1) Exoskeleton Sequencer (EXO), an
                 architecture to represent heterogeneous accelerators as
                 ISA-based MIMD architecture resources, and a shared
                 virtual memory heterogeneous multithreaded program
                 execution model that tightly couples specialized
                 accelerator cores with general-purpose CPU cores, and
                 (2) C for Heterogeneous Integration (CHI), an
                 integrated C/C++ programming environment that supports
                 accelerator-specific inline assembly and
                 domain-specific languages. The CHI compiler extends the
                 OpenMP pragma for heterogeneous multithreading
                 programming, and produces a single fat binary with code
                 sections corresponding to different instruction sets.
                 The runtime can judiciously spread parallel computation
                 across the heterogeneous cores to optimize performance
                 and power.\par

                 We have prototyped the EXO architecture on a physical
                 heterogeneous platform consisting of an Intel{\reg}
                 Core{\TM} 2 Duo processor and an 8-core 32-thread
                 Intel{\reg} Graphics Media Accelerator X3000. In
                 addition, we have implemented the CHI integrated
                 programming environment with the Intel{\reg} C++
                 Compiler, runtime toolset, and debugger. On the EXO
                 prototype system, we have enhanced a suite of
                 production-quality media kernels for video and image
                 processing to utilize the accelerator through the CHI
                 programming interface, achieving significant speedup
                 (1.41X to 10.97X) over execution on the IA32 CPU
                 alone.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "GPU; heterogeneous multi-cores; openMP",
}

@Article{Wang:2007:OSC,
  author =       "Qin Wang and Junpu Chen and Weihua Zhang and Min Yang
                 and Binyu Zang",
  title =        "Optimizing software cache performance of packet
                 processing applications",
  journal =      j-SIGPLAN,
  volume =       "42",
  number =       "7",
  pages =        "227--236",
  month =        jul,
  year =         "2007",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1273444.1254808",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:57:50 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Network processors (NPs) are widely used in many types
                 of networking equipment due to their high performance
                 and flexibility. For most NPs, software cache is used
                 instead of hardware cache due to the chip area, cost
                 and power constraints. Therefore, programmers should
                 take full responsibility for software cache management
                 which is neither intuitive nor easy to most of them.
                 Actually, without an effective use of it, long memory
                 access latency will be a critical limiting factor to
                 overall applications. Prior researches like hardware
                 multi-threading, wide-word accesses and packet access
                 combination for caching have already been applied to
                 help programmers to overcome this bottleneck. However,
                 most of them do not make enough use of the
                 characteristics of packet processing applications and
                 often perform intraprocedural optimizations only. As a
                 result, the binary codes generated by those techniques
                 often get lower performance than that comes from
                 hand-tuned assembly programming for some applications.
                 In this paper, we propose an algorithm including two
                 techniques --- Critical Path Based Analysis (CPBA) and
                 Global Adaptive Localization (GAL), to optimize the
                 software cache performance of packet processing
                 applications. Packet processing applications usually
                 have several hot paths and CPBA tries to insert
                 localization instructions according to their execution
                 frequencies. For further optimizations, GAL eliminates
                 some redundant localization instructions by
                 interprocedural analysis and optimizations. Our
                 algorithm is applied on some representative
                 applications. Experiment results show that it leads to
                 an average speedup by a factor of 1.974.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "local memory; network processor; optimization",
}

@Article{Yan:2007:HMC,
  author =       "Jun Yan and Wei Zhang",
  title =        "Hybrid multi-core architecture for boosting
                 single-threaded performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "141--148",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241603",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The scaling of technology and the diminishing return
                 of complicated uniprocessors have driven the industry
                 towards multicore processors. While multithreaded
                 applications can naturally leverage the enhanced
                 throughput of multi-core processors, a large number of
                 important applications are single-threaded, which
                 cannot automatically harness the potential of
                 multi-core processors. In this paper, we propose a
                 compiler-driven heterogeneous multicore architecture,
                 consisting of tightly-integrated VLIW (Very Long
                 Instruction Word) and superscalar processors on a
                 single chip, to automatically boost the performance of
                 single-threaded applications without compromising the
                 capability to support multithreaded programs. In the
                 proposed multi-core architecture, while the
                 high-performance VLIW core is used to run code segments
                 with high instruction-level parallelism (ILP) extracted
                 by the compiler; the superscalar core can be exploited
                 to deal with the runtime events that are typically
                 difficult for the VLIW core to handle, such as L2 cache
                 misses. Our initial experimental results by running the
                 preexecution thread on the superscalar core to mitigate
                 the L2 cache misses of the main thread on the VLIW core
                 indicate that the proposed VLIW/superscalar multi-core
                 processor can automatically improve the performance of
                 single-threaded general-purpose applications by up to
                 40.8\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Yang:2007:RUL,
  author =       "Jin-Min Yang and Da-Fang Zhang and Xue-Dong Yang and
                 Wen-Wei Li",
  title =        "Reliable user-level rollback recovery implementation
                 for multithreaded processes on {Windows}",
  journal =      j-SPE,
  volume =       "37",
  number =       "3",
  pages =        "331--346",
  month =        mar,
  year =         "2007",
  CODEN =        "SPEXBL",
  DOI =          "https://doi.org/10.1002/spe.771",
  ISSN =         "0038-0644 (print), 1097-024X (electronic)",
  ISSN-L =       "0038-0644",
  bibdate =      "Wed Oct 17 18:33:14 MDT 2007",
  bibsource =    "http://www.interscience.wiley.com/jpages/0038-0644;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Software---Practice and Experience",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
  onlinedate =   "24 Oct 2006",
}

@Article{Zebchuk:2007:BBC,
  author =       "J. Zebchuk and A. Moshovos",
  title =        "A Building Block for Coarse-Grain Optimizations in the
                 On-Chip Memory Hierarchy",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "2",
  pages =        "33--36",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Current on-chip block-centric memory hierarchies
                 exploit access patterns at the fine-grain scale of
                 small blocks. Several recently proposed memory
                 hierarchy enhancements for coherence traffic reduction
                 and prefetching suggest that additional useful patterns
                 emerge with a macroscopic, coarse-grain view. This
                 paper presents RegionTracker, a dual-grain, on-chip
                 cache design that exposes coarse-grain behavior while
                 maintaining block-level communication. RegionTracker
                 eliminates the extraneous, often imprecise coarse-grain
                 tracking structures of previous proposals. It can be
                 used as the building block for coarse-grain
                 optimizations, reducing their overall cost and easing
                 their adoption. Using full-system simulation of a
                 quad-core chip multiprocessor and commercial workloads,
                 we demonstrate that RegionTracker overcomes the
                 inefficiencies of previous coarse-grain cache designs.
                 We also demonstrate how RegionTracker boosts the
                 benefits and reduces the cost of a previously proposed
                 snoop reduction technique.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "access patterns; Bandwidth; cache storage; Cache
                 storage; coarse-grain optimizations; coherence traffic
                 reduction; Cost function; Design optimization;
                 Explosions; Information management; Memory management;
                 Multithreading; on-chip memory hierarchy; optimising
                 compilers; Prefetching; prefetching; Proposals;
                 quad-core chip multiprocessor; RegionTracker dual-grain
                 on-chip cache design; system-on-chip",
}

@Article{Abdulla:2008:MCR,
  author =       "Parosh Aziz Abdulla and Fr{\'e}d{\'e}ric Haziza and
                 Mats Kindahl",
  title =        "Model checking race-freeness",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "5",
  pages =        "72--79",
  month =        dec,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1556444.1556454",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 26 11:50:56 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "With the introduction of highly concurrent systems in
                 standard desktop computers, ensuring correctness of
                 industrial-size concurrent programs is becoming
                 increasingly important. One of the most important
                 standards in use for developing multi-threaded programs
                 is the POSIX Threads standard, commonly known as
                 PThreads. Of particular importance, the analysis of
                 industrial code should, as far as possible, be
                 automatic and not require annotations or other forms of
                 specifications of the code.\par

                 Model checking has been one of the most successful
                 approaches to program verification during the last two
                 decades. The size and complexity of applications which
                 can be handled have increased rapidly through
                 integration with symbolic techniques. These methods are
                 designed to work on finite (but large) state spaces.
                 This framework fails to deal with several essential
                 aspects of behaviours for multithreaded programs: there
                 is no bound a priori on the number of threads which may
                 arise in a given run of the system; each thread
                 manipulates local variables which often range over
                 unbounded domains; and the system has a dynamic
                 structure in the sense that threads can be created and
                 killed throughout execution of the system. In this
                 paper we concentrate on checking a particular class of
                 properties for concurrent programs, namely safety
                 properties. In particular, we focus on race-freeness,
                 that is, the absence of race conditions (also known as
                 data races) in shared-variable pthreaded
                 programs.\par

                 We will follow a particular methodology which we have
                 earlier developed for model checking general classes of
                 infinite-state systems [1, 3, 6, 8, 9] and apply a
                 symbolic backward reachability analysis to verify the
                 safety property. Since we construct a model as an
                 over-approximation of the original program, proving the
                 safety property in the model implies that the property
                 also holds in the original system. Surprisingly, it
                 leads to a quite efficient analysis which can be
                 carried out fully automatically.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Abraham:2008:DPS,
  author =       "Erika {\'A}brah{\'a}m and Frank S. de Boer and
                 Willem-Paul de Roever and Martin Steffen",
  title =        "A Deductive Proof System for Multithreaded {Java} with
                 Exceptions",
  journal =      j-FUND-INFO,
  volume =       "82",
  number =       "4",
  pages =        "391--463",
  month =        jul,
  year =         "2008",
  CODEN =        "FUMAAJ",
  ISSN =         "0169-2968 (print), 1875-8681 (electronic)",
  ISSN-L =       "0169-2968",
  bibdate =      "Sat Mar 5 17:06:39 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fundinfo2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Fundamenta Informaticae",
  journal-URL =  "http://content.iospress.com/journals/fundamenta-informaticae",
}

@Article{Adams:2008:ENE,
  author =       "Michael D. Adams and R. Kent Dybvig",
  title =        "Efficient nondestructive equality checking for trees
                 and graphs",
  journal =      j-SIGPLAN,
  volume =       "43",
  number =       "9",
  pages =        "179--188",
  month =        sep,
  year =         "2008",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1411203.1411230",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Sep 23 17:31:25 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The Revised$^6$ Report on Scheme requires its generic
                 equivalence predicate, equal?, to terminate even on
                 cyclic inputs. While the terminating equal? can be
                 implemented via a DFA-equivalence or union-find
                 algorithm, these algorithms usually require an
                 additional pointer to be stored in each object, are not
                 suitable for multithreaded code due to their
                 destructive nature, and may be unacceptably slow for
                 the small acyclic values that are the most likely
                 inputs to the predicate.\par

                 This paper presents a variant of the union-find
                 algorithm for equal? that addresses these issues. It
                 performs well on large and small, cyclic and acyclic
                 inputs by interleaving a low-overhead algorithm that
                 terminates only for acyclic inputs with a more general
                 algorithm that handles cyclic inputs. The algorithm
                 terminates for all inputs while never being more than a
                 small factor slower than whichever of the acyclic or
                 union-find algorithms would have been faster. Several
                 intermediate algorithms are also presented, each of
                 which might be suitable for use in a particular
                 application, though only the final algorithm is
                 suitable for use in a library procedure, like equal?,
                 that must work acceptably well for all inputs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "dfa equivalence; eq hash tables; equality; scheme;
                 union-find",
}

@Article{Agrawal:2008:AWS,
  author =       "Kunal Agrawal and Charles E. Leiserson and Yuxiong He
                 and Wen Jing Hsu",
  title =        "Adaptive work-stealing with parallelism feedback",
  journal =      j-TOCS,
  volume =       "26",
  number =       "3",
  pages =        "7:1--7:32",
  month =        sep,
  year =         "2008",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1394441.1394443",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Sep 17 14:28:13 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Multiprocessor scheduling in a shared multiprogramming
                 environment can be structured as two-level scheduling,
                 where a kernel-level job scheduler allots processors to
                 jobs and a user-level thread scheduler schedules the
                 work of a job on its allotted processors. We present a
                 randomized work-stealing thread scheduler for fork-join
                 multithreaded jobs that provides continual parallelism
                 feedback to the job scheduler in the form of requests
                 for processors. Our A-STEAL algorithm is appropriate
                 for large parallel servers where many jobs share a
                 common multiprocessor resource and in which the number
                 of processors available to a particular job may vary
                 during the job's execution. Assuming that the job
                 scheduler never allots a job more processors than
                 requested by the job's thread scheduler, A-STEAL
                 guarantees that the job completes in near-optimal time
                 while utilizing at least a constant fraction of the
                 allotted processors.\par

                 We model the job scheduler as the thread scheduler's
                 adversary, challenging the thread scheduler to be
                 robust to the operating environment as well as to the
                 job scheduler's administrative policies. For example,
                 the job scheduler might make a large number of
                 processors available exactly when the job has little
                 use for them. To analyze the performance of our
                 adaptive thread scheduler under this stringent
                 adversarial assumption, we introduce a new technique
                 called {\em trim analysis,\/} which allows us to prove
                 that our thread scheduler performs poorly on no more
                 than a small number of time steps, exhibiting
                 near-optimal behavior on the vast majority.\par

                 More precisely, suppose that a job has work $ T_1 $ and
                 span $ T_\infty $. On a machine with $P$ processors,
                 A-STEAL completes the job in an expected duration of $
                 O(T_1 / \tilde {P} + T_\infty + L \lg P)$ time steps,
                 where $L$ is the length of a scheduling quantum, and $
                 \tilde {P}$ denotes the $ O(T_\infty + L \lg
                 P)$-trimmed availability. This quantity is the average
                 of the processor availability over all time steps
                 except the $ O(T_\infty + L \lg P)$ time steps that
                 have the highest processor availability. When the job's
                 parallelism dominates the trimmed availability, that
                 is, $ \tilde {P} \ll T_1 / T_\infty $, the job achieves
                 nearly perfect linear speedup. Conversely, when the
                 trimmed mean dominates the parallelism, the asymptotic
                 running time of the job is nearly the length of its
                 span, which is optimal.\par

                 We measured the performance of A-STEAL on a simulated
                 multiprocessor system using synthetic workloads. For
                 jobs with sufficient parallelism, our experiments
                 confirm that A-STEAL provides almost perfect linear
                 speedup across a variety of processor availability
                 profiles. We compared A-STEAL with the ABP algorithm,
                 an adaptive work-stealing thread scheduler developed by
                 Arora et al. [1998] which does not employ parallelism
                 feedback. On moderately to heavily loaded machines with
                 large numbers of processors, A-STEAL typically
                 completed jobs more than twice as quickly as ABP,
                 despite being allotted the same number or fewer
                 processors on every step, while wasting only 10\% of
                 the processor cycles wasted by ABP.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "adaptive scheduling; adversary; instantaneous
                 parallelism; job scheduling; multiprocessing;
                 multiprogramming; parallel computation; parallelism
                 feedback; processor allocation; randomized algorithm;
                 space sharing; span; thread scheduling; trim analysis;
                 two-level scheduling; work; work-stealing",
}

@Article{Anderson:2008:SCD,
  author =       "Zachary Anderson and David Gay and Rob Ennals and Eric
                 Brewer",
  title =        "{SharC}: checking data sharing strategies for
                 multithreaded {C}",
  journal =      j-SIGPLAN,
  volume =       "43",
  number =       "6",
  pages =        "149--158",
  month =        jun,
  year =         "2008",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1379022.1375600",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Mar 11 17:33:54 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Unintended or unmediated data sharing is a frequent
                 cause of insidious bugs in multithreaded programs. We
                 present a tool called SharC (short for Sharing Checker)
                 that allows a user to write lightweight annotations to
                 declare how they believe objects are being shared
                 between threads in their program. SharC uses a
                 combination of static and dynamic analyses to check
                 that the program conforms to this
                 specification.\par

                 SharC allows any type to have one of five 'sharing
                 modes' -- private to the current thread, read-only,
                 shared under the control of a specified lock,
                 intentionally racy, or checked dynamically. The dynamic
                 mode uses run-time checking to verify that objects are
                 either read-only, or only accessed by one thread. This
                 allows us to check programs that would be difficult to
                 check with a purely static system. If the user does not
                 give a type an explicit annotation, then SharC uses a
                 static type-qualifier analysis to infer that it is
                 either private or should be checked
                 dynamically.\par

                 SharC allows objects to move between different sharing
                 modes at runtime by using reference counting to check
                 that there are no other references to the objects when
                 they change mode.\par

                 SharC's baseline dynamic analysis can check any C
                 program, but is slow, and will generate false warnings
                 about intentional data sharing. As the user adds more
                 annotations, false warnings are reduced, and
                 performance improves. We have found in practice that
                 very few annotations are needed to describe all sharing
                 and give reasonable performance. We ran SharC on 6
                 legacy C programs, summing to over 600k lines of code,
                 and found that a total of only 60 simple annotations
                 were needed to remove all false positives and to reduce
                 performance overhead to only 2-14\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "data-race",
}

@Article{Athanasaki:2008:EPL,
  author =       "Evangelia Athanasaki and Nikos Anastopoulos and
                 Kornilios Kourtis and Nectarios Koziris",
  title =        "Exploring the performance limits of simultaneous
                 multithreading for memory intensive applications",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "44",
  number =       "1",
  pages =        "64--97",
  month =        apr,
  year =         "2008",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-007-0149-x",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Jul 9 17:32:34 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=44&issue=1;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=44&issue=1&spage=64",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
  keywords =     "Instruction-level parallelism; Performance analysis;
                 Simultaneous multithreading; Software prefetching;
                 Speculative precomputation; Thread-level parallelism",
}

@Article{Auerbach:2008:FTG,
  author =       "Joshua Auerbach and David F. Bacon and Rachid
                 Guerraoui and Jesper Honig Spring and Jan Vitek",
  title =        "Flexible task graphs: a unified restricted thread
                 programming model for {Java}",
  journal =      j-SIGPLAN,
  volume =       "43",
  number =       "7",
  pages =        "1--11",
  month =        jul,
  year =         "2008",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1375657.1375659",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 11:05:54 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The disadvantages of unconstrained shared-memory
                 multi-threading in Java, especially with regard to
                 latency and determinism in realtime systems, have given
                 rise to a variety of language extensions that place
                 restrictions on how threads allocate, share, and
                 communicate memory, leading to order-of-magnitude
                 reductions in latency and jitter. However, each model
                 makes different trade-offs with respect to
                 expressiveness, efficiency, enforcement, and latency,
                 and no one model is best for all applications.\par

                 In this paper we present Flexible Task Graphs
                 (Flexotasks), a single system that allows different
                 isolation policies and mechanisms to be combined in an
                 orthogonal manner, subsuming four previously proposed
                 models as well as making it possible to use new
                 combinations best suited to the needs of particular
                 applications. We evaluate our implementation on top of
                 the IBM Web-Sphere Real Time Java virtual machine using
                 both a microbenchmark and a 30 KLOC avionics collision
                 detector. We show that Flexotasks are capable of
                 executing periodic threads at 10 KHz with a standard
                 deviation of 1.2$ \mu $ s and that it achieves
                 significantly better performance than RTSJ's scoped
                 memory constructs while remaining impervious to
                 interference from global garbage collection.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "Java Virtual Machine; memory management; ownership
                 types; real-time systems",
}

@Article{Bahmann:2008:EFK,
  author =       "Helge Bahmann and Konrad Froitzheim",
  title =        "Extending futex for kernel to user notification",
  journal =      j-OPER-SYS-REV,
  volume =       "42",
  number =       "5",
  pages =        "18--26",
  month =        jul,
  year =         "2008",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/1400097.1400100",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Wed Aug 6 16:54:12 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Threads in reactive applications need to service a
                 multitude of events from different sources such as
                 device drivers, communication channels or cooperating
                 threads. While notification about these events can
                 conceptually be understood as a form of
                 'synchronization', most operating systems (including
                 Linux) do not provide a unified abstraction. This paper
                 proposes to separate event delivery and notification,
                 and to provide unified event notification through
                 general-purpose synchronization objects. It
                 demonstrates how this unified mechanism can be
                 implemented in Linux as an extension of the futex
                 mechanism to allow notification from kernel-space.
                 Required modifications are discussed and their impact
                 is assessed. The new event notification mechanism
                 allows to move many thread activation policy decisions
                 into user-space, with benefits for multi-threaded
                 reactive applications: This is demonstrated in a
                 modification of the leader/followers pattern with
                 considerable performance benefits.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
  keywords =     "event notification; futex; leader {\&} followers;
                 synchronization",
}

@Article{Boehm:2008:FCC,
  author =       "Hans-J. Boehm and Sarita V. Adve",
  title =        "Foundations of the {C++} concurrency memory model",
  journal =      j-SIGPLAN,
  volume =       "43",
  number =       "6",
  pages =        "68--78",
  month =        jun,
  year =         "2008",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1379022.1375591",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 11:04:53 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Currently multi-threaded C or C++ programs combine a
                 single-threaded programming language with a separate
                 threads library. This is not entirely sound [7].\par

                 We describe an effort, currently nearing completion, to
                 address these issues by explicitly providing semantics
                 for threads in the next revision of the C++ standard.
                 Our approach is similar to that recently followed by
                 Java [25], in that, at least for a well-defined and
                 interesting subset of the language, we give
                 sequentially consistent semantics to programs that do
                 not contain data races. Nonetheless, a number of our
                 decisions are often surprising even to those familiar
                 with the Java effort:\par

                 We (mostly) insist on sequential consistency for
                 race-free programs, in spite of implementation issues
                 that came to light after the Java work.\par

                 We give no semantics to programs with data races. There
                 are no benign C++ data races.\par

                 We use weaker semantics for trylock than existing
                 languages or libraries, allowing us to promise
                 sequential consistency with an intuitive race
                 definition, even for programs with trylock.\par

                 This paper describes the simple model we would like to
                 be able to provide for C++ threads programmers, and
                 explain how this, together with some practical, but
                 often under-appreciated implementation constraints,
                 drives us towards the above decisions.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "c++; data race; memory consistency; memory model;
                 sequential consistency; trylock",
}

@Article{Boneti:2008:SCP,
  author =       "Carlos Boneti and Francisco J. Cazorla and Roberto
                 Gioiosa and Alper Buyuktosunoglu and Chen-Yong Cher and
                 Mateo Valero",
  title =        "Software-Controlled Priority Characterization of
                 {POWER5} Processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "415--426",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1109/ISCA.2008.8",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Due to the limitations of instruction-level
                 parallelism, thread-level parallelism has become a
                 popular way to improve processor performance. One
                 example is the IBM POWER5 processor, a two-context
                 simultaneous-multithreaded dual-core chip. In each SMT
                 core, the IBM POWER5 features two levels of thread
                 resource balancing and prioritization. The first level
                 provides automatic in-hardware resource balancing,
                 while the second level is a software-controlled
                 priority mechanism that presents eight levels of thread
                 priorities. Currently, software-controlled
                 prioritization is only used in limited number of cases
                 in the software platforms due to lack of performance
                 characterization of the effects of this mechanism. In
                 this work, we characterize the effects of the
                 software-based prioritization on several different
                 workloads. We show that the impact of the
                 prioritization significantly depends on the workloads
                 coscheduled on a core. By prioritizing the right task,
                 it is possible to obtain more than two times of
                 throughput improvement for synthetic workloads compared
                 to the baseline. We also present two application case
                 studies targeting two different performance metrics:
                 the first case study improves overall throughput by
                 23.7\% and the second case study reduces the total
                 execution time by 9.3\%. In addition, we show the
                 circumstances when a background thread can be run
                 transparently without affecting the performance of the
                 foreground thread.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  keywords =     "IBM POWER5; performance characterization; simultaneous
                 multithreading; SMT; software-controlled
                 prioritization",
}

@Article{Campanoni:2008:PDC,
  author =       "Simone Campanoni and Giovanni Agosta and Stefano
                 Crespi Reghizzi",
  title =        "A parallel dynamic compiler for {CIL} bytecode",
  journal =      j-SIGPLAN,
  volume =       "43",
  number =       "4",
  pages =        "11--20",
  month =        apr,
  year =         "2008",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1374752.1374754",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 11:04:46 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multi-core technology is being employed in most recent
                 high-performance architectures. Such architectures need
                 specifically designed multi-threaded software to
                 exploit all the potentialities of their hardware
                 parallelism.\par

                 At the same time, object code virtualization
                 technologies are achieving a growing popularity, as
                 they allow higher levels of software portability and
                 reuse.\par

                 Thus, a virtual execution environment running on a
                 multi-core processor has to run complex, high-level
                 applications and to exploit as much as possible the
                 underlying parallel hardware. We propose an approach
                 that leverages on CMP features to expose a novel
                 pipeline synchronization model for the internal threads
                 of the dynamic compiler.\par

                 Thanks to compilation latency masking effect of the
                 pipeline organization, our dynamic compiler, ILDJIT, is
                 able to achieve significant speedups (26\% on average)
                 with respect to the baseline, when the underlying
                 hardware exposes at least two cores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "dynamic compilation; parallel virtual machine; virtual
                 execution system",
}

@Article{Choi:2008:ABP,
  author =       "Bumyong Choi and Leo Porter and Dean M. Tullsen",
  title =        "Accurate branch prediction for short threads",
  journal =      j-OPER-SYS-REV,
  volume =       "42",
  number =       "2",
  pages =        "125--134",
  month =        mar,
  year =         "2008",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/1353534.1346298",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Fri Jun 20 17:20:12 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/opersysrev.bib",
  abstract =     "Multi-core processors, with low communication costs
                 and high availability of execution cores, will increase
                 the use of execution and compilation models that use
                 short threads to expose parallelism. Current branch
                 predictors seek to incorporate large amounts of control
                 flow history to maximize accuracy. However, when that
                 history is absent the predictor fails to work as
                 intended. Thus, modern predictors are almost useless
                 for threads below a certain length.\par

                 Using a Speculative Multithreaded (SpMT) architecture
                 as an example of a system which generates shorter
                 threads, this work examines techniques to improve
                 branch prediction accuracy when a new thread begins to
                 execute on a different core. This paper proposes a
                 minor change to the branch predictor that gives
                 virtually the same performance on short threads as an
                 idealized predictor that incorporates unknowable
                 pre-history of a spawned speculative thread. At the
                 same time, strong performance on long threads is
                 preserved. The proposed technique sets the global
                 history register of the spawned thread to the initial
                 value of the program counter. This novel and simple
                 design reduces branch mispredicts by 29\% and provides
                 as much as a 13\% IPC improvement on selected SPEC2000
                 benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
  keywords =     "branch prediction; chip multiprocessors",
}

@Article{Chugh:2008:DAC,
  author =       "Ravi Chugh and Jan W. Voung and Ranjit Jhala and Sorin
                 Lerner",
  title =        "Dataflow analysis for concurrent programs using
                 datarace detection",
  journal =      j-SIGPLAN,
  volume =       "43",
  number =       "6",
  pages =        "316--326",
  month =        jun,
  year =         "2008",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1375581.1375620",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 11:04:53 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Dataflow analyses for concurrent programs differ from
                 their single-threaded counterparts in that they must
                 account for shared memory locations being overwritten
                 by concurrent threads. Existing dataflow analysis
                 techniques for concurrent programs typically fall at
                 either end of a spectrum: at one end, the analysis
                 conservatively kills facts about all data that might
                 possibly be shared by multiple threads; at the other
                 end, a precise thread-interleaving analysis determines
                 which data may be shared, and thus which dataflow facts
                 must be invalidated. The former approach can suffer
                 from imprecision, whereas the latter does not
                 scale.\par

                 We present RADAR, a framework that automatically
                 converts a dataflow analysis for sequential programs
                 into one that is correct for concurrent programs. RADAR
                 uses a race detection engine to kill the dataflow
                 facts, generated and propagated by the sequential
                 analysis, that become invalid due to concurrent writes.
                 Our approach of factoring all reasoning about
                 concurrency into a race detection engine yields two
                 benefits. First, to obtain analyses for code using new
                 concurrency constructs, one need only design a suitable
                 race detection engine for the constructs. Second, it
                 gives analysis designers an easy way to tune the
                 scalability and precision of the overall analysis by
                 only modifying the race detection engine. We describe
                 the RADAR framework and its implementation using a
                 pre-existing race detection engine. We show how RADAR
                 was used to generate a concurrent version of a
                 null-pointer dereference analysis, and we analyze the
                 result of running the generated concurrent analysis on
                 several benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "interprocedural analysis; locksets; multithreaded
                 programs; summaries",
}

@Article{Curtis-Maury:2008:PBP,
  author =       "Matthew Curtis-Maury and Filip Blagojevic and Christos
                 D. Antonopoulos and Dimitrios S. Nikolopoulos",
  title =        "Prediction-Based Power-Performance Adaptation of
                 Multithreaded Scientific Codes",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "19",
  number =       "10",
  pages =        "1396--1410",
  month =        oct,
  year =         "2008",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2007.70804",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu May 13 12:06:56 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Fekete:2008:TSD,
  author =       "Alan D. Fekete",
  title =        "Teaching students to develop thread-safe {Java}
                 classes",
  journal =      j-SIGCSE,
  volume =       "40",
  number =       "3",
  pages =        "119--123",
  month =        sep,
  year =         "2008",
  CODEN =        "SIGSD3",
  DOI =          "https://doi.org/10.1145/1597849.1384304",
  ISSN =         "0097-8418 (print), 2331-3927 (electronic)",
  ISSN-L =       "0097-8418",
  bibdate =      "Sat Nov 17 15:44:14 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/csharp.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigcse2000.bib",
  note =         "Proceedings of ITiCSE '08.",
  abstract =     "Concurrent programming was once the preserve of
                 experts writing systems internals; but recently the
                 growing importance of application servers, and the
                 excellent support in Java and C\# for thread handling,
                 has brought threads and locking as topics that every
                 software developer might experience, and therefore
                 every computer science graduate ought to know. In this
                 paper we report on several years of experience teaching
                 this material in the early years of the curriculum. We
                 focus on one aspect of multi-threaded code, namely how
                 to write sensible thread-safe classes. We identify the
                 learning outcomes we aim to deliver, and we discuss the
                 main pedagogic difficulties students find. We present
                 some examples that can help students avoid common
                 erroneous views.",
  acknowledgement = ack-nhfb,
  fjournal =     "SIGCSE Bulletin (ACM Special Interest Group on
                 Computer Science Education)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J688",
}

@Article{Fide:2008:PUS,
  author =       "S. Fide and S. Jenks",
  title =        "Proactive Use of Shared {L3} Caches to Enhance Cache
                 Communications in Multi-Core Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "57--60",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.10",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The software and hardware techniques to exploit the
                 potential of multi-core processors are falling behind,
                 even though the number of cores and cache levels per
                 chip is increasing rapidly. There is no explicit
                 communications support available, and hence inter-core
                 communications depend on cache coherence protocols,
                 resulting in demand-based cache line transfers with
                 their inherent latency and overhead. In this paper, we
                 present software controlled eviction (SCE) to improve
                 the performance of multithreaded applications running
                 on multi-core processors by moving shared data to
                 shared cache levels before it is demanded from remote
                 private caches. Simulation results show that SCE offers
                 significant performance improvement (8-28\%) and
                 reduces L3 cache misses by 88-98\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache coherence protocol; cache communication; cache
                 storage; Concurrent computing; Control systems;
                 Degradation; Delay; demand-based cache line transfer;
                 Hardware; intercore communications; microprocessor
                 chips; Multi-core/single-chip multiprocessors;
                 multi-threading; Multicore processing; multicore
                 processors; multithreaded application; Parallel
                 processing; Protocols; shared L3 cache; shared memory
                 systems; software controlled eviction; Software
                 performance; Support for multi-threaded execution",
}

@Article{Flanagan:2008:ADA,
  author =       "Cormac Flanagan and Stephen N. Freund",
  title =        "{Atomizer}: a dynamic atomicity checker for
                 multithreaded programs",
  journal =      j-SCI-COMPUT-PROGRAM,
  volume =       "71",
  number =       "2",
  pages =        "89--109",
  day =          "1",
  month =        apr,
  year =         "2008",
  CODEN =        "SCPGD4",
  ISSN =         "0167-6423 (print), 1872-7964 (electronic)",
  ISSN-L =       "0167-6423",
  bibdate =      "Fri Apr 1 18:39:19 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/01676423",
  acknowledgement = ack-nhfb,
  fjournal =     "Science of Computer Programming",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01676423",
}

@Article{Flanagan:2008:TAS,
  author =       "Cormac Flanagan and Stephen N. Freund and Marina
                 Lifshin and Shaz Qadeer",
  title =        "Types for atomicity: {Static} checking and inference
                 for {Java}",
  journal =      j-TOPLAS,
  volume =       "30",
  number =       "4",
  pages =        "20:1--20:52",
  month =        jul,
  year =         "2008",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/1377492.1377495",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Tue Aug 5 19:14:53 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Atomicity is a fundamental correctness property in
                 multithreaded programs. A method is atomic if, for
                 every execution, there is an equivalent serial
                 execution in which the actions of the method are not
                 interleaved with actions of other threads. Atomic
                 methods are amenable to sequential reasoning, which
                 significantly facilitates subsequent analysis and
                 verification.\par

                 This article presents a type system for specifying and
                 verifying the atomicity of methods in multithreaded
                 Java programs using a synthesis of Lipton's theory of
                 reduction and type systems for race detection. The type
                 system supports guarded, write-guarded, and unguarded
                 fields, as well as thread-local data, parameterized
                 classes and methods, and protected locks. We also
                 present an algorithm for verifying atomicity via type
                 inference.\par

                 We have applied our type checker and type inference
                 tools to a number of commonly used Java library classes
                 and programs. These tools were able to verify the vast
                 majority of methods in these benchmarks as atomic,
                 indicating that atomicity is a widespread methodology
                 for multithreaded programming. In addition, reported
                 atomicity violations revealed some subtle errors in the
                 synchronization disciplines of these programs.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
  keywords =     "Atomicity; concurrent programs; type inference; type
                 systems",
}

@Article{Flanagan:2008:VSC,
  author =       "Cormac Flanagan and Stephen N. Freund and Jaeheon Yi",
  title =        "{Velodrome}: a sound and complete dynamic atomicity
                 checker for multithreaded programs",
  journal =      j-SIGPLAN,
  volume =       "43",
  number =       "6",
  pages =        "293--303",
  month =        jun,
  year =         "2008",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1375581.1375618",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 11:04:53 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Atomicity is a fundamental correctness property in
                 multithreaded programs, both because atomic code blocks
                 are amenable to sequential reasoning (which
                 significantly simplifies correctness arguments), and
                 because atomicity violations often reveal defects in a
                 program's synchronization structure. Unfortunately, all
                 atomicity analyses developed to date are incomplete in
                 that they may yield false alarms on correctly
                 synchronized programs, which limits their
                 usefulness.\par

                 We present the first dynamic analysis for atomicity
                 that is both sound and complete. The analysis reasons
                 about the exact dependencies between operations in the
                 observed trace of the target program, and it reports
                 error messages if and only if the observed trace is not
                 conflict-serializable. Despite this significant
                 increase in precision, the performance and coverage of
                 our analysis is competitive with earlier incomplete
                 dynamic analyses for atomicity.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "atomicity; dynamic analysis; serializability",
}

@Article{Gidenstam:2008:LLF,
  author =       "Anders Gidenstam and Marina Papatriantafilou",
  title =        "{LFTHREADS}: a lock-free thread library",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "5",
  pages =        "88--92",
  month =        dec,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1556444.1556456",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 26 11:50:56 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This extended abstract presents LFTHREADS, a thread
                 library entirely based on lock-free methods, i.e. no
                 spinlocks or similar synchronization mechanisms are
                 employed in the implementation of the multithreading.
                 Since lockfreedom is highly desirable in
                 multiprocessors/multicores due to its advantages in
                 parallelism, fault-tolerance, convoy-avoidance and
                 more, there is an increased demand in lock-free methods
                 in parallel applications, hence also in
                 multiprocessor/multicore system services. LFTHREADS is
                 the first thread library that provides a lock-free
                 implementation of blocking synchronization primitives
                 for application threads; although the latter may sound
                 like a contradicting goal, such objects have several
                 benefits: e.g. library operations that block and
                 unblock threads on the same synchronization object can
                 make progress in parallel while maintaining the desired
                 thread-level semantics and without having to wait for
                 any 'low' operations among them. Besides, as no
                 spin-locks or similar synchronization mechanisms are
                 employed, memory contention can be reduced and
                 processors/cores are able to do useful work. As a
                 consequence, applications, too, can enjoy enhanced
                 parallelism and fault-tolerance. For the
                 synchronization in LFTHREADS we have introduced a new
                 method, which we call responsibility hand-off (RHO),
                 that does not need any special kernel support. The RHO
                 method is also of independent interest, as it can also
                 serve as a tool for lock-free token passing, management
                 of contention and interaction between scheduling and
                 synchronization. This paper gives an outline and the
                 context of LFTHREADS. For more details the reader is
                 referred to [7] and [8].",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Gravvanis:2008:JMB,
  author =       "George A. Gravvanis and Victor N. Epitropou",
  title =        "{Java} multithreading-based parallel approximate
                 arrow-type inverses",
  journal =      j-CCPE,
  volume =       "20",
  number =       "10",
  pages =        "1151--1172",
  month =        jul,
  year =         "2008",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1262",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:25 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "18 Sep 2007",
}

@Article{Hassanein:2008:AEH,
  author =       "Wessam M. Hassanein and Layali K. Rashid and Moustafa
                 A. Hammad",
  title =        "Analyzing the Effects of Hyperthreading on the
                 Performance of Data Management Systems",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "36",
  number =       "2",
  pages =        "206--225",
  month =        apr,
  year =         "2008",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-007-0066-x",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 9 16:07:03 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=2;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=2&spage=206",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  keywords =     "Data management systems; Databases; Hyper-threaded
                 architectures; Performance; Simultaneous
                 multithreading",
}

@Article{He:2008:COD,
  author =       "Bingsheng He and Qiong Luo",
  title =        "Cache-oblivious databases: {Limitations} and
                 opportunities",
  journal =      j-TODS,
  volume =       "33",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2008",
  CODEN =        "ATDSD3",
  DOI =          "https://doi.org/10.1145/1366102.1366105",
  ISSN =         "0362-5915 (print), 1557-4644 (electronic)",
  ISSN-L =       "0362-5915",
  bibdate =      "Wed Jun 25 08:39:17 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tods/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Cache-oblivious techniques, proposed in the theory
                 community, have optimal asymptotic bounds on the amount
                 of data transferred between any two adjacent levels of
                 an arbitrary memory hierarchy. Moreover, this optimal
                 performance is achieved without any hardware platform
                 specific tuning. These properties are highly attractive
                 to autonomous databases, especially because the
                 hardware architectures are becoming increasingly
                 complex and diverse.\par

                 In this article, we present our design, implementation,
                 and evaluation of the first cache-oblivious in-memory
                 query processor, EaseDB. Moreover, we discuss the
                 inherent limitations of the cache-oblivious approach as
                 well as the opportunities given by the upcoming
                 hardware architectures. Specifically, a cache-oblivious
                 technique usually requires sophisticated algorithm
                 design to achieve a comparable performance to its
                 cache-conscious counterpart. Nevertheless, this
                 development-time effort is compensated by the
                 automaticity of performance achievement and the reduced
                 ownership cost. Furthermore, this automaticity enables
                 cache-oblivious techniques to outperform their
                 cache-conscious counterparts in multi-threading
                 processors.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Database Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J777",
  keywords =     "cache-conscious; cache-oblivious; chip
                 multiprocessors; data caches; simultaneous
                 multithreading",
}

@Article{Jacobs:2008:PMC,
  author =       "Bart Jacobs and Frank Piessens and Jan Smans and K.
                 Rustan M. Leino and Wolfram Schulte",
  title =        "A programming model for concurrent object-oriented
                 programs",
  journal =      j-TOPLAS,
  volume =       "31",
  number =       "1",
  pages =        "1:1--1:48",
  month =        dec,
  year =         "2008",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/1452044.1452045",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Tue Dec 23 11:52:52 MST 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Reasoning about multithreaded object-oriented programs
                 is difficult, due to the nonlocal nature of object
                 aliasing and data races. We propose a programming
                 regime (or {\em programming model\/}) that rules out
                 data races, and enables local reasoning in the presence
                 of object aliasing and concurrency. Our programming
                 model builds on the multithreading and synchronization
                 primitives as they are present in current mainstream
                 programming languages. Java or C\# programs developed
                 according to our model can be annotated by means of
                 stylized comments to make the use of the model
                 explicit. We show that such annotated programs can be
                 formally verified to comply with the programming model.
                 If the annotated program verifies, the underlying Java
                 or C\# program is guaranteed to be free from data
                 races, and it is sound to reason locally about program
                 behavior. Verification is modular: a program is valid
                 if all methods are valid, and validity of a method does
                 not depend on program elements that are not visible to
                 the method. We have implemented a verifier for programs
                 developed according to our model in a custom build of
                 the Spec\# programming system, and we have validated
                 our approach on a case study.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
  keywords =     "Aliasing; data races; local reasoning; modular
                 reasoning; ownership; verification condition
                 generation",
}

@Article{Jaisson:2008:IPM,
  author =       "Pascal Jaisson and Florian {De Vuyst}",
  title =        "An innovating {PDE} model based on fluid flow paradigm
                 for multithread systems",
  journal =      j-COMP-NET-AMSTERDAM,
  volume =       "52",
  number =       "18",
  pages =        "3318--3324",
  day =          "22",
  month =        dec,
  year =         "2008",
  CODEN =        "????",
  ISSN =         "1389-1286 (print), 1872-7069 (electronic)",
  ISSN-L =       "1389-1286",
  bibdate =      "Sat Apr 2 08:42:29 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/13891286",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Networks (Amsterdam, Netherlands: 1999)",
  journal-URL =  "http://www.sciencedirect.com/science/journal/13891286",
}

@Article{Kang:2008:ISE,
  author =       "Dongsoo Kang and Chen Liu and Jean-Luc Gaudiot",
  title =        "The Impact of Speculative Execution on {SMT}
                 Processors",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "36",
  number =       "4",
  pages =        "361--385",
  month =        aug,
  year =         "2008",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-007-0052-3",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 9 16:07:14 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=4;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=4&spage=361",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  keywords =     "Confidence estimator; Simultaneous multithreading;
                 Speculation control; Thread scheduling",
}

@Article{Kgil:2008:PUS,
  author =       "Taeho Kgil and Ali Saidi and Nathan Binkert and Steve
                 Reinhardt and Krisztian Flautner and Trevor Mudge",
  title =        "{PicoServer}: {Using} {$3$D} stacking technology to
                 build energy efficient servers",
  journal =      j-JETC,
  volume =       "4",
  number =       "4",
  pages =        "16:1--16:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1412587.1412589",
  ISSN =         "1550-4832",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:22:55 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This article extends our prior work to show that a
                 straightforward use of 3D stacking technology enables
                 the design of compact energy-efficient servers. Our
                 proposed architecture, called PicoServer, employs 3D
                 technology to bond one die containing several simple,
                 slow processing cores to multiple memory dies
                 sufficient for a primary memory. The multiple memory
                 dies are composed of DRAM. This use of 3D stacks
                 readily facilitates wide low-latency buses between
                 processors and memory. These remove the need for an L2
                 cache allowing its area to be re-allocated to
                 additional simple cores. The additional cores allow the
                 clock frequency to be lowered without impairing
                 throughput. Lower clock frequency means that thermal
                 constraints, a concern with 3D stacking, are easily
                 satisfied. We extend our original analysis on
                 PicoServer to include: (1) a wider set of server
                 workloads, (2) the impact of multithreading, and (3)
                 the on-chip DRAM architecture and system memory usage.
                 PicoServer is intentionally simple, requiring only the
                 simplest form of 3D technology where die are stacked on
                 top of one another. Our intent is to minimize risk of
                 introducing a new technology (3D) to implement a class
                 of low-cost, low-power compact server architectures.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "3D stacking technology; chip multiprocessor;
                 full-system simulation; Low power; Tier-1/2/3 server",
}

@Article{Krashinsky:2008:ISV,
  author =       "Ronny Krashinsky and Christopher Batten and Krste
                 Asanovi{\'c}",
  title =        "Implementing the {Scale} vector-thread processor",
  journal =      j-TODAES,
  volume =       "13",
  number =       "3",
  pages =        "41:1--41:??",
  month =        jul,
  year =         "2008",
  CODEN =        "ATASFO",
  DOI =          "https://doi.org/10.1145/1367045.1367050",
  ISSN =         "1084-4309 (print), 1557-7309 (electronic)",
  ISSN-L =       "1084-4309",
  bibdate =      "Tue Aug 5 18:41:27 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/todaes/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The Scale vector-thread processor is a
                 complexity-effective solution for embedded computing
                 which flexibly supports both vector and highly
                 multithreaded processing. The 7.1-million transistor
                 chip has 16 decoupled execution clusters, vector load
                 and store units, and a nonblocking 32KB cache. An
                 automated and iterative design and verification flow
                 enabled a performance-, power-, and area-efficient
                 implementation with two person-years of development
                 effort. Scale has a core area of 16.6 mm$^2$ in 180 nm
                 technology, and it consumes 400 mW--1.1 W while running
                 at 260 MHz.",
  acknowledgement = ack-nhfb,
  articleno =    "41",
  fjournal =     "ACM Transactions on Design Automation of Electronic
                 Systems (TODAES)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J776",
  keywords =     "hybrid C++/Verilog simulation; iterative VLSI design
                 flow; multithreaded processors; procedural datapath
                 pre-placement; vector processors; vector-thread
                 processors",
}

@Article{Kumar:2008:AVO,
  author =       "Sanjeev Kumar and Daehyun Kim and Mikhail Smelyanskiy
                 and Yen-Kuang Chen and Jatin Chhugani and Christopher
                 J. Hughes and Changkyu Kim and Victor W. Lee and
                 Anthony D. Nguyen",
  title =        "Atomic Vector Operations on Chip Multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "441--452",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382154",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The current trend is for processors to deliver
                 dramatic improvements in parallel performance while
                 only modestly improving serial performance. Parallel
                 performance is harvested through vector/SIMD
                 instructions as well as multithreading (through both
                 multithreaded cores and chip multiprocessors). Vector
                 parallelism can be more efficiently supported than
                 multithreading, but is often harder for software to
                 exploit. In particular, code with sparse data access
                 patterns cannot easily utilize the vector/SIMD
                 instructions of mainstream processors. Hardware to
                 scatter and gather sparse data has previously been
                 proposed to enable vector execution for these codes.
                 However, on multithreaded architectures, a number of
                 applications spend significant time on atomic
                 operations (e.g., parallel reductions), which cannot be
                 vectorized using previously proposed schemes. This
                 paper proposes architectural support for atomic vector
                 operations (referred to as GLSC) that addresses this
                 limitation. GLSC extends scatter-gather hardware to
                 support atomic memory operations. Our experiments show
                 that the GLSC provides an average performance
                 improvement on a set of important RMS kernels of 54\%
                 for 4-wide SIMD.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  keywords =     "locks; multiprocessors; reductions; SIMD; vector",
}

@Article{Li:2008:TAN,
  author =       "Z. Li and C. Zhu and L. Shang and R. Dick and Y. Sun",
  title =        "Transaction-Aware Network-on-Chip Resource
                 Reservation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "53--56",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Performance and scalability are critically-important
                 for on-chip interconnect in many-core
                 chip-multiprocessor systems. Packet-switched
                 interconnect fabric, widely viewed as the de facto
                 on-chip data communication backplane in the many-core
                 era, offers high throughput and excellent scalability.
                 However, these benefits come at the price of router
                 latency due to run-time multi-hop data buffering and
                 resource arbitration. The network accounts for a
                 majority of on-chip data transaction latency. In this
                 work, we propose dynamic in-network resource
                 reservation techniques to optimize run-time on-chip
                 data transactions. This idea is motivated by the need
                 to preserve existing abstraction and general-purpose
                 network performance while optimizing for
                 frequently-occurring network events such as data
                 transactions. Experimental studies using multithreaded
                 benchmarks demonstrate that the proposed techniques can
                 reduce on-chip data access latency by 28.4\% on average
                 in a 16-node system and 29.2\% on average in a 36-node
                 system.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Backplanes; buffer storage; Computer buffers; data
                 communication; Data communication; de facto on-chip
                 data communication backplane; Delay; dynamic in-network
                 resource reservation techniques; Fabrics;
                 frequently-occurring network events; Interconnection
                 architectures; Interconnections (Subsystems); many-core
                 chip-multiprocessor systems; multiprocessor
                 interconnection networks; Network-on-a-chip; on-chip
                 data transaction latency; On-chip interconnection
                 networks; packet switching; packet-switched
                 interconnect fabric; Parallel Architectures; resource
                 allocation; router latency; run-time multihop data
                 buffering; Runtime; Scalability; System-on-a-chip;
                 telecommunication network routing; Throughput;
                 transaction-aware network-on-chip resource
                 reservation",
}

@Article{Liu:2008:HPP,
  author =       "Duo Liu and Zheng Chen and Bei Hua and Nenghai Yu and
                 Xinan Tang",
  title =        "High-performance packet classification algorithm for
                 multithreaded {IXP} network processor",
  journal =      j-TECS,
  volume =       "7",
  number =       "2",
  pages =        "16:1--16:??",
  month =        feb,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1331331.1331340",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Jun 12 15:22:00 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Packet classification is crucial for the Internet to
                 provide more value-added services and guaranteed
                 quality of service. Besides hardware-based solutions,
                 many software-based classification algorithms have been
                 proposed. However, classifying at 10 Gbps speed or
                 higher is a challenging problem and it is still one of
                 the performance bottlenecks in core routers. In
                 general, classification algorithms face the same
                 challenge of balancing between high classification
                 speed and low memory requirements. This paper proposes
                 a modified recursive flow classification (RFC)
                 algorithm, Bitmap-RFC, which significantly reduces the
                 memory requirements of RFC by applying a bitmap
                 compression technique. To speed up classifying speed,
                 we exploit the multithreaded architectural features in
                 various algorithm development stages from algorithm
                 design to algorithm implementation. As a result,
                 Bitmap-RFC strikes a good balance between speed and
                 space. It can significantly keep both high
                 classification speed and reduce memory space
                 consumption. This paper investigates the main NPU
                 software design aspects that have dramatic performance
                 impacts on any NPU-based implementations: memory space
                 reduction, instruction selection, data allocation, task
                 partitioning, and latency hiding. We experiment with an
                 architecture-aware design principle to guarantee the
                 high performance of the classification algorithm on an
                 NPU implementation. The experimental results show that
                 the Bitmap-RFC algorithm achieves 10 Gbps speed or
                 higher and has a good scalability on Intel IXP2800
                 NPU.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
  keywords =     "architecture; embedded system design; multithreading;
                 network processor; packet classification; thread-level
                 parallelism",
}

@Article{Madriles:2008:MSM,
  author =       "Carlos Madriles and Carlos Garc{\'\i}a-Qui{\~n}ones
                 and Jes{\'u}s S{\'a}nchez and Pedro Marcuello and
                 Antonio Gonz{\'a}lez and Dean M. Tullsen and Hong Wang
                 and John P. Shen",
  title =        "{Mitosis}: a Speculative Multithreaded Processor Based
                 on Precomputation Slices",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "19",
  number =       "7",
  pages =        "914--925",
  month =        jul,
  year =         "2008",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2007.70797",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Jul 3 12:41:00 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Montesinos:2008:DRD,
  author =       "Pablo Montesinos and Luis Ceze and Josep Torrellas",
  title =        "{DeLorean}: Recording and Deterministically Replaying
                 Shared-Memory Multiprocessor Execution Efficiently",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "289--300",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1109/ISCA.2008.36",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Support for deterministic replay of multithreaded
                 execution can greatly help in finding concurrency bugs.
                 For highest effectiveness, replay schemes should (i)
                 record at production-run speed, (ii) keep their logging
                 requirements minute, and (iii) replay at a speed
                 similar to that of the initial execution. In this
                 paper, we propose a new substrate for deterministic
                 replay that provides substantial advances along these
                 axes. In our proposal, processors execute blocks of
                 instructions atomically, as in transactional memory or
                 speculative multithreading, and the system only needs
                 to record the commit order of these blocks. We call our
                 scheme DeLorean. Our results show that DeLorean records
                 execution at a speed similar to that of Release
                 Consistency (RC) execution and replays at about 82\% of
                 its speed. In contrast, most current schemes only
                 record at the speed of Sequential Consistency (SC)
                 execution. Moreover, DeLorean only needs 7.5\% of the
                 log size needed by a state-of-the-art scheme. Finally,
                 DeLorean can be configured to need only 0.6\% of the
                 log size of the state-of-the-art scheme at the cost of
                 recording at 86\% of RC's execution speed --- still
                 faster than SC. In this configuration, the log of an
                 8-processor 5-GHz machine is estimated to be only about
                 20GB per day.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Musuvathi:2008:FSM,
  author =       "Madanlal Musuvathi and Shaz Qadeer",
  title =        "Fair stateless model checking",
  journal =      j-SIGPLAN,
  volume =       "43",
  number =       "6",
  pages =        "362--371",
  month =        jun,
  year =         "2008",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1379022.1375625",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 11:04:53 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Stateless model checking is a useful state-space
                 exploration technique for systematically testing
                 complex real-world software. Existing stateless model
                 checkers are limited to the verification of safety
                 properties on terminating programs. However, realistic
                 concurrent programs are nonterminating, a property that
                 significantly reduces the efficacy of stateless model
                 checking in testing them. Moreover, existing stateless
                 model checkers are unable to verify that a
                 nonterminating program satisfies the important liveness
                 property of livelock-freedom, a property that requires
                 the program to make continuous progress for any
                 input.\par

                 To address these shortcomings, this paper argues for
                 incorporating a fair scheduler in stateless
                 exploration. The key contribution of this paper is an
                 explicit scheduler that is (strongly) fair and at the
                 same time sufficiently nondeterministic to guarantee
                 full coverage of safety properties. We have implemented
                 the fair scheduler in the CHESS model checker. We show
                 through theoretical arguments and empirical evaluation
                 that our algorithm satisfies two important properties:
                 (1) it visits all states of a finite-state program
                 achieving state coverage at a faster rate than existing
                 techniques, and (2) it finds all livelocks in a
                 finite-state program. Before this work, nonterminating
                 programs had to be manually modified in order to apply
                 CHESS to them. The addition of fairness has allowed
                 CHESS to be effectively applied to real-world
                 nonterminating programs without any modification. For
                 example, we have successfully booted the Singularity
                 operating system under the control of CHESS.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "concurrency; fairness; liveness; model checking;
                 multi-threading; shared-memory programs; software
                 testing",
}

@Article{Neamtiu:2008:CEV,
  author =       "Iulian Neamtiu and Michael Hicks and Jeffrey S. Foster
                 and Polyvios Pratikakis",
  title =        "Contextual effects for version-consistent dynamic
                 software updating and safe concurrent programming",
  journal =      j-SIGPLAN,
  volume =       "43",
  number =       "1",
  pages =        "37--49",
  month =        jan,
  year =         "2008",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1328897.1328447",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 11:02:13 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper presents a generalization of standard
                 effect systems that we call contextual effects. A
                 traditional effect system computes the effect of an
                 expression e. Our system additionally computes the
                 effects of the computational context in which $e$
                 occurs. More specifically, we compute the effect of the
                  computation that has already occurred (the prior effect)
                 and the effect of the computation yet to take place
                 (the future effect).\par

                 Contextual effects are useful when the past or future
                 computation of the program is relevant at various
                 program points. We present two substantial examples.
                 First, we show how prior and future effects can be used
                 to enforce transactional version consistency (TVC), a
                 novel correctness property for dynamic software
                  updates. TVC ensures that programmer-designated
                 transactional code blocks appear to execute entirely at
                 the same code version, even if a dynamic update occurs
                 in the middle of the block. Second, we show how future
                 effects can be used in the analysis of multi-threaded
                 programs to find thread-shared locations. This is an
                 essential step in applications such as data race
                 detection.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "computation effects; contextual effects; data race
                 detection; dynamic software updating; type and effect
                 systems; version consistency",
}

@Article{Ottoni:2008:COGa,
  author =       "Guilherme Ottoni and David I. August",
  title =        "Communication optimizations for global multi-threaded
                 instruction scheduling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "222--232",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1353535.1346310",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The recent shift in the industry towards chip
                 multiprocessor (CMP) designs has brought the need for
                 multi-threaded applications to mainstream computing. As
                 observed in several limit studies, most of the
                 parallelization opportunities require looking for
                 parallelism beyond local regions of code. To exploit
                 these opportunities, especially for sequential
                 applications, researchers have recently proposed global
                 multi-threaded instruction scheduling techniques,
                 including DSWP and GREMIO. These techniques
                 simultaneously schedule instructions from large regions
                 of code, such as arbitrary loop nests or whole
                 procedures, and have been shown to be effective at
                 extracting threads for many applications. A key enabler
                 of these global instruction scheduling techniques is
                 the Multi-Threaded Code Generation (MTCG) algorithm
                 proposed in [16], which generates multi-threaded code
                 for any partition of the instructions into threads.
                 This algorithm inserts communication and
                 synchronization instructions in order to satisfy all
                 inter-thread dependences.\par

                 In this paper, we present a general compiler framework,
                 COCO, to optimize the communication and synchronization
                 instructions inserted by the MTCG algorithm. This
                 framework, based on thread-aware data-flow analyses and
                 graph min-cut algorithms, appropriately models and
                 optimizes all kinds of inter-thread dependences,
                 including register, memory, and control dependences.
                 Our experiments, using a fully automatic compiler
                 implementation of these techniques, demonstrate
                 significant reductions (about 30\% on average) in the
                 number of dynamic communication instructions in code
                 parallelized with DSWP and GREMIO. This reduction in
                 communication translates to performance gains of up to
                 40\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  keywords =     "communication; data-flow analysis; graph min-cut;
                 instruction scheduling; multi-threading;
                 synchronization",
}

@Article{Ottoni:2008:COGb,
  author =       "Guilherme Ottoni and David I. August",
  title =        "Communication optimizations for global multi-threaded
                 instruction scheduling",
  journal =      j-OPER-SYS-REV,
  volume =       "42",
  number =       "2",
  pages =        "222--232",
  month =        mar,
  year =         "2008",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/1353535.1346310",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Fri Jun 20 17:20:12 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The recent shift in the industry towards chip
                 multiprocessor (CMP) designs has brought the need for
                 multi-threaded applications to mainstream computing. As
                 observed in several limit studies, most of the
                 parallelization opportunities require looking for
                 parallelism beyond local regions of code. To exploit
                 these opportunities, especially for sequential
                 applications, researchers have recently proposed global
                 multi-threaded instruction scheduling techniques,
                 including DSWP and GREMIO. These techniques
                 simultaneously schedule instructions from large regions
                 of code, such as arbitrary loop nests or whole
                 procedures, and have been shown to be effective at
                 extracting threads for many applications. A key enabler
                 of these global instruction scheduling techniques is
                 the Multi-Threaded Code Generation (MTCG) algorithm
                 proposed in [16], which generates multi-threaded code
                 for any partition of the instructions into threads.
                 This algorithm inserts communication and
                 synchronization instructions in order to satisfy all
                 inter-thread dependences.\par

                 In this paper, we present a general compiler framework,
                 COCO, to optimize the communication and synchronization
                 instructions inserted by the MTCG algorithm. This
                 framework, based on thread-aware data-flow analyses and
                 graph min-cut algorithms, appropriately models and
                 optimizes all kinds of inter-thread dependences,
                 including register, memory, and control dependences.
                 Our experiments, using a fully automatic compiler
                 implementation of these techniques, demonstrate
                 significant reductions (about 30\% on average) in the
                 number of dynamic communication instructions in code
                 parallelized with DSWP and GREMIO. This reduction in
                 communication translates to performance gains of up to
                 40\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
  keywords =     "communication; data-flow analysis; graph min-cut;
                 instruction scheduling; multi-threading;
                 synchronization",
}

@Article{Ottoni:2008:COGc,
  author =       "Guilherme Ottoni and David I. August",
  title =        "Communication optimizations for global multi-threaded
                 instruction scheduling",
  journal =      j-SIGPLAN,
  volume =       "43",
  number =       "3",
  pages =        "222--232",
  month =        mar,
  year =         "2008",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1353535.1346310",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 11:03:40 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The recent shift in the industry towards chip
                 multiprocessor (CMP) designs has brought the need for
                 multi-threaded applications to mainstream computing. As
                 observed in several limit studies, most of the
                 parallelization opportunities require looking for
                 parallelism beyond local regions of code. To exploit
                 these opportunities, especially for sequential
                 applications, researchers have recently proposed global
                 multi-threaded instruction scheduling techniques,
                 including DSWP and GREMIO. These techniques
                 simultaneously schedule instructions from large regions
                 of code, such as arbitrary loop nests or whole
                 procedures, and have been shown to be effective at
                 extracting threads for many applications. A key enabler
                 of these global instruction scheduling techniques is
                 the Multi-Threaded Code Generation (MTCG) algorithm
                 proposed in [16], which generates multi-threaded code
                 for any partition of the instructions into threads.
                 This algorithm inserts communication and
                 synchronization instructions in order to satisfy all
                 inter-thread dependences.\par

                 In this paper, we present a general compiler framework,
                 COCO, to optimize the communication and synchronization
                 instructions inserted by the MTCG algorithm. This
                 framework, based on thread-aware data-flow analyses and
                 graph min-cut algorithms, appropriately models and
                 optimizes all kinds of inter-thread dependences,
                 including register, memory, and control dependences.
                 Our experiments, using a fully automatic compiler
                 implementation of these techniques, demonstrate
                 significant reductions (about 30\% on average) in the
                 number of dynamic communication instructions in code
                 parallelized with DSWP and GREMIO. This reduction in
                 communication translates to performance gains of up to
                 40\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "communication; data-flow analysis; graph min-cut;
                 instruction scheduling; multi-threading;
                 synchronization",
}

@Article{Rangan:2008:PSD,
  author =       "Ram Rangan and Neil Vachharajani and Guilherme Ottoni
                 and David I. August",
  title =        "Performance scalability of decoupled software
                 pipelining",
  journal =      j-TACO,
  volume =       "5",
  number =       "2",
  pages =        "8:1--8:??",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1400112.1400113",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Aug 28 13:25:00 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Any successful solution to using multicore processors
                 to scale general-purpose program performance will have
                 to contend with rising intercore communication costs
                 while exposing coarse-grained parallelism. Recently
                 proposed pipelined multithreading (PMT) techniques have
                 been demonstrated to have general-purpose applicability
                 and are also able to effectively tolerate inter-core
                 latencies through pipelined interthread communication.
                 These desirable properties make PMT techniques strong
                 candidates for program parallelization on current and
                 future multicore processors and understanding their
                 performance characteristics is critical to their
                 deployment. To that end, this paper evaluates the
                 performance scalability of a general-purpose PMT
                 technique called decoupled software pipelining (DSWP)
                 and presents a thorough analysis of the communication
                 bottlenecks that must be overcome for optimal DSWP
                 scalability.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "decoupled software pipelining; performance analysis",
}

@Article{Rounce:2008:DIS,
  author =       "Peter A. Rounce and Alberto F. De Souza",
  title =        "Dynamic Instruction Scheduling in a Trace-based
                 Multi-threaded Architecture",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "36",
  number =       "2",
  pages =        "184--205",
  month =        apr,
  year =         "2008",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-007-0062-1",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 9 16:07:03 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=2;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=2&spage=184",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  keywords =     "Dynamic instruction scheduling; Simultaneous
                 multi-threading; VLIW; Wide issue architectures",
}

@Article{Ruan:2008:DCS,
  author =       "Yaoping Ruan and Vivek S. Pai and Erich Nahum and John
                 M. Tracey",
  title =        "Do commodity {SMT} processors need more {OS}
                 research?",
  journal =      j-OPER-SYS-REV,
  volume =       "42",
  number =       "1",
  pages =        "21--25",
  month =        jan,
  year =         "2008",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/1341312.1341318",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Fri Jun 20 17:19:29 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/opersysrev.bib",
  abstract =     "The availability of Simultaneous Multithreading (SMT)
                 in commodity processors such as the Pentium 4 (P4) has
                 raised interest among OS researchers. While earlier
                 simulation studies of SMT suggested exciting
                 performance potential, observed improvement on the P4
                 has been much more restrained, raising the hope that OS
                 research can help bridge the gap. We argue that OS
                 research for current commodity Simultaneous
                 Multithreading (SMT) processors is unlikely to yield
                 significant benefits. In general, we find that SMT
                 processor simulations were optimistic about cache and
                 memory performance characteristics, while overlooking
                 the OS overheads of SMT kernels versus uniprocessor
                 kernels. Using measurement and analysis on actual
                 hardware, we find that little opportunity exists for
                 realistic performance gains on commodity SMT beyond
                 what is currently achieved.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
}

@Article{Schaffer:2008:UHM,
  author =       "Kevin Schaffer and Robert A. Walker",
  title =        "Using Hardware Multithreading to Overcome
                 Broadcast\slash Reduction Latency in an Associative
                 {SIMD} Processor",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "18",
  number =       "4",
  pages =        "491--509",
  month =        dec,
  year =         "2008",
  CODEN =        "PPLTEE",
  DOI =          "https://doi.org/10.1142/S0129626408003533",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Thu Sep 2 09:08:11 MDT 2010",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Sen:2008:RDR,
  author =       "Koushik Sen",
  title =        "Race directed random testing of concurrent programs",
  journal =      j-SIGPLAN,
  volume =       "43",
  number =       "6",
  pages =        "11--21",
  month =        jun,
  year =         "2008",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1379022.1375584",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 11:04:53 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Bugs in multi-threaded programs often arise due to
                 data races. Numerous static and dynamic program
                 analysis techniques have been proposed to detect data
                 races. We propose a novel randomized dynamic analysis
                 technique that utilizes potential data race information
                 obtained from an existing analysis tool to separate
                 real races from false races without any need for manual
                 inspection. Specifically, we use potential data race
                 information obtained from an existing dynamic analysis
                 technique to control a random scheduler of threads so
                 that real race conditions get created with very high
                 probability and those races get resolved randomly at
                 runtime. Our approach has several advantages over
                 existing dynamic analysis tools. First, we can create a
                 real race condition and resolve the race randomly to
                 see if an error can occur due to the race. Second, we
                 can replay a race revealing execution efficiently by
                 simply using the same seed for random number
                 generation--we do not need to record the execution.
                 Third, our approach has very low overhead compared to
                 other precise dynamic race detection techniques because
                 we only track all synchronization operations and a
                 single pair of memory access statements that are
                 reported to be in a potential race by an existing
                 analysis. We have implemented the technique in a
                 prototype tool for Java and have experimented on a
                 number of large multi-threaded Java programs. We report
                 a number of previously known and unknown bugs and real
                 races in these Java programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "concurrency; dynamic analysis; race detection; random
                 testing",
}

@Article{Sharkey:2008:RRP,
  author =       "Joseph J. Sharkey and Jason Loew and Dmitry V.
                 Ponomarev",
  title =        "Reducing register pressure in {SMT} processors through
                 {L2}-miss-driven early register release",
  journal =      j-TACO,
  volume =       "5",
  number =       "3",
  pages =        "13:1--13:??",
  month =        nov,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1455650.1455652",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 8 14:28:18 MST 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The register file is one of the most critical datapath
                 components limiting the number of threads that can be
                 supported on a simultaneous multithreading (SMT)
                 processor. To allow the use of smaller register files
                 without degrading performance, techniques that maximize
                 the efficiency of using registers through aggressive
                 register allocation/deallocation can be considered. In
                 this article, we propose a novel technique to early
                 deallocate physical registers allocated to threads
                 which experience L2 cache misses. This is accomplished
                 by speculatively committing the load-independent
                 instructions and deallocating the registers
                 corresponding to the previous mappings of their
                 destinations, without waiting for the cache miss
                 request to be serviced. The early deallocated registers
                 are then made immediately available for allocation to
                 instructions within the same thread as well as within
                 other threads, thus improving the overall processor
                 throughput. On the average across the simulated mixes
                 of multiprogrammed SPEC 2000 workloads, our technique
                 results in 33\% improvement in throughput and 25\%
                 improvement in terms of harmonic mean of weighted IPCs
                 over the baseline SMT with the state-of-the-art DCRA
                 policy. This is achieved without creating checkpoints,
                 maintaining per-register counters of pending consumers,
                 performing tag rebroadcasts, register remappings,
                 and/or additional associative searches.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "register file; Simultaneous multithreading",
}

@Article{Suleman:2008:FDTa,
  author =       "M. Aater Suleman and Moinuddin K. Qureshi and Yale N.
                 Patt",
  title =        "Feedback-driven threading: power-efficient and
                 high-performance execution of multi-threaded workloads
                 on {CMPs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "277--286",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1346281.1346317",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Extracting high-performance from the emerging Chip
                 Multiprocessors (CMPs) requires that the application be
                 divided into multiple threads. Each thread executes on
                 a separate core thereby increasing concurrency and
                 improving performance. As the number of cores on a CMP
                 continues to increase, the performance of some
                 multi-threaded applications will benefit from the
                 increased number of threads, whereas, the performance
                 of other multi-threaded applications will become
                 limited by data-synchronization and off-chip bandwidth.
                 For applications that get limited by
                 data-synchronization, increasing the number of threads
                 significantly degrades performance and increases
                 on-chip power. Similarly, for applications that get
                 limited by off-chip bandwidth, increasing the number of
                 threads increases on-chip power without providing any
                 performance improvement. Furthermore, whether an
                 application gets limited by data-synchronization, or
                 bandwidth, or neither depends not only on the
                 application but also on the input set and the machine
                 configuration. Therefore, controlling the number of
                 threads based on the run-time behavior of the
                 application can significantly improve performance and
                 reduce power.\par

                 This paper proposes Feedback-Driven Threading (FDT), a
                 framework to dynamically control the number of threads
                 using run-time information. FDT can be used to
                 implement Synchronization-Aware Threading (SAT), which
                 predicts the optimal number of threads depending on the
                 amount of data-synchronization. Our evaluation shows
                 that SAT can reduce both execution time and power by up
                 to 66\% and 78\% respectively. Similarly, FDT can be
                 used to implement Bandwidth-Aware Threading (BAT),
                 which predicts the minimum number of threads required
                 to saturate the off-chip bus. Our evaluation shows that
                 BAT reduces on-chip power by up to 78\%. When SAT and
                 BAT are combined, the average execution time reduces by
                 17\% and power reduces by 59\%. The proposed techniques
                 leverage existing performance counters and require
                 minimal support from the threading library.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  keywords =     "bandwidth; CMP; multi-threaded; synchronization",
}

@Article{Suleman:2008:FDTb,
  author =       "M. Aater Suleman and Moinuddin K. Qureshi and Yale N.
                 Patt",
  title =        "Feedback-driven threading: power-efficient and
                 high-performance execution of multi-threaded workloads
                 on {CMPs}",
  journal =      j-OPER-SYS-REV,
  volume =       "42",
  number =       "2",
  pages =        "277--286",
  month =        mar,
  year =         "2008",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/1346281.1346317",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Fri Jun 20 17:20:12 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Extracting high-performance from the emerging Chip
                 Multiprocessors (CMPs) requires that the application be
                 divided into multiple threads. Each thread executes on
                 a separate core thereby increasing concurrency and
                 improving performance. As the number of cores on a CMP
                 continues to increase, the performance of some
                 multi-threaded applications will benefit from the
                 increased number of threads, whereas, the performance
                 of other multi-threaded applications will become
                 limited by data-synchronization and off-chip bandwidth.
                 For applications that get limited by
                 data-synchronization, increasing the number of threads
                 significantly degrades performance and increases
                 on-chip power. Similarly, for applications that get
                 limited by off-chip bandwidth, increasing the number of
                 threads increases on-chip power without providing any
                 performance improvement. Furthermore, whether an
                 application gets limited by data-synchronization, or
                 bandwidth, or neither depends not only on the
                 application but also on the input set and the machine
                 configuration. Therefore, controlling the number of
                 threads based on the run-time behavior of the
                 application can significantly improve performance and
                 reduce power.\par

                 This paper proposes Feedback-Driven Threading (FDT), a
                 framework to dynamically control the number of threads
                 using run-time information. FDT can be used to
                 implement Synchronization-Aware Threading (SAT), which
                 predicts the optimal number of threads depending on the
                 amount of data-synchronization. Our evaluation shows
                 that SAT can reduce both execution time and power by up
                 to 66\% and 78\% respectively. Similarly, FDT can be
                 used to implement Bandwidth-Aware Threading (BAT),
                 which predicts the minimum number of threads required
                 to saturate the off-chip bus. Our evaluation shows that
                 BAT reduces on-chip power by up to 78\%. When SAT and
                 BAT are combined, the average execution time reduces by
                 17\% and power reduces by 59\%. The proposed techniques
                 leverage existing performance counters and require
                 minimal support from the threading library.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
  keywords =     "bandwidth; CMP; multi-threaded; synchronization",
}

@Article{Suleman:2008:FDTc,
  author =       "M. Aater Suleman and Moinuddin K. Qureshi and Yale N.
                 Patt",
  title =        "Feedback-driven threading: power-efficient and
                 high-performance execution of multi-threaded workloads
                 on {CMPs}",
  journal =      j-SIGPLAN,
  volume =       "43",
  number =       "3",
  pages =        "277--286",
  month =        mar,
  year =         "2008",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1346281.1346317",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 11:03:40 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Extracting high-performance from the emerging Chip
                 Multiprocessors (CMPs) requires that the application be
                 divided into multiple threads. Each thread executes on
                 a separate core thereby increasing concurrency and
                 improving performance. As the number of cores on a CMP
                 continues to increase, the performance of some
                 multi-threaded applications will benefit from the
                 increased number of threads, whereas, the performance
                 of other multi-threaded applications will become
                 limited by data-synchronization and off-chip bandwidth.
                 For applications that get limited by
                 data-synchronization, increasing the number of threads
                 significantly degrades performance and increases
                 on-chip power. Similarly, for applications that get
                 limited by off-chip bandwidth, increasing the number of
                 threads increases on-chip power without providing any
                 performance improvement. Furthermore, whether an
                 application gets limited by data-synchronization, or
                 bandwidth, or neither depends not only on the
                 application but also on the input set and the machine
                 configuration. Therefore, controlling the number of
                 threads based on the run-time behavior of the
                 application can significantly improve performance and
                 reduce power.\par

                 This paper proposes Feedback-Driven Threading (FDT), a
                 framework to dynamically control the number of threads
                 using run-time information. FDT can be used to
                 implement Synchronization-Aware Threading (SAT), which
                 predicts the optimal number of threads depending on the
                 amount of data-synchronization. Our evaluation shows
                 that SAT can reduce both execution time and power by up
                 to 66\% and 78\% respectively. Similarly, FDT can be
                 used to implement Bandwidth-Aware Threading (BAT),
                 which predicts the minimum number of threads required
                 to saturate the off-chip bus. Our evaluation shows that
                 BAT reduces on-chip power by up to 78\%. When SAT and
                 BAT are combined, the average execution time reduces by
                 17\% and power reduces by 59\%. The proposed techniques
                 leverage existing performance counters and require
                 minimal support from the threading library.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "bandwidth; CMP; multi-threaded; synchronization",
}

@Article{Thoziyoor:2008:CMM,
  author =       "Shyamkumar Thoziyoor and Jung Ho Ahn and Matteo
                 Monchiero and Jay B. Brockman and Norman P. Jouppi",
  title =        "A Comprehensive Memory Modeling Tool and Its
                 Application to the Design and Analysis of Future Memory
                 Hierarchies",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "51--62",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1109/ISCA.2008.16",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In this paper we introduce CACTI-D, a significant
                 enhancement of CACTI 5.0. CACTI-D adds support for
                 modeling of commodity DRAM technology and support for
                 main memory DRAM chip organization. CACTI-D enables
                 modeling of the complete memory hierarchy with
                 consistent models all the way from SRAM based L1 caches
                 through main memory DRAMs on DIMMs. We illustrate the
                 potential applicability of CACTI-D in the design and
                 analysis of future memory hierarchies by carrying out a
                 last level cache study for a multicore multithreaded
                 architecture at the 32nm technology node. In this study
                 we use CACTI-D to model all components of the memory
                 hierarchy including L1, L2, last level SRAM, logic
                 process based DRAM or commodity DRAM L3 caches, and
                 main memory DRAM chips. We carry out architectural
                 simulation using benchmarks with large data sets and
                 present results of their execution time, breakdown of
                 power in the memory hierarchy, and system energy-delay
                 product for the different system configurations. We
                 find that commodity DRAM technology is most attractive
                 for stacked last level caches, with significantly lower
                 energy-delay products.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  keywords =     "cache; CACTI; commodity DRAM; LLC; logic-process based
                 DRAM; SRAM",
}

@Article{Vantrease:2008:CSI,
  author =       "Dana Vantrease and Robert Schreiber and Matteo
                 Monchiero and Moray McLaren and Norman P. Jouppi and
                 Marco Fiorentino and Al Davis and Nathan Binkert and
                 Raymond G. Beausoleil and Jung Ho Ahn",
  title =        "{Corona}: System Implications of Emerging Nanophotonic
                 Technology",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "153--164",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382135",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We expect that many-core microprocessors will push
                 performance per chip from the 10 gigaflop to the 10
                 teraflop range in the coming decade. To support this
                 increased performance, memory and inter-core bandwidths
                 will also have to scale by orders of magnitude. Pin
                 limitations, the energy cost of electrical signaling,
                 and the non-scalability of chip-length global wires are
                 significant bandwidth impediments. Recent developments
                 in silicon nanophotonic technology have the potential
                 to meet these off- and on-stack bandwidth requirements
                 at acceptable power levels. Corona is a 3D many-core
                 architecture that uses nanophotonic communication for
                 both inter-core communication and off-stack
                 communication to memory or I/O devices. Its peak
                 floating-point performance is 10 teraflops. Dense
                 wavelength division multiplexed optically connected
                 memory modules provide 10 terabyte per second memory
                 bandwidth. A photonic crossbar fully interconnects its
                 256 low-power multithreaded cores at 20 terabyte per
                 second bandwidth. We have simulated a 1024 thread
                 Corona system running synthetic benchmarks and scaled
                 versions of the SPLASH-2 benchmark suite. We believe
                 that in comparison with an electrically-connected
                 many-core alternative that uses the same on-stack
                 interconnect power, Corona can provide 2 to 6 times
                 more performance on many memory intensive workloads,
                 while simultaneously reducing power.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  keywords =     "3D stacking; many-core CMP; nanophotonics; on-chip
                 networks",
}

@TechReport{Volkov:2008:LQC,
  author =       "Vasily Volkov and James W. Demmel",
  title =        "{$ L U $}, {$ Q R $} and {Cholesky} Factorizations
                 using Vector Capabilities of {GPUs}",
  type =         "LAPACK Working Note",
  number =       "202",
  institution =  inst-UCB-EECS,
  address =      inst-UCB-EECS:adr,
  month =        may,
  year =         "2008",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn202.pdf",
  abstract =     "We present performance results for dense linear
                 algebra using the 8-series NVIDIA GPUs. Our
                 matrix-matrix multiply routine (GEMM) runs 60\% faster
                 than the vendor implementation in CUBLAS 1.1 and
                 approaches the peak of hardware capabilities. Our LU,
                 QR and Cholesky factorizations achieve up to 80--90\%
                 of the peak GEMM rate. Our parallel LU running on two
                 GPUs achieves up to $ \approx $300 Gflop/s. These
                 results are accomplished by challenging the accepted
                 view of the GPU architecture and programming
                 guidelines. We argue that modern GPUs should be viewed
                 as multithreaded multicore vector units. We exploit
                 blocking similarly to vector computers and
                 heterogeneity of the system by computing both on GPU
                 and CPU. This study includes detailed benchmarking of
                 the GPU memory system that reveals sizes and latencies
                 of caches and TLB. We present a couple of algorithmic
                 optimizations aimed at increasing parallelism and
                 regularity in the problem that provide us with slightly
                 higher performance.",
  acknowledgement = ack-nhfb,
  ucbnumber =    "UCB/EECS-2008-49",
}

@Article{Wang:2008:PIM,
  author =       "Kun Wang and Yu Zhang and Huayong Wang and Xiaowei
                 Shen",
  title =        "Parallelization of {IBM Mambo} system simulator in
                 functional modes",
  journal =      j-OPER-SYS-REV,
  volume =       "42",
  number =       "1",
  pages =        "71--76",
  month =        jan,
  year =         "2008",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/1341312.1341325",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Fri Jun 20 17:19:29 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Mambo [4] is IBM's full-system simulator which models
                 PowerPC systems, and provides a complete set of
                 simulation tools to help IBM and its partners in
                 pre-hardware development and performance evaluation for
                 future systems. Currently Mambo simulates target
                 systems on a single host thread. When the number of
                 cores increases in a target system, Mambo's simulation
                 performance for each core goes down. As the so-called
                 `multi-core era' approaches, both target and host
                 systems will have more and more cores. It is very
                 important for Mambo to efficiently simulate a
                 multi-core target system on a multi-core host system.
                 Parallelization is a natural method to speed up Mambo
                 under this situation.\par

                 Parallel Mambo (P-Mambo) is a multi-threaded
                 implementation of Mambo. Mambo's simulation engine is
                 implemented as a user-level thread-scheduler. We
                 propose a multi-scheduler method to adapt Mambo's
                 simulation engine to multi-threaded execution. Based on
                 this method a core-based module partition is proposed
                 to achieve both high inter-scheduler parallelism and
                 low inter-scheduler dependency. Protection of shared
                 resources is crucial to both correctness and
                 performance of P-Mambo. Since there are two tiers of
                 threads in P-Mambo, protecting shared resources by only
                 OS-level locks possibly introduces deadlocks due to
                 user-level context switch. We propose a new lock
                 mechanism to handle this problem. Since Mambo is an
                 on-going project with many modules currently under
                 development, co-existence with new modules is also
                 important to P-Mambo. We propose a global-lock-based
                 method to guarantee compatibility of P-Mambo with
                 future Mambo modules.\par

                 We have implemented the first version of P-Mambo in
                 functional modes. The performance of P-Mambo has been
                 evaluated on the OpenMP implementation of NAS Parallel
                 Benchmark (NPB) 3.2 [12]. Preliminary experimental
                 results show that P-Mambo achieves an average speedup
                 of 3.4 on a 4-core host machine.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
  keywords =     "architectural simulation; dynamic binary translation;
                 parallel simulation",
}

@Article{Warg:2008:DTS,
  author =       "Fredrik Warg and Per Stenstr{\"o}m",
  title =        "Dual-thread Speculation: {A} Simple Approach to Uncover
                 Thread-level Parallelism on a Simultaneous
                 Multithreaded Processor",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "36",
  number =       "2",
  pages =        "166--183",
  month =        apr,
  year =         "2008",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-007-0064-z",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Jul 9 16:07:03 MDT 2008",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=2;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=2&spage=166",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
  keywords =     "Chip multiprocessors; Computer architecture;
                 Simultaneous multithreading; Thread-level parallelism;
                 Thread-level speculation",
}

@Book{Weaver:2008:OIO,
  editor =       "David L. Weaver",
  title =        "{OpenSPARC} Internals: {OpenSPARC T1\slash T2} Chip
                 Multithreaded Throughput Computing",
  publisher =    "Lulu, Inc.",
  address =      "860 Aviation Parkway, Suite 300, Morrisville, NC
                 27560, USA",
  pages =        "xviii + 369",
  year =         "2008",
  ISBN =         "0-557-01974-5",
  ISBN-13 =      "978-0-557-01974-8",
  LCCN =         "????",
  bibdate =      "Tue Nov 11 14:49:47 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/master.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  price =        "US\$20.00",
  URL =          "http://www.opensparc.net/publications/books/opensparc-internals.html",
  acknowledgement = ack-nhfb,
  libnote =      "Not yet in my library.",
}

@Article{Wegiel:2008:MCVa,
  author =       "Michal Wegiel and Chandra Krintz",
  title =        "The {Mapping Collector}: virtual memory support for
                 generational, parallel, and concurrent compaction",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "91--102",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1353535.1346294",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Parallel and concurrent garbage collectors are
                 increasingly employed by managed runtime environments
                 (MREs) to maintain scalability, as multi-core
                 architectures and multi-threaded applications become
                 pervasive. Moreover, state-of-the-art MREs commonly
                 implement compaction to eliminate heap fragmentation
                 and enable fast linear object allocation.\par

                 Our empirical analysis of object demographics reveals
                 that unreachable objects in the heap tend to form
                 clusters large enough to be effectively managed at the
                 granularity of virtual memory pages. Even though
                 processes can manipulate the mapping of the virtual
                 address space through the standard operating system
                 (OS) interface on most platforms, extant
                 parallel/concurrent compactors do not do so to exploit
                 this clustering behavior and instead achieve compaction
                 by performing, relatively expensive, object moving and
                 pointer adjustment.\par

                 We introduce the Mapping Collector (MC), which
                 leverages virtual memory operations to reclaim and
                 consolidate free space without moving objects and
                 updating pointers. MC is a nearly-single-phase
                 compactor that is simpler and more efficient than
                 previously reported compactors that comprise two to
                 four phases. Through effective MRE-OS coordination, MC
                 maintains the simplicity of a non-moving collector
                 while providing efficient parallel and concurrent
                 compaction.\par

                 We implement both stop-the-world and concurrent MC in a
                 generational garbage collection framework within the
                 open-source HotSpot Java Virtual Machine. Our
                 experimental evaluation using a multiprocessor
                 indicates that MC significantly increases throughput
                 and scalability as well as reduces pause times,
                 relative to state-of-the-art, parallel and concurrent
                 compactors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  keywords =     "compaction; concurrent; parallel; virtual memory",
}

@Article{Wegiel:2008:MCVb,
  author =       "Michal Wegiel and Chandra Krintz",
  title =        "The {Mapping Collector}: virtual memory support for
                 generational, parallel, and concurrent compaction",
  journal =      j-OPER-SYS-REV,
  volume =       "42",
  number =       "2",
  pages =        "91--102",
  month =        mar,
  year =         "2008",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/1353535.1346294",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Fri Jun 20 17:20:12 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Parallel and concurrent garbage collectors are
                 increasingly employed by managed runtime environments
                 (MREs) to maintain scalability, as multi-core
                 architectures and multi-threaded applications become
                 pervasive. Moreover, state-of-the-art MREs commonly
                 implement compaction to eliminate heap fragmentation
                 and enable fast linear object allocation.\par

                 Our empirical analysis of object demographics reveals
                 that unreachable objects in the heap tend to form
                 clusters large enough to be effectively managed at the
                 granularity of virtual memory pages. Even though
                 processes can manipulate the mapping of the virtual
                 address space through the standard operating system
                 (OS) interface on most platforms, extant
                 parallel/concurrent compactors do not do so to exploit
                 this clustering behavior and instead achieve compaction
                 by performing, relatively expensive, object moving and
                 pointer adjustment.\par

                 We introduce the Mapping Collector (MC), which
                 leverages virtual memory operations to reclaim and
                 consolidate free space without moving objects and
                 updating pointers. MC is a nearly-single-phase
                 compactor that is simpler and more efficient than
                 previously reported compactors that comprise two to
                 four phases. Through effective MRE-OS coordination, MC
                 maintains the simplicity of a non-moving collector
                 while providing efficient parallel and concurrent
                 compaction.\par

                 We implement both stop-the-world and concurrent MC in a
                 generational garbage collection framework within the
                 open-source HotSpot Java Virtual Machine. Our
                 experimental evaluation using a multiprocessor
                 indicates that MC significantly increases throughput
                 and scalability as well as reduces pause times,
                 relative to state-of-the-art, parallel and concurrent
                 compactors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
  keywords =     "compaction; concurrent; parallel; virtual memory",
}

@Article{Wegiel:2008:MCVc,
  author =       "Michal Wegiel and Chandra Krintz",
  title =        "The {Mapping Collector}: virtual memory support for
                 generational, parallel, and concurrent compaction",
  journal =      j-SIGPLAN,
  volume =       "43",
  number =       "3",
  pages =        "91--102",
  month =        mar,
  year =         "2008",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1353535.1346294",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 11:03:40 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Parallel and concurrent garbage collectors are
                 increasingly employed by managed runtime environments
                 (MREs) to maintain scalability, as multi-core
                 architectures and multi-threaded applications become
                 pervasive. Moreover, state-of-the-art MREs commonly
                 implement compaction to eliminate heap fragmentation
                 and enable fast linear object allocation.\par

                 Our empirical analysis of object demographics reveals
                 that unreachable objects in the heap tend to form
                 clusters large enough to be effectively managed at the
                 granularity of virtual memory pages. Even though
                 processes can manipulate the mapping of the virtual
                 address space through the standard operating system
                 (OS) interface on most platforms, extant
                 parallel/concurrent compactors do not do so to exploit
                 this clustering behavior and instead achieve compaction
                 by performing, relatively expensive, object moving and
                 pointer adjustment.\par

                 We introduce the Mapping Collector (MC), which
                 leverages virtual memory operations to reclaim and
                 consolidate free space without moving objects and
                 updating pointers. MC is a nearly-single-phase
                 compactor that is simpler and more efficient than
                 previously reported compactors that comprise two to
                 four phases. Through effective MRE-OS coordination, MC
                 maintains the simplicity of a non-moving collector
                 while providing efficient parallel and concurrent
                 compaction.\par

                 We implement both stop-the-world and concurrent MC in a
                 generational garbage collection framework within the
                 open-source HotSpot Java Virtual Machine. Our
                 experimental evaluation using a multiprocessor
                 indicates that MC significantly increases throughput
                 and scalability as well as reduces pause times,
                 relative to state-of-the-art, parallel and concurrent
                 compactors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "compaction; concurrent; parallel; virtual memory",
}

@Article{Winter:2008:ATN,
  author =       "Jonathan A. Winter and David H. Albonesi",
  title =        "Addressing thermal nonuniformity in {SMT} workloads",
  journal =      j-TACO,
  volume =       "5",
  number =       "1",
  pages =        "4:1--4:??",
  month =        may,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1369396.1369400",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 16 11:41:51 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We explore DTM techniques within the context of
                 uniform and nonuniform SMT workloads. While DVS is
                 suitable for addressing workloads with uniformly high
                 temperatures, for nonuniform workloads, performance
                 loss occurs because of the slowdown of the cooler
                 thread. To address this, we propose and evaluate DTM
                 mechanisms that exploit the steering-based thread
                 management mechanisms inherent in a clustered SMT
                 architecture. We show that in contrast to DVS, which
                 operates globally, our techniques are more effective at
                 controlling temperature for nonuniform workloads.
                 Furthermore, we devise a DTM technique that combines
                 steering and DVS to achieve consistently good
                 performance across all workloads.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "adaptive microarchitectures; clustered
                 microarchitectures; dynamic thermal management; dynamic
                 voltage scaling; simultaneous multithreading",
}

@Article{Wong:2008:TAF,
  author =       "Chee Siang Wong and Ian Tan and Rosalind Deena Kumari
                 and Fun Wey",
  title =        "Towards achieving fairness in the {Linux} scheduler",
  journal =      j-OPER-SYS-REV,
  volume =       "42",
  number =       "5",
  pages =        "34--43",
  month =        jul,
  year =         "2008",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/1400097.1400102",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Wed Aug 6 16:54:12 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The Operating System scheduler is designed to allocate
                 the CPU resources appropriately to all processes. The
                 Linux Completely Fair Scheduler (CFS) design ensures
                 fairness among tasks using the thread fair scheduling
                 algorithm. This algorithm ensures allocation of
                 resources based on the number of threads in the system
                 and not within executing programs. This can lead to
                 fairness issue in a multi-threaded environment as the
                 Linux scheduler tends to favor programs with higher
                 number of threads. We illustrate the issue of fairness
                 through experimental evaluation thus exposing the
                 weakness of the current allocation scheme where
                 software developers could take advantage by spawning
                 many additional threads in order to obtain more CPU
                 resources. A novel algorithm is proposed as a solution
                 towards achieving better fairness in the Linux
                 scheduler. The algorithm is based on weight
                 readjustment of the threads created in the same process
                 to significantly reduce the unfair allocation of CPU
                 resources in multi-threaded environments. The algorithm
                 was implemented and evaluated. It demonstrated
                 promising results towards solving the raised fairness
                 issue. We conclude this paper highlighting the
                 limitations of the proposed approach and the future
                 work in the stated direction.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
  keywords =     "completely fair scheduler; fairness; Linux; process
                 scheduling",
}

@Article{Xian:2008:CAS,
  author =       "Feng Xian and Witawas Srisa-an and Hong Jiang",
  title =        "Contention-aware scheduler: unlocking execution
                 parallelism in multithreaded {Java} programs",
  journal =      j-SIGPLAN,
  volume =       "43",
  number =       "10",
  pages =        "163--180",
  month =        sep,
  year =         "2008",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1449955.1449778",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Oct 22 09:57:37 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In multithreaded programming, locks are frequently
                 used as a mechanism for synchronization. Because
                 today's operating systems do not consider lock usage as
                 a scheduling criterion, scheduling decisions can be
                 unfavorable to multithreaded applications, leading to
                 performance issues such as convoying and heavy lock
                 contention in systems with multiple processors.
                 Previous efforts to address these issues (e.g.,
                 transactional memory, lock-free data structure) often
                 treat scheduling decisions as 'a fact of life,' and
                 therefore these solutions try to cope with the
                 consequences of undesirable scheduling instead of
                 dealing with the problem directly.\par

                 In this paper, we introduce {\em Contention-Aware
                 Scheduler (CA-Scheduler)}, which is designed to support
                 efficient execution of large multithreaded Java
                 applications in multiprocessor systems. Our proposed
                 scheduler employs a scheduling policy that reduces lock
                 contention. As will be shown in this paper, our
                 prototype implementation of the CA-Scheduler in Linux
                 and Sun HotSpot virtual machine only incurs 3.5\%
                 runtime overhead, while the overall performance
                 differences, when compared with a system with no
                 contention awareness, range from a degradation of 3\%
                 in a small multithreaded benchmark to an improvement of
                 15\% in a large Java application server benchmark.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "Java; operating systems; scheduling",
}

@Article{Ahn:2009:MDE,
  author =       "Jung Ho Ahn and Jacob Leverich and Robert S. Schreiber
                 and Norman P. Jouppi",
  title =        "Multicore {DIMM}: an Energy Efficient Memory Module
                 with Independently Controlled {DRAMs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Demand for memory capacity and bandwidth keeps
                 increasing rapidly in modern computer systems, and
                 memory power consumption is becoming a considerable
                 portion of the system power budget. However, the
                 current DDR DIMM standard is not well suited to
                 effectively serve CMP memory requests from both a power
                 and performance perspective. We propose a new memory
                 module called a Multicore DIMM, where DRAM chips are
                 grouped into multiple virtual memory devices, each of
                 which has its own data path and receives separate
                 commands (address and control signals). The Multicore
                 DIMM is designed to improve the energy efficiency of
                 memory systems with small impact on system performance.
                 Dividing each memory modules into 4 virtual memory
                 devices brings a simultaneous 22\%, 7.6\%, and 18\%
                 improvement in memory power, IPC, and system
                 energy-delay product respectively on a set of
                 multithreaded applications and consolidated
                 workloads.",
  acknowledgement = ack-nhfb,
  affiliation =  "Ahn, JH (Reprint Author), Hewlett Packard Labs,
                 Mississauga, ON, Canada. Ahn, Jung Ho; Schreiber,
                 Robert S.; Jouppi, Norman P., Hewlett Packard Labs,
                 Mississauga, ON, Canada. Leverich, Jacob, Stanford
                 Univ, Stanford, CA 94305 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "DRAM; memory module; memory system; Multicore",
  number-of-cited-references = "16",
  ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394",
  research-areas = "Computer Science",
  researcherid-numbers = "Ahn, Jung Ho/D-1298-2013",
  times-cited =  "26",
  unique-id =    "Ahn:2009:MDE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Aleen:2009:CAS,
  author =       "Farhana Aleen and Nathan Clark",
  title =        "Commutativity analysis for software parallelization:
                 letting program transformations see the big picture",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "3",
  pages =        "241--252",
  month =        mar,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1508284.1508273",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:39:26 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Extracting performance from many-core architectures
                 requires software engineers to create multi-threaded
                 applications, which significantly complicates the
                 already daunting task of software development. One
                 solution to this problem is automatic compile-time
                 parallelization, which can ease the burden on software
                 developers in many situations. Clearly, automatic
                 parallelization in its present form is not suitable for
                 many application domains and new compiler analyses are
                 needed to address its shortcomings.\par

                 In this paper, we present one such analysis: a new
                 approach for detecting commutative functions.
                 Commutative functions are sections of code that can be
                 executed in any order without affecting the outcome of
                 the application, e.g., inserting elements into a set.
                 Previous research on this topic had one significant
                 limitation, in that the results of a commutative
                 functions must produce identical memory layouts. This
                 prevented previous techniques from detecting functions
                 like malloc, which may return different pointers
                 depending on the order in which it is called, but these
                 differing results do not affect the overall output of
                 the application. Our new commutativity analysis
                 correctly identify these situations to better
                 facilitate automatic parallelization. We demonstrate
                 that this analysis can automatically extract
                 significant amounts of parallelism from many
                 applications, and where it is ineffective it can
                 provide software developers a useful list of functions
                 that may be commutative provided semantic program
                 changes that are not automatable.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "automatic software parallelization; commutative
                 functions; random interpretation",
}

@Article{Amamiya:2009:CBN,
  author =       "Satoshi Amamiya and Makoto Amamiya and Ryuzo Hasegawa
                 and Hiroshi Fujita",
  title =        "A continuation-based noninterruptible multithreading
                 processor architecture",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "47",
  number =       "2",
  pages =        "228--252",
  month =        feb,
  year =         "2009",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Aug 25 08:38:29 MDT 2010",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=47&issue=2;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=47&issue=2&spage=228",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Anderson:2009:LAC,
  author =       "Zachary R. Anderson and David Gay and Mayur Naik",
  title =        "Lightweight annotations for controlling sharing in
                 concurrent data structures",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "6",
  pages =        "98--109",
  month =        jun,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1542476.1542488",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:41:16 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "SharC is a recently developed system for checking
                 data-sharing in multithreaded programs. Programmers
                 specify sharing rules (read-only, protected by a lock,
                 etc.) for individual objects, and the SharC compiler
                 enforces these rules using static and dynamic checks.
                 Violations of these rules indicate unintended data
                 sharing, which is the underlying cause of harmful
                 data-races. Additionally, SharC allows programmers to
                 change the sharing rules for a specific object using a
                 {\em sharing cast}, to capture the fact that sharing
                 rules for an object often change during the object's
                 lifetime. SharC was successfully applied to a number of
                 multi-threaded C programs.\par

                 However, many programs are not readily checkable using
                 SharC because their sharing rules, and changes to
                 sharing rules, effectively apply to whole data
                 structures rather than to individual objects. We have
                 developed a system called {\em Shoal\/} to address this
                 shortcoming. In addition to the sharing rules and
                 sharing cast of SharC, our system includes a new
                 concept that we call {\em groups}. A group is a
                 collection of objects all having the same sharing mode.
                 Each group has a distinguished member called the {\em
                 group leader}. When the sharing mode of the group
                 leader changes by way of a sharing cast, the sharing
                 mode of all members of the group also changes. This
                 operation is made sound by maintaining the invariant
                 that at the point of a sharing cast, the only external
                 pointer into the group is the pointer to the group
                 leader. The addition of groups allows checking safe
                 concurrency at the level of data structures rather than
                 at the level of individual objects.\par

                 We demonstrate the necessity and practicality of groups
                 by applying Shoal to a wide range of concurrent C
                 programs (the largest approaching a million lines of
                 code). In all benchmarks groups entail low annotation
                 burden and no significant additional performance
                 overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "concurrent programming; data races; multithreaded
                 programming",
}

@Article{Antonopoulos:2009:ASH,
  author =       "Christos D. Antonopoulos and Filip Blagojevic and
                 Andrey N. Chernikov and Nikos P. Chrisochoides and
                 Dimitrios S. Nikolopoulos",
  title =        "Algorithm, software, and hardware optimizations for
                 {Delaunay} mesh generation on simultaneous
                 multithreaded architectures",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "69",
  number =       "7",
  pages =        "601--612",
  month =        jul,
  year =         "2009",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Wed Sep 1 16:27:25 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Azizi:2009:AEC,
  author =       "Omid Azizi and Aqeel Mahesri and Sanjay J. Patel and
                 Mark Horowitz",
  title =        "Area-efficiency in {CMP} core design: co-optimization
                 of microarchitecture and physical design",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "2",
  pages =        "56--65",
  month =        may,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1577129.1577138",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:39 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In this paper, we examine the area-performance design
                 space of a processing core for a chip multiprocessor
                 (CMP), considering both the architectural design space
                 and the tradeoffs of the physical design on which the
                 architecture relies. We first propose a methodology for
                 performing an integrated optimization of both the
                 micro-architecture and the physical circuit design of a
                 microprocessor. In our approach, we use statistical and
                 convex fitting methods to capture a large
                 micro-architectural design space. We then characterize
                 the area-delay tradeoffs of the underlying circuits
                 through RTL synthesis. Finally, we establish the
                 relationship between the architecture and the circuits
                 in an integrative model, which we use to optimize the
                 processor. As a case study, we apply this methodology
                 to explore the performance-area tradeoffs in a highly
                 parallel accelerator architecture for visual computing
                 applications. Based on some early circuit tradeoff
                 data, our results indicate that two separate designs
                 are performance/area optimal for our set of benchmarks:
                 a simpler single-issue, 2-way multithreaded core
                 running at high-frequency, and a more aggressively
                 tuned dual-issue 4-way multithreaded design running at
                 a lower frequency.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Barkstrom:2009:UAS,
  author =       "Bruce R. Barkstrom",
  title =        "On using {Ada} to solve problems in computational
                 economics and related disciplines with concurrent,
                 multiagent algorithms",
  journal =      j-SIGADA-LETTERS,
  volume =       "29",
  number =       "3",
  pages =        "61--72",
  month =        dec,
  year =         "2009",
  CODEN =        "AALEE5",
  DOI =          "https://doi.org/10.1145/1647420.1647437",
  ISSN =         "1094-3641 (print), 1557-9476 (electronic)",
  ISSN-L =       "1094-3641",
  bibdate =      "Mon Jun 21 14:04:37 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multiagent algorithms are widely used in computational
                 economics and other social sciences to solve
                 theoretical and practical problems. Because such
                 algorithms are inherently concurrent and multithreaded,
                 Ada's constructs for handling communications between
                 concurrent processes and avoiding interference between
                 them make the language very well suited to solving
                 these problems, particularly given developments in
                 multi-core CPU chip-making. This paper provides a
                 concrete example of how Ada assists in solving problems
                 in computational economics and related disciplines that
                 work with multiagent systems. Solving a simple problem
                 illustrates visualizing the agents as Ada tasks, using
                 UML use cases and synchronization diagrams to design
                 the communications patterns between agents, and
                 applying protected objects and functions to avoid
                 computational indeterminacy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGAda Ada Letters",
  keywords =     "computational and mathematical organization theory;
                 computational economics; concurrent programming;
                 multiagent systems; multithreaded programming",
}

@Article{Barnes:2009:XBA,
  author =       "Christopher Barnes and Pranav Vaidya and Jaehwan John
                 Lee",
  title =        "An {XML}-Based {ADL} Framework for Automatic
                 Generation of Multithreaded Computer Architecture
                 Simulators",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2009",
  DOI =          "https://doi.org/10.1109/L-CA.2009.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Computer architecture simulation has always played a
                 pivotal role in continuous innovation of computers.
                 However, constructing or modifying a high quality
                 simulator is time consuming and error-prone. Thus,
                 often Architecture Description Languages (ADLs) are
                 used to provide an abstraction layer for describing the
                 computer architecture and automatically generating
                 corresponding simulators. Along the line of such
                 research, we present a novel XML-based ADL, its
                 compiler, and a generation methodology to automatically
                 generate multithreaded simulators for computer
                 architecture. We utilize the industry-standard
                 extensible markup language XML to describe the
                 functionality and architecture of a modeled processor.
                 Our ADL framework allows users to easily and quickly
                 modify the structure, register set, and execution of a
                 modeled processor. To prove its validity, we have
                 generated several multithreaded simulators with
                 different configurations based on the MIPS five-stage
                 processor, and successfully tested with two programs.",
  acknowledgement = ack-nhfb,
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "IUPUI RSFG",
  funding-text = "This research was funded by the IUPUI RSFG grant.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "C.0.d Modeling of computer architecture; C.1.1.b
                 Pipeline processors",
  number-of-cited-references = "14",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Barnes:2009:XBA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Berger:2009:GSM,
  author =       "Emery D. Berger and Ting Yang and Tongping Liu and
                 Gene Novark",
  title =        "{Grace}: safe multithreaded programming for {C\slash
                 C++}",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "10",
  pages =        "81--96",
  month =        oct,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1640089.1640096",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Jun 21 18:01:56 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The shift from single to multiple core architectures
                 means that programmers must write concurrent,
                 multithreaded programs in order to increase application
                 performance. Unfortunately, multithreaded applications
                 are susceptible to numerous errors, including
                 deadlocks, race conditions, atomicity violations, and
                 order violations. These errors are notoriously
                 difficult for programmers to debug.\par

                 This paper presents Grace, a software-only runtime
                 system that eliminates concurrency errors for a class
                 of multithreaded programs: those based on fork-join
                 parallelism. By turning threads into processes,
                 leveraging virtual memory protection, and imposing a
                 sequential commit protocol, Grace provides programmers
                 with the appearance of deterministic, sequential
                 execution, while taking advantage of available
                 processing cores to run code concurrently and
                 efficiently. Experimental results demonstrate Grace's
                 effectiveness: with modest code changes across a suite
                 of computationally-intensive benchmarks (1-16 lines),
                 Grace can achieve high scalability and performance
                 while preventing concurrency errors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "concurrency; determinism; deterministic concurrency;
                 fork-join; sequential semantics",
}

@Article{Bocchino:2009:TES,
  author =       "Robert L. {Bocchino, Jr.} and Vikram S. Adve and Danny
                 Dig and Sarita V. Adve and Stephen Heumann and Rakesh
                 Komuravelli and Jeffrey Overbey and Patrick Simmons and
                 Hyojin Sung and Mohsen Vakilian",
  title =        "A type and effect system for deterministic parallel
                 {Java}",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "10",
  pages =        "97--116",
  month =        oct,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1639949.1640097",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Jun 21 18:01:56 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Today's shared-memory parallel programming models are
                 complex and error-prone. While many parallel programs
                 are intended to be deterministic, unanticipated thread
                 interleavings can lead to subtle bugs and
                 nondeterministic semantics. In this paper, we
                 demonstrate that a practical {\em type and effect
                 system\/} can simplify parallel programming by {\em
                 guaranteeing deterministic semantics\/} with modular,
                 compile-time type checking even in a rich, concurrent
                 object-oriented language such as Java. We describe an
                 object-oriented type and effect system that provides
                 several new capabilities over previous systems for
                 expressing deterministic parallel algorithms. We also
                 describe a language called Deterministic Parallel Java
                 (DPJ) that incorporates the new type system features,
                 and we show that a core subset of DPJ is sound. We
                 describe an experimental validation showing that DPJ
                 can express a wide range of realistic parallel
                 programs; that the new type system features are useful
                 for such programs; and that the parallel programs
                 exhibit good performance gains (coming close to or
                 beating equivalent, nondeterministic multithreaded
                 programs where those are available).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "commutativity; determinism; deterministic parallelism;
                 effect systems; effects",
}

@Article{Bratanov:2009:VMW,
  author =       "Stanislav Bratanov and Roman Belenov and Nikita
                 Manovich",
  title =        "Virtual machines: a whole new world for performance
                 analysis",
  journal =      j-OPER-SYS-REV,
  volume =       "43",
  number =       "2",
  pages =        "46--55",
  month =        apr,
  year =         "2009",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/1531793.1531802",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Thu Apr 23 19:43:22 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This article addresses a problem of performance
                 monitoring inside virtual machines (VMs). It advocates
                 focused monitoring of particular virtualized programs,
                 explains the need for and the importance of such an
                 approach to performance monitoring in virtualized
                 execution environments, and emphasizes its benefits for
                 virtual machine manufacturers, virtual machine users
                 (mostly, software developers) and hardware (processor)
                 manufacturers. The article defines the problem of in-VM
                 performance monitoring as the ability to employ modern
                 methods and hardware performance monitoring
                 capabilities inside virtual machines to an extent
                 comparable with what is being done in real
                 environments. Unfortunately, there are numerous reasons
                 preventing us from achieving such an ambitious goal,
                 one of those reasons being the lack of support from
                 virtualization engines; that is why a novel method of
                 'cooperative' performance data collection is disclosed.
                 The method implies collection of performance data at
                 physical hardware and simultaneous tracking of software
                 states inside a virtual machine. Each statistically
                 visible execution point of the virtualized software may
                 then be associated with information on real hardware
                 events. The method effectively enables time-based
                 sampling of virtualized workloads combined with
                 hardware event counting, is applicable to unmodified,
                 commercially available virtual machines, and has
                 competitive precision and overhead. The practical
                 significance and value of the method are further
                 illustrated by studying a parallel workload and
                 uncovering virtualization-specific performance issues
                 of multithreaded programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
  keywords =     "hardware performance event counters; virtual
                 machines",
}

@Article{Choi:2009:HCS,
  author =       "Seungryul Choi and Donald Yeung",
  title =        "Hill-climbing {SMT} processor resource distribution",
  journal =      j-TOCS,
  volume =       "27",
  number =       "1",
  pages =        "1:1--1:??",
  month =        feb,
  year =         "2009",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Feb 13 18:30:25 MST 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "The key to high performance in Simultaneous
                 MultiThreaded (SMT) processors lies in optimizing the
                 distribution of shared resources to active threads.
                 Existing resource distribution techniques optimize
                 performance only indirectly. They infer potential
                 performance bottlenecks by observing indicators, like
                 instruction occupancy or cache miss counts, and take
                 actions to try to alleviate them. While the corrective
                 actions are designed to improve performance, their
                 actual performance impact is not known since end
                 performance is never monitored. Consequently, potential
                 performance gains are lost whenever the corrective
                 actions do not effectively address the actual
                 bottlenecks occurring in the pipeline.\par

                 We propose a different approach to SMT resource
                 distribution that optimizes end performance directly.
                 Our approach observes the impact that resource
                 distribution decisions have on performance at runtime,
                 and feeds this information back to the resource
                 distribution mechanisms to improve future decisions. By
                 evaluating many different resource distributions, our
                 approach tries to learn the best distribution over
                 time. Because we perform learning online, learning time
                 is crucial. We develop a hill-climbing algorithm that
                 quickly learns the best distribution of resources by
                 following the performance gradient within the resource
                 distribution space. We also develop several ideal
                 learning algorithms to enable deeper insights through
                 limit studies.\par

                 This article conducts an in-depth investigation of
                 hill-climbing SMT resource distribution using a
                 comprehensive suite of 63 multiprogrammed workloads.
                 Our results show hill-climbing outperforms ICOUNT,
                 FLUSH, and DCRA (three existing SMT techniques) by
                 11.4\%, 11.5\%, and 2.8\%, respectively, under the
                 weighted IPC metric. A limit study conducted using our
                 ideal learning algorithms shows our approach can
                 potentially outperform the same techniques by 19.2\%,
                 18.0\%, and 7.6\%, respectively, thus demonstrating
                 additional room exists for further improvement. Using
                 our ideal algorithms, we also identify three
                 bottlenecks that limit online learning speed: local
                 maxima, phased behavior, and interepoch jitter. We
                 define metrics to quantify these learning bottlenecks,
                 and characterize the extent to which they occur in our
                 workloads. Finally, we conduct a sensitivity study, and
                 investigate several extensions to improve our
                 hill-climbing technique.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Book{Cormen:2009:IA,
  author =       "Thomas H. Cormen and Charles Eric Leiserson and Ronald
                  L. Rivest and Clifford Stein",
  title =        "Introduction to algorithms",
  publisher =    pub-MIT,
  address =      pub-MIT:adr,
  edition =      "Third",
  pages =        "xix + 1292",
  year =         "2009",
  ISBN =         "0-262-03384-4 (hardcover), 0-262-53305-7 (paperback)",
  ISBN-13 =      "978-0-262-03384-8 (hardcover), 978-0-262-53305-8
                 (paperback)",
  LCCN =         "QA76.6 .C662 2009",
  bibdate =      "Thu Sep 9 14:42:33 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 z3950.loc.gov:7090/Voyager",
  abstract =     "Some books on algorithms are rigorous but incomplete;
                 others cover masses of material but lack rigor.
                 Introduction to Algorithms uniquely combines rigor and
                 comprehensiveness. The book covers a broad range of
                 algorithms in depth, yet makes their design and
                 analysis accessible to all levels of readers. Each
                 chapter is relatively self-contained and can be used as
                 a unit of study. The algorithms are described in
                 English and in a pseudocode designed to be readable by
                 anyone who has done a little programming. The
                 explanations have been kept elementary without
                 sacrificing depth of coverage or mathematical rigor.
                 The first edition became a widely used text in
                 universities worldwide as well as the standard
                 reference for professionals. The second edition
                 featured new chapters on the role of algorithms,
                 probabilistic analysis and randomized algorithms, and
                 linear programming. The third edition has been revised
                 and updated throughout. It includes two completely new
                 chapters, on van Emde Boas trees and multithreaded
                 algorithms, and substantial additions to the chapter on
                 recurrences (now called ``Divide-and-Conquer''). It
                 features improved treatment of dynamic programming and
                 greedy algorithms and a new notion of edge-based flow
                 in the material on flow networks. Many new exercises
                 and problems have been added for this edition.",
  acknowledgement = ack-nhfb,
  libnote =      "Not in my library.",
  subject =      "Computer programming; Computer algorithms",
}

@Article{Daniluk:2009:MTS,
  author =       "Andrzej Daniluk",
  title =        "Multithreaded transactions in scientific computing.
                 {The} {Growth06\_v2} program",
  journal =      j-COMP-PHYS-COMM,
  volume =       "180",
  number =       "7",
  pages =        "1219--1220",
  month =        jul,
  year =         "2009",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2009.01.024",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Mon Feb 13 23:42:43 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465509000393",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{deBoer:2009:SVC,
  author =       "F. S. de Boer",
  title =        "A shared-variable concurrency analysis of
                 multi-threaded object-oriented programs",
  journal =      j-THEOR-COMP-SCI,
  volume =       "410",
  number =       "2--3",
  pages =        "128--141",
  day =          "6",
  month =        feb,
  year =         "2009",
  CODEN =        "TCSCDI",
  ISSN =         "0304-3975 (print), 1879-2294 (electronic)",
  ISSN-L =       "0304-3975",
  bibdate =      "Mon Mar 28 21:21:46 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/03043975",
  acknowledgement = ack-nhfb,
  fjournal =     "Theoretical Computer Science",
  journal-URL =  "http://www.sciencedirect.com/science/journal/03043975",
}

@Article{Desai:2009:AIC,
  author =       "Aniruddha Desai and Jugdutt Singh",
  title =        "Architecture Independent Characterization of Embedded
                 {Java} Workloads",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "29--32",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "This paper presents architecture independent
                 characterization of embedded Java workloads based on
                 the industry standard GrinderBench benchmark which
                 includes different classes of real world embedded Java
                 applications. This work is based on a custom built
                 embedded Java Virtual Machine (JVM) simulator
                 specifically designed for embedded JVM modeling and
                 embodies domain specific details such as thread
                 scheduling, algorithms used for native CLDC APIs and
                 runtime data structures optimized for use in embedded
                 systems. The results presented include dynamic
                 execution characteristics, dynamic bytecode instruction
                 mix, application and API workload distribution, Object
                 allocation statistics, instruction-set coverage, memory
                 usage statistics and method code and stack frame
                 characteristics.",
  acknowledgement = ack-nhfb,
  affiliation =  "Desai, A (Reprint Author), La Trobe Univ, Bundoora,
                 Vic 3086, Australia. Desai, Aniruddha; Singh, Jugdutt,
                 La Trobe Univ, Bundoora, Vic 3086, Australia.",
  author-email = "desai@ieee.org",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Algorithm design and analysis; application program
                 interfaces; architecture independent characterization;
                 CLDC API; custom built embedded Java virtual machine
                 simulator; data structures; Data structures; Design
                 optimization; dynamic bytecode instruction mix; dynamic
                 execution characteristics; embedded Java workload;
                 Embedded Systems; embedded systems; Embedded Systems;
                 industry standard GrinderBench benchmark; instruction
                 sets; instruction-set coverage; Java; Java bytecode;
                 Job shop scheduling; JVM; memory usage statistics;
                 method code characteristics; multi-threading; object
                 allocation statistics; Runtime; runtime data structure;
                 scheduling; Scheduling algorithm; stack frame
                 characteristics; Statistical distributions; storage
                 allocation; thread scheduling; virtual machines;
                 Virtual machining; Workload Characterization",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Desai:2009:AIC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Devietti:2009:DDS,
  author =       "Joseph Devietti and Brandon Lucia and Luis Ceze and
                 Mark Oskin",
  title =        "{DMP}: deterministic shared memory multiprocessing",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "3",
  pages =        "85--96",
  month =        mar,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1508244.1508255",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:39:26 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Current shared memory multicore and multiprocessor
                 systems are nondeterministic. Each time these systems
                 execute a multithreaded application, even if supplied
                 with the same input, they can produce a different
                 output. This frustrates debugging and limits the
                 ability to properly test multithreaded code, becoming a
                 major stumbling block to the much-needed widespread
                 adoption of parallel programming.\par

                 In this paper we make the case for fully deterministic
                 shared memory multiprocessing (DMP). The behavior of an
                 arbitrary multithreaded program on a DMP system is only
                 a function of its inputs. The core idea is to make
                 inter-thread communication fully deterministic.
                 Previous approaches to coping with nondeterminism in
                 multithreaded programs have focused on replay, a
                 technique useful only for debugging. In contrast, while
                 DMP systems are directly useful for debugging by
                 offering repeatability by default, we argue that
                 parallel programs should execute deterministically in
                 the field as well. This has the potential to make
                 testing more assuring and increase the reliability of
                 deployed multithreaded software. We propose a range of
                 approaches to enforcing determinism and discuss their
                 implementation trade-offs. We show that determinism can
                 be provided with little performance cost using our
                 architecture proposals on future hardware, and that
                 software-only approaches can be utilized on existing
                 systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "debugging; determinism; multicores; parallel
                 programming",
}

@Article{Eyerman:2009:MLP,
  author =       "Stijn Eyerman and Lieven Eeckhout",
  title =        "Memory-level parallelism aware fetch policies for
                 simultaneous multithreading processors",
  journal =      j-TACO,
  volume =       "6",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1509864.1509867",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu May 7 14:55:25 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "A thread executing on a simultaneous multithreading
                 (SMT) processor that experiences a long-latency load
                 will eventually stall while holding execution
                 resources. Existing long-latency load aware SMT fetch
                 policies limit the amount of resources allocated by a
                 stalled thread by identifying long-latency loads and
                 preventing the thread from fetching more instructions
                 --- and in some implementations, instructions beyond
                 the long-latency load are flushed to release allocated
                 resources.\par

                 This article proposes an SMT fetch policy that takes
                 into account the available memory-level parallelism
                 (MLP) in a thread. The key idea proposed in this
                 article is that in case of an isolated long-latency
                 load (i.e., there is no MLP), the thread should be
                 prevented from allocating additional resources.
                 However, in case multiple independent long-latency
                 loads overlap (i.e., there is MLP), the thread should
                 allocate as many resources as needed in order to fully
                 expose the available MLP. MLP-aware fetch policies
                 achieve better performance for MLP-intensive threads on
                 SMT processors, leading to higher overall system
                 throughput and shorter average turnaround time than
                 previously proposed fetch policies.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "Fetch Policy; Memory-Level Parallelism (MLP);
                 Simultaneous Multithreading (SMT)",
}

@Article{Eyerman:2009:PTC,
  author =       "Stijn Eyerman and Lieven Eeckhout",
  title =        "Per-thread cycle accounting in {SMT} processors",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "3",
  pages =        "133--144",
  month =        mar,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1508284.1508260",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:39:26 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper proposes a cycle accounting architecture
                 for Simultaneous Multithreading (SMT) processors that
                 estimates the execution times for each of the threads
                 had they been executed alone, while they are running
                 simultaneously on the SMT processor. This is done by
                 accounting each cycle to either a base, miss event or
                 waiting cycle component during multi-threaded
                 execution. Single-threaded alone execution time is then
                 estimated as the sum of the base and miss event
                 components; the waiting cycle component represents the
                 lost cycle count due to SMT execution. The cycle
                 accounting architecture incurs reasonable hardware cost
                 (around 1KB of storage) and estimates single-threaded
                 performance with average prediction errors around 7.2\%
                 for two-program workloads and 11.7\% for four-program
                 workloads.\par

                 The cycle accounting architecture has several important
                 applications to system software and its interaction
                 with SMT hardware. For one, the estimated single-thread
                 alone execution time provides an accurate picture to
                 system software of the actually consumed processor
                 cycles per thread. The alone execution time instead of
                 the total execution time (timeslice) may make system
                 software scheduling policies more effective. Second, a
                 new class of thread-progress aware SMT fetch policies
                 based on per-thread progress indicators enable system
                 software level priorities to be enforced at the
                 hardware level.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "cycle accounting; simultaneous multithreading (SMT);
                 thread-progress aware fetch policy",
}

@Article{Flanagan:2009:FEP,
  author =       "Cormac Flanagan and Stephen N. Freund",
  title =        "{FastTrack}: efficient and precise dynamic race
                 detection",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "6",
  pages =        "121--133",
  month =        jun,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1542476.1542490",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:41:16 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2000.bib",
  abstract =     "Multithreaded programs are notoriously prone to race
                 conditions. Prior work on dynamic race detectors
                 includes fast but imprecise race detectors that report
                 false alarms, as well as slow but precise race
                 detectors that never report false alarms. The latter
                 typically use expensive vector clock operations that
                 require time linear in the number of program
                 threads.\par

                 This paper exploits the insight that the full
                 generality of vector clocks is unnecessary in most
                 cases. That is, we can replace heavyweight vector
                 clocks with an adaptive lightweight representation
                 that, for almost all operations of the target program,
                 requires only constant space and supports constant-time
                 operations. This representation change significantly
                 improves time and space performance, with no loss in
                 precision.\par

                 Experimental results on Java benchmarks including the
                 Eclipse development environment show that our FastTrack
                 race detector is an order of magnitude faster than a
                 traditional vector-clock race detector, and roughly
                 twice as fast as the high-performance DJIT+ algorithm.
                 FastTrack is even comparable in speed to Eraser on our
                 Java benchmarks, while never reporting false alarms.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "concurrency; dynamic analysis; race conditions",
}

@Article{Fung:2009:DWF,
  author =       "Wilson W. L. Fung and Ivan Sham and George Yuan and
                 Tor M. Aamodt",
  title =        "Dynamic warp formation: {Efficient MIMD} control flow
                 on {SIMD} graphics hardware",
  journal =      j-TACO,
  volume =       "6",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1543753.1543756",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jul 2 12:32:04 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Recent advances in graphics processing units (GPUs)
                 have resulted in massively parallel hardware that is
                 easily programmable and widely available in today's
                 desktop and notebook computer systems. GPUs typically
                 use single-instruction, multiple-data (SIMD) pipelines
                 to achieve high performance with minimal overhead for
                 control hardware. Scalar threads running the same
                 computing kernel are grouped together into SIMD
                 batches, sometimes referred to as warps. While SIMD is
                 ideally suited for simple programs, recent GPUs include
                 control flow instructions in the GPU instruction set
                 architecture and programs using these instructions may
                 experience reduced performance due to the way branch
                 execution is supported in hardware. One solution is to
                 add a stack to allow different SIMD processing elements
                 to execute distinct program paths after a branch
                 instruction. The occurrence of diverging branch
                 outcomes for different processing elements
                 significantly degrades performance using this approach.
                 In this article, we propose dynamic warp formation and
                 scheduling, a mechanism for more efficient SIMD branch
                 execution on GPUs. It dynamically regroups threads into
                 new warps on the fly following the occurrence of
                 diverging branch outcomes. We show that a realistic
                 hardware implementation of this mechanism improves
                 performance by 13\%, on average, with 256 threads per
                 core, 24\% with 512 threads, and 47\% with 768 threads
                 for an estimated area increase of 8\%.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "control flow; fine-grained multithreading; GPU; SIMD",
}

@Article{Gabor:2009:SLA,
  author =       "Ron Gabor and Avi Mendelson and Shlomo Weiss",
  title =        "Service level agreement for multithreaded processors",
  journal =      j-TACO,
  volume =       "6",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1543753.1543755",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jul 2 12:32:04 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multithreading is widely used to increase processor
                 throughput. As the number of shared resources increase,
                 managing them while guaranteeing predicted performance
                 becomes a major problem. Attempts have been made in
                 previous work to ease this via different fairness
                 mechanisms. In this article, we present a new approach
                 to control the resource allocation and sharing via a
                 service level agreement (SLA)-based mechanism; that is,
                 via an agreement in which multithreaded processors
                 guarantee a minimal level of service to the running
                 threads. We introduce a new metric, {\em C\/}$_{SLA}$,
                 for conformance to SLA in multithreaded processors and
                  show that controlling resources using SLA allows
                 for higher gains than are achievable by previously
                 suggested fairness techniques. It also permits
                 improving one metric (e.g., power) while maintaining
                 SLA in another (e.g., performance). We compare SLA
                 enforcement to schemes based on other fairness metrics,
                 which are mostly targeted at equalizing execution
                 parameters. We show that using SLA rather than fairness
                 based algorithms provides a range of acceptable
                 execution points from which we can select the point
                 that best fits our optimization target, such as
                 maximizing the weighted speedup (sum of the speedups of
                 the individual threads) or reducing power. We
                 demonstrate the effectiveness of the new SLA approach
                 using switch-on-event (coarse-grained) multithreading.
                 Our weighted speedup improvement scheme successfully
                 enforces SLA while improving the weighted speedup by an
                 average of 10\% for unbalanced threads. This result is
                 significant when compared with performance losses that
                 may be incurred by fairness enforcement methods. When
                 optimizing for power reduction in unbalanced threads
                 SLA enforcement reduces the power by an average of
                 15\%. SLA may be complemented by other power reduction
                 methods to achieve further power savings {\em and\/}
                 maintain the same service level for the threads. We
                 also demonstrate differentiated SLA, where weighted
                 speedup is maximized while each thread may have a
                 different throughput constraint.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "fairness; performance; power; Service level agreement;
                 throughput",
}

@Article{Ganty:2009:VLA,
  author =       "Pierre Ganty and Rupak Majumdar and Andrey
                 Rybalchenko",
  title =        "Verifying liveness for asynchronous programs",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "1",
  pages =        "102--113",
  month =        jan,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1594834.1480895",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 9 08:40:38 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Asynchronous or `event-driven' programming is a
                 popular technique to efficiently and flexibly manage
                 concurrent interactions. In these programs, the
                 programmer can post tasks that get stored in a task
                 buffer and get executed atomically by a non-preemptive
                 scheduler at a future point. We give a decision
                 procedure for the fair termination property of
                 asynchronous programs. The fair termination problem
                 asks, given an asynchronous program and a fairness
                 condition on its executions, does the program always
                 terminate on fair executions? The fairness assumptions
                 rule out certain undesired bad behaviors, such as where
                 the scheduler ignores a set of posted tasks forever, or
                 where a non-deterministic branch is always chosen in
                 one direction. Since every liveness property reduces to
                 a fair termination property, our decision procedure
                 extends to liveness properties of asynchronous
                 programs. Our decision procedure for the fair
                 termination of asynchronous programs assumes all
                 variables are finite-state. Even though variables are
                 finite-state, asynchronous programs can have an
                 unbounded stack from recursive calls made by tasks, as
                 well as an unbounded task buffer of pending tasks. We
                 show a reduction from the fair termination problem for
                 asynchronous programs to fair termination problems on
                 Petri Nets, and our main technical result is a
                 reduction of the latter problem to Presburger
                 satisfiability. Our decidability result is in contrast
                 to multithreaded recursive programs, for which liveness
                 properties are undecidable. While we focus on fair
                 termination, we show our reduction to Petri Nets can be
                 used to prove related properties such as fair
                 nonstarvation (every posted task is eventually
                 executed) and safety properties such as boundedness
                 (find a bound on the maximum number of posted tasks
                 that can be in the task buffer at any point).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "asynchronous (event-driven) programming; fair
                 termination; liveness; Petri nets",
}

@TechReport{Granat:2009:NPQ,
  author =       "Robert Granat and Bo K{\aa}gstr{\"o}m and Daniel
                 Kressner",
  title =        "A novel parallel {$ Q R $} algorithm for hybrid
                 distributed memory {HPC} systems",
  type =         "LAPACK Working Note",
  number =       "216",
  institution =  "Department of Computing Science and HPC2N",
  address =      "Ume{\aa} University, S-901 87 Ume{\aa}, Sweden",
  month =        apr,
  year =         "2009",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn216.pdf",
  abstract =     "A novel variant of the parallel QR algorithm for
                 solving dense nonsymmetric eigenvalue problems on
                 hybrid distributed high performance computing (HPC)
                 systems is presented. For this purpose, we introduce
                 the concept of multi-window bulge chain chasing and
                 parallelize aggressive early deflation. The
                 multi-window approach ensures that most computations
                 when chasing chains of bulges are performed in level 3
                 BLAS operations, while the aim of aggressive early
                 deflation is to speed up the convergence of the QR
                 algorithm. Mixed MPI-OpenMP coding techniques are
                 utilized for porting the codes to distributed memory
                 platforms with multithreaded nodes, such as multicore
                 processors. Numerous numerical experiments confirm the
                 superior performance of our parallel QR algorithm in
                 comparison with the existing ScaLAPACK code, leading to
                 an implementation that is one to two orders of
                 magnitude faster for sufficiently large problems,
                 including a number of examples from applications.",
  acknowledgement = ack-nhfb,
  keywords =     "aggressive early deflation; bulge chasing; Eigenvalue
                  problem; hybrid distributed memory systems; level 3
                 performance; multishift; nonsymmetric QR algorithm;
                 parallel algorithms; parallel computations",
  utknumber =    "UMINF-09.06",
}

@Article{Grant:2009:IEE,
  author =       "Ryan E. Grant and Ahmad Afsahi",
  title =        "Improving energy efficiency of asymmetric chip
                 multithreaded multiprocessors through reduced {OS}
                 noise scheduling",
  journal =      j-CCPE,
  volume =       "21",
  number =       "18",
  pages =        "2355--2376",
  day =          "25",
  month =        dec,
  year =         "2009",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1454",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:40 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "8 Jul 2009",
}

@Article{Guz:2009:MCV,
  author =       "Zvika Guz and Evgeny Bolotin and Idit Keidar and
                 Avinoam Kolodny and Avi Mendelson and Uri C. Weiser",
  title =        "Many-Core vs. Many-Thread Machines: Stay Away From the
                 Valley",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "25--28",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We study the tradeoffs between Many-Core machines like
                 Intel's Larrabee and Many-Thread machines like Nvidia
                 and AMD GPGPUs. We define a unified model describing a
                 superposition of the two architectures, and use it to
                 identify operation zones for which each machine is more
                 suitable. Moreover, we identify an intermediate zone in
                 which both machines deliver inferior performance. We
                 study the shape of this ``performance valley'' and
                 provide insights on how it can be avoided.",
  acknowledgement = ack-nhfb,
  affiliation =  "Guz, Z (Reprint Author), Technion Israel Inst Technol,
                 EE Dept, IL-32000 Haifa, Israel. Guz, Zvika; Keidar,
                 Idit; Kolodny, Avinoam; Weiser, Uri C., Technion Israel
                 Inst Technol, EE Dept, IL-32000 Haifa, Israel. Bolotin,
                 Evgeny, Intel Corp, Santa Clara, CA 95051 USA.
                 Mendelson, Avi, Microsoft Corp, Redmond, WA 98052
                 USA.",
  author-email = "zguz@tx.technion.ac.il evgeny.bolotin@intel.com
                 idish@ee.technion.ac.il kolodny@ee.technion.ac.il
                 avim@microsoft.com uri.weiser@ee.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Semiconductors Research Corporation (SRC);
                 Intel; Israeli Ministry of Science Knowledge Center on
                 Chip MultiProcessors",
  funding-text = "We thank Ronny Ronen, Michael Behar, and Roni Rosner.
                 This work was partially supported by Semiconductors
                 Research Corporation (SRC), Intel, and the Israeli
                 Ministry of Science Knowledge Center on Chip
                 MultiProcessors.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "AMD GPGPU; architecture superposition; Bandwidth; Chip
                 Multiprocessors; Computer Systems; coprocessors; Delay;
                  Engines; Equations; GPGPU; Graphics; Intel's
                 Larrabee; many-core machines; many-thread machines;
                 Multi-core/single-chip multiprocessors;
                 multi-threading; multiprocessing systems; Nvidia GPGPU;
                 Parallel Architectures; parallel architectures;
                 Parallel processing; performance valley; Processor
                 Architectures; Shape",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  times-cited =  "27",
  unique-id =    "Guz:2009:MCV",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Hoffman:2009:SAT,
  author =       "Kevin J. Hoffman and Patrick Eugster and Suresh
                 Jagannathan",
  title =        "Semantics-aware trace analysis",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "6",
  pages =        "453--464",
  month =        jun,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1542476.1542527",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:41:16 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "As computer systems continue to become more powerful
                 and complex, so do programs. High-level abstractions
                 introduced to deal with complexity in large programs,
                 while simplifying human reasoning, can often obfuscate
                 salient program properties gleaned from automated
                 source-level analysis through subtle (often non-local)
                 interactions. Consequently, understanding the effects
                 of program changes and whether these changes violate
                 intended protocols become difficult to infer.
                 Refactorings, and feature additions, modifications, or
                 removals can introduce hard-to-catch bugs that often go
                 undetected until many revisions later.\par

                 To address these issues, this paper presents a novel
                 dynamic program analysis that builds a {\em semantic
                 view\/} of program executions. These views reflect
                 program abstractions and aspects; however, views are
                 not simply projections of execution traces, but are
                 linked to each other to capture semantic interactions
                 among abstractions at different levels of granularity
                 in a scalable manner.\par

                 We describe our approach in the context of Java and
                 demonstrate its utility to improve {\em regression
                 analysis}. We first formalize a subset of Java and a
                 grammar for traces generated at program execution. We
                 then introduce several types of views used to analyze
                 regression bugs along with a novel, scalable technique
                 for semantic differencing of traces from different
                 versions of the same program. Benchmark results on
                 large open-source Java programs demonstrate that
                 semantic-aware trace differencing can identify precise
                 and useful details about the underlying cause for a
                 regression, even in programs that use reflection,
                 multithreading, or dynamic code generation, features
                 that typically confound other analysis techniques.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "automated debugging; debugging aids; semantic tracing;
                 testing tools; trace views; tracing",
}

@Article{Joshi:2009:RDP,
  author =       "Pallavi Joshi and Chang-Seo Park and Koushik Sen and
                 Mayur Naik",
  title =        "A randomized dynamic program analysis technique for
                 detecting real deadlocks",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "6",
  pages =        "110--120",
  month =        jun,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1543135.1542489",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:41:16 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We present a novel dynamic analysis technique that
                 finds real deadlocks in multi-threaded programs. Our
                 technique runs in two stages. In the first stage, we
                 use an imprecise dynamic analysis technique to find
                 potential deadlocks in a multi-threaded program by
                 observing an execution of the program. In the second
                 stage, we control a random thread scheduler to create
                 the potential deadlocks with high probability. Unlike
                 other dynamic analysis techniques, our approach has the
                 advantage that it does not give any false warnings. We
                 have implemented the technique in a prototype tool for
                 Java, and have experimented on a number of large
                 multi-threaded Java programs. We report a number of
                 previously known and unknown real deadlocks that were
                 found in these benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "active testing; concurrency; deadlock detection;
                 dynamic program analysis",
}

@Article{Kejariwal:2009:ELL,
  author =       "Arun Kejariwal and Alexander V. Veidenbaum and
                 Alexandru Nicolau and Milind Girkar and Xinmin Tian and
                 Hideki Saito",
  title =        "On the exploitation of loop-level parallelism in
                 embedded applications",
  journal =      j-TECS,
  volume =       "8",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1457255.1457257",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Feb 5 19:15:05 MST 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Advances in the silicon technology have enabled
                 increasing support for hardware parallelism in embedded
                 processors. Vector units, multiple processors/cores,
                 multithreading, special-purpose accelerators such as
                 DSPs or cryptographic engines, or a combination of the
                 above have appeared in a number of processors. They
                 serve to address the increasing performance
                 requirements of modern embedded applications. To what
                 extent the available hardware parallelism can be
                 exploited is directly dependent on the amount of
                 parallelism inherent in the given application and the
                 congruence between the granularity of hardware and
                 application parallelism. This paper discusses how
                 loop-level parallelism in embedded applications can be
                 exploited in hardware and software. Specifically, it
                 evaluates the efficacy of automatic loop
                 parallelization and the performance potential of
                 different types of parallelism, viz., true thread-level
                 parallelism (TLP), speculative thread-level parallelism
                 and vector parallelism, when executing loops.
                 Additionally, it discusses the interaction between
                 parallelization and vectorization. Applications from
                 both the industry-standard EEMBC{\reg},$^1$ 1.1, EEMBC
                 2.0 and the academic MiBench embedded benchmark suites
                 are analyzed using the Intel{\reg}$^2$ C compiler. The
                 results show the performance that can be achieved today
                 on real hardware and using a production compiler,
                 provide upper bounds on the performance potential of
                 the different types of thread-level parallelism, and
                 point out a number of issues that need to be addressed
                 to improve performance. The latter include
                 parallelization of libraries such as libc and design of
                 parallel algorithms to allow maximal exploitation of
                 parallelism. The results also point to the need for
                 developing new benchmark suites more suitable to
                 parallel compilation and execution.\par

                 $^1$ Other names and brands may be claimed as the
                 property of others.\par

                 $^2$ Intel is a trademark of Intel Corporation or its
                 subsidiaries in the United States and other
                 countries.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?&idx=J840",
  keywords =     "libraries; Multi-cores; multithreading; parallel
                 loops; programming models; system-on-chip (Soc);
                 thread-level speculation; vectorization",
}

@Article{Kejariwal:2009:PSA,
  author =       "Arun Kejariwal and Calin Cas{\c{c}}aval",
  title =        "Parallelization spectroscopy: analysis of thread-level
                 parallelism in {HPC} programs",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "4",
  pages =        "293--294",
  month =        apr,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1594835.1504221",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 9 08:40:49 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In this paper, we present a method --- parallelization
                 spectroscopy --- for analyzing the thread-level
                 parallelism available in production High Performance
                 Computing (HPC) codes. We survey a number of techniques
                 that are commonly used for parallelization and classify
                 all the loops in the case study presented using a
                  sensitivity metric: how likely is a particular
                  technique successful in parallelizing the loop.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "loop transformations; multithreading; parallelism",
}

@Article{Kunal:2009:HDS,
  author =       "K. Kunal and K. George and M. Gautam and V. Kamakoti",
  title =        "{HTM} design spaces: complete decoupling from caches
                 and achieving highly concurrent transactions",
  journal =      j-OPER-SYS-REV,
  volume =       "43",
  number =       "2",
  pages =        "98--99",
  month =        apr,
  year =         "2009",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/1531793.1531809",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Thu Apr 23 19:43:22 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper proposes a Hardware Transactional Memory
                 (HTM) design for multi-core environments. Using a novel
                 technique to keep track of transactional read-write
                 entries, the design provides a holistic and scalable
                 solution to Transactional Memory (TM) implementation
                 issues of context switching, process migration and
                 overflow handling. Another aspect of the design is that
                 it allows transactions to run in a highly concurrent
                 manner by using special techniques to handle conflict
                 resolution, conflict detection and overflows. The
                 feasibility and validity of the proposed design are
                 demonstrated by developing a synthesizable Hardware
                 Description Language (HDL) model of the design and also
                 experimenting on the same with standard benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
  keywords =     "context switching; hardware transactional memory;
                 multi-threaded cores; operating systems; overflow
                 handling; process migration",
}

%%% NOTE (review): the abstract in the following entry (Kurzak:2009:SLA)
%%% appears to be that of Volkov and Demmel, ``Benchmarking GPUs to tune
%%% dense linear algebra'' (LAWN 202 / SC08) --- it discusses 8-series
%%% NVIDIA GPUs and CUBLAS 1.1, which do not match the entry's title
%%% ``Scheduling Linear Algebra Operations on Multicore Processors''
%%% (LAWN 213).  Verify against the PDF cited in the URL field and
%%% replace the abstract with the correct one.
@TechReport{Kurzak:2009:SLA,
  author =       "Jakub Kurzak and Hatem Ltaief and Jack Dongarra and
                 Rosa M. Badia",
  title =        "Scheduling Linear Algebra Operations on Multicore
                 Processors",
  type =         "LAPACK Working Note",
  number =       "213",
  institution =  inst-UT-CS,
  address =      inst-UT-CS:adr,
  month =        feb,
  year =         "2009",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn213.pdf",
  abstract =     "We present performance results for dense linear
                 algebra using the 8-series NVIDIA GPUs. Our
                 matrix-matrix multiply routine (GEMM) runs 60\% faster
                 than the vendor implementation in CUBLAS 1.1 and
                 approaches the peak of hardware capabilities. Our LU,
                 QR and Cholesky factorizations achieve up to 80--90\%
                 of the peak GEMM rate. Our parallel LU running on two
                 GPUs achieves up to $ \approx $300 Gflop/s. These
                 results are accomplished by challenging the accepted
                  view of the GPU architecture and programming
                  guidelines. We argue that modern GPUs
                 should be viewed as multithreaded multicore vector
                 units. We exploit blocking similarly to vector
                 computers and heterogeneity of the system by computing
                 both on GPU and CPU. This study includes detailed
                 benchmarking of the GPU memory system that reveals
                 sizes and latencies of caches and TLB. We present a
                 couple of algorithmic optimizations aimed at increasing
                 parallelism and regularity in the problem that provide
                 us with slightly higher performance.",
  acknowledgement = ack-nhfb,
  keywords =     "Cholesky; factorization; linear algebra; LU;
                 multicore; QR; scheduling; task graph",
  utknumber =    "UT-CS-09-636",
}

@Article{Lee:2009:MHF,
  author =       "Taehee Lee and Tobias H{\"o}llerer",
  title =        "Multithreaded Hybrid Feature Tracking for Markerless
                 Augmented Reality",
  journal =      j-IEEE-TRANS-VIS-COMPUT-GRAPH,
  volume =       "15",
  number =       "3",
  pages =        "355--368",
  month =        may # "\slash " # jun,
  year =         "2009",
  CODEN =        "ITVGEA",
  DOI =          "https://doi.org/10.1109/TVCG.2008.190",
  ISSN =         "1077-2626 (print), 1941-0506 (electronic), 2160-9306",
  ISSN-L =       "1077-2626",
  bibdate =      "Thu Jul 2 10:22:33 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetransviscomputgraph.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Visualization and Computer
                 Graphics",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2945",
}

@Article{Lenharth:2009:RDO,
  author =       "Andrew Lenharth and Vikram S. Adve and Samuel T.
                 King",
  title =        "Recovery domains: an organizing principle for
                 recoverable operating systems",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "3",
  pages =        "49--60",
  month =        mar,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1508284.1508251",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:39:26 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We describe a strategy for enabling existing commodity
                 operating systems to recover from unexpected run-time
                 errors in nearly any part of the kernel, including core
                 kernel components. Our approach is dynamic and
                 request-oriented; it isolates the effects of a fault to
                 the requests that caused the fault rather than to
                 static kernel components. This approach is based on a
                 notion of 'recovery domains,' an organizing principle
                 to enable rollback of state affected by a request in a
                 multithreaded system with minimal impact on other
                 requests or threads. We have applied this approach on
                 v2.4.22 and v2.6.27 of the Linux kernel and it required
                 132 lines of changed or new code: the other changes are
                 all performed by a simple instrumentation pass of a
                 compiler. Our experiments show that the approach is
                 able to recover from otherwise fatal faults with
                 minimal collateral impact during a recovery event.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "akeso; automatic fault recovery; recovery domains",
}

@Article{Lubbers:2009:RMP,
  author =       "Enno L{\"u}bbers and Marco Platzner",
  title =        "{ReconOS}: {Multithreaded} programming for
                 reconfigurable computers",
  journal =      j-TECS,
  volume =       "9",
  number =       "1",
  pages =        "8:1--8:??",
  month =        oct,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Mon Mar 15 18:40:57 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tecs.bib",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?&idx=J840",
}

@Article{Madriles:2009:BST,
  author =       "Carlos Madriles and Pedro L{\'o}pez and Josep M.
                 Codina and Enric Gibert and Fernando Latorre and
                 Alejandro Martinez and Ra{\'u}l Martinez and Antonio
                 Gonzalez",
  title =        "Boosting single-thread performance in multi-core
                 systems through fine-grain multi-threading",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "474--483",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555813",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Industry has shifted towards multi-core designs as we
                 have hit the memory and power walls. However, single
                 thread performance remains of paramount importance
                 since some applications have limited thread-level
                 parallelism (TLP), and even a small part with limited
                 TLP impose important constraints to the global
                 performance, as explained by Amdahl's law.\par

                 In this paper we propose a novel approach for
                 leveraging multiple cores to improve single-thread
                 performance in a multi-core design. The proposed
                 technique features a set of novel hardware mechanisms
                 that support the execution of threads generated at
                 compile time. These threads result from a fine-grain
                 speculative decomposition of the original application
                 and they are executed under a modified multi-core
                 system that includes: (1) mechanisms to support
                 multiple versions; (2) mechanisms to detect violations
                 among threads; (3) mechanisms to reconstruct the
                 original sequential order; and (4) mechanisms to
                 checkpoint the architectural state and recovery to
                 handle misspeculations.\par

                 The proposed scheme outperforms previous hardware-only
                 schemes to implement the idea of combining cores for
                 executing single-thread applications in a multi-core
                 design by more than 10\% on average on Spec2006 for all
                 configurations. Moreover, single-thread performance is
                 improved by 41\% on average when the proposed scheme is
                 used on a Tiny Core, and up to 2.6x for some selected
                 applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  keywords =     "automatic parallelization; core-fusion; multicore;
                 single-thread performance; speculative multithreading;
                 thread-level parallelism",
}

@Article{Marino:2009:LES,
  author =       "Daniel Marino and Madanlal Musuvathi and Satish
                 Narayanasamy",
  title =        "{LiteRace}: effective sampling for lightweight
                 data-race detection",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "6",
  pages =        "134--143",
  month =        jun,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1542476.1542491",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:41:16 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Data races are one of the most common and subtle
                 causes of pernicious concurrency bugs. Static
                 techniques for preventing data races are overly
                 conservative and do not scale well to large programs.
                 Past research has produced several dynamic data race
                 detectors that can be applied to large programs. They
                 are precise in the sense that they only report actual
                 data races. However, dynamic data race detectors incur
                 a high performance overhead, slowing down a program's
                 execution by an order of magnitude.\par

                 In this paper we present LiteRace, a very lightweight
                 data race detector that samples and analyzes only
                 selected portions of a program's execution. We show
                 that it is possible to sample a multithreaded program
                 at a low frequency, and yet, find infrequently
                 occurring data races. We implemented LiteRace using
                 Microsoft's Phoenix compiler. Our experiments with
                 several Microsoft programs, Apache, and Firefox show
                 that LiteRace is able to find more than 70\% of data
                 races by sampling less than 2\% of memory accesses in a
                 given program execution.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "concurrency bugs; dynamic data race detection;
                 sampling",
}

@Article{Monchiero:2009:HSC,
  author =       "Matteo Monchiero and Jung Ho Ahn and Ayose Falc{\'o}n
                 and Daniel Ortega and Paolo Faraboschi",
  title =        "How to simulate 1000 cores",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "2",
  pages =        "10--19",
  month =        may,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1577129.1577133",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:39 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper proposes a novel methodology to efficiently
                 simulate shared-memory multiprocessors composed of
                 hundreds of cores. The basic idea is to use
                 thread-level parallelism in the software system and
                 translate it into core-level parallelism in the
                 simulated world. To achieve this, we first augment an
                 existing full-system simulator to identify and separate
                 the instruction streams belonging to the different
                 software threads. Then, the simulator dynamically maps
                 each instruction flow to the corresponding core of the
                 target multi-core architecture, taking into account the
                 inherent thread synchronization of the running
                 applications. Our simulator allows a user to execute
                 any multithreaded application in a conventional
                 full-system simulator and evaluate the performance of
                 the application on a many-core hardware. We carried out
                 extensive simulations on the SPLASH-2 benchmark suite
                 and demonstrated the scalability up to 1024 cores with
                 limited simulation speed degradation vs. the
                 single-core case on a fixed workload. The results also
                 show that the proposed technique captures the intrinsic
                 behavior of the SPLASH-2 suite, even when we scale up
                 the number of shared-memory cores beyond the
                 thousand-core limit.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Mukherjee:2009:PAS,
  author =       "Jayanta Mukherjee and Soumyendu Raha",
  title =        "Power-aware Speed-up for Multithreaded Numerical
                 Linear Algebraic Solvers on Chip Multicore Processors",
  journal =      j-SCPE,
  volume =       "10",
  number =       "2",
  pages =        "217--228",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "1895-1767",
  bibdate =      "Thu Sep 2 11:55:11 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.scpe.org/content/10/2.toc",
  URL =          "http://www.scpe.org/vols/vol10/no2/SCPE_10_2_07.pdf;
                 http://www.scpe.org/vols/vol10/no2/SCPE_10_2_07.zip",
  acknowledgement = ack-nhfb,
}

@Article{Musoll:2009:LSO,
  author =       "Enric Musoll",
  title =        "Leakage-saving opportunities in mesh-based massive
                 multi-core architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "5",
  pages =        "1--7",
  month =        dec,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1755235.1755237",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Apr 8 18:42:25 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "When processing multi-threaded workloads requiring
                 significant inter-thread communication, opportunities
                 to reduce power consumption arise due to the large
                 latencies in obtaining data from the threads running on
                 remote cores and the lack of architectural resources
                 implemented in the simple cores to cover these
                 latencies.\par

                 In this work we propose to use the drowsy mode
                 technique to save leakage power on the cores and
                 leverage the mesh-based communication fabric to hide
                 the wake-up latency of the core blocks. We have
                 observed a potential for reducing the overall power of
                 around 70\% in a generic homogeneous 256-core
                 tile-based multi-core architecture.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Neamtiu:2009:STU,
  author =       "Iulian Neamtiu and Michael Hicks",
  title =        "Safe and timely updates to multi-threaded programs",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "6",
  pages =        "13--24",
  month =        jun,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1543135.1542479",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:41:16 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Many dynamic updating systems have been developed that
                 enable a program to be patched while it runs, to fix
                 bugs or add new features. This paper explores
                 techniques for supporting dynamic updates to
                 multi-threaded programs, focusing on the problem of
                 applying an update in a timely fashion while still
                 producing correct behavior. Past work has shown that
                 this tension of {\em safety\/} versus timeliness can be
                 balanced for single-threaded programs. For
                 multi-threaded programs, the task is more difficult
                 because myriad thread interactions complicate
                 understanding the possible program states to which a
                 patch could be applied. Our approach allows the
                 programmer to specify a few program points (e.g., one
                 per thread) at which a patch may be applied, which
                 simplifies reasoning about safety. To improve
                 timeliness, a combination of static analysis and
                 run-time support automatically expands these few points
                 to many more that produce behavior equivalent to the
                 originals. Experiments with thirteen realistic updates
                 to three multi-threaded servers show that we can safely
                 perform a dynamic update within milliseconds when more
                 straightforward alternatives would delay some updates
                 indefinitely.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "dynamic software updating; multi-threading; update
                 safety; update timeliness",
}

@Article{Nicolau:2009:TEP,
  author =       "Alexandru Nicolau and Guangqiang Li and Arun
                 Kejariwal",
  title =        "Techniques for efficient placement of synchronization
                 primitives",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "4",
  pages =        "199--208",
  month =        apr,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1504176.1504207",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 9 08:40:49 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Harnessing the hardware parallelism of the emerging
                 multi-cores systems necessitates concurrent software.
                 Unfortunately, most of the existing mainstream software
                 is sequential in nature. Although one could
                 auto-parallelize a given program, the efficacy of this
                 is largely limited to floating-point codes. One of the
                 ways to alleviate the above limitation is to
                 parallelize programs, which cannot be
                 auto-parallelized, via explicit synchronization. In
                 this regard, efficient placement of the synchronization
                 primitives --- say, post, wait --- plays a key role in
                 achieving high degree of thread-level parallelism ({\em
                 TLP\/}). In this paper, we propose novel compiler
                 techniques for the above. Specifically, given a control
                 flow graph ({\em CFG\/}), the proposed techniques place
                 a post as early as possible and place a wait as late as
                 possible in the CFG, subject to dependences. We
                 demonstrate the efficacy of our techniques, on a real
                 machine, using real codes, specifically, from the
                 industry-standard SPEC CPU benchmarks, the Linux kernel
                 and other widely used open source codes. Our results
                 show that the proposed techniques yield significantly
                 higher levels of TLP than the state-of-the-art.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "compilers; multithreading; parallelization;
                 performance",
}

@Article{Olszewski:2009:KED,
  author =       "Marek Olszewski and Jason Ansel and Saman
                 Amarasinghe",
  title =        "{Kendo}: efficient deterministic multithreading in
                 software",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "3",
  pages =        "97--108",
  month =        mar,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1508244.1508256",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:39:26 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Although chip-multiprocessors have become the industry
                 standard, developing parallel applications that target
                 them remains a daunting task. Non-determinism, inherent
                 in threaded applications, causes significant challenges
                 for parallel programmers by hindering their ability to
                 create parallel applications with repeatable results.
                 As a consequence, parallel applications are
                 significantly harder to debug, test, and maintain than
                 sequential programs.\par

                 This paper introduces Kendo: a new software-only system
                 that provides deterministic multithreading of parallel
                 applications. Kendo enforces a deterministic
                 interleaving of lock acquisitions and specially
                 declared non-protected reads through a novel
                 dynamically load-balanced deterministic scheduling
                 algorithm. The algorithm tracks the progress of each
                 thread using performance counters to construct a
                 deterministic logical time that is used to compute an
                 interleaving of shared data accesses that is both
                 deterministic and provides good load balancing. Kendo
                 can run on today's commodity hardware while incurring
                 only a modest performance cost. Experimental results on
                 the SPLASH-2 applications yield a geometric mean
                 overhead of only 16\% when running on 4 processors.
                 This low overhead makes it possible to benefit from
                 Kendo even after an application is deployed.
                 Programmers can start using Kendo today to program
                 parallel applications that are easier to develop,
                 debug, and test.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "debugging; determinism; deterministic multithreading;
                 multicore; parallel programming",
}

@Article{Pichel:2009:IDR,
  author =       "J. C. Pichel and D. B. Heras and J. C. Cabaleiro and
                 F. F. Rivera",
  title =        "Increasing data reuse of sparse algebra codes on
                 simultaneous multithreading architectures",
  journal =      j-CCPE,
  volume =       "21",
  number =       "15",
  pages =        "1838--1856",
  month =        oct,
  year =         "2009",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1404",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:38 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "11 Feb 2009",
}

@Article{Piringer:2009:MTA,
  author =       "Harald Piringer and Christian Tominski and Philipp
                 Muigg and Wolfgang Berger",
  title =        "A Multi-Threading Architecture to Support Interactive
                 Visual Exploration",
  journal =      j-IEEE-TRANS-VIS-COMPUT-GRAPH,
  volume =       "15",
  number =       "6",
  pages =        "1113--1120",
  month =        nov # "\slash " # dec,
  year =         "2009",
  CODEN =        "ITVGEA",
  DOI =          "https://doi.org/10.1109/TVCG.2009.110",
  ISSN =         "1077-2626 (print), 1941-0506 (electronic), 2160-9306",
  ISSN-L =       "1077-2626",
  bibdate =      "Thu May 13 17:38:49 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetransviscomputgraph.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Visualization and Computer
                 Graphics",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2945",
}

@Article{Quintana-Orti:2009:PMA,
  author =       "Gregorio Quintana-Ort{\'\i} and Enrique S.
                 Quintana-Ort{\'\i} and Robert A. {Van De Geijn} and
                 Field G. {Van Zee} and Ernie Chan",
  title =        "Programming matrix algorithms-by-blocks for
                 thread-level parallelism",
  journal =      j-TOMS,
  volume =       "36",
  number =       "3",
  pages =        "14:1--14:26",
  month =        jul,
  year =         "2009",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/1527286.1527288",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Tue Jul 21 14:09:07 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "With the emergence of thread-level parallelism as the
                 primary means for continued performance improvement,
                 the programmability issue has reemerged as an obstacle
                 to the use of architectural advances. We argue that
                 evolving legacy libraries for dense and banded linear
                 algebra is not a viable solution due to constraints
                 imposed by early design decisions. We propose a
                 philosophy of abstraction and separation of concerns
                 that provides a promising solution in this problem
                 domain. The first abstraction, FLASH, allows algorithms
                 to express computation with matrices consisting of
                 contiguous blocks, facilitating algorithms-by-blocks.
                 Operand descriptions are registered for a particular
                 operation a priori by the library implementor. A
                 runtime system, SuperMatrix, uses this information to
                 identify data dependencies between suboperations,
                 allowing them to be scheduled to threads out-of-order
                 and executed in parallel. But not all classical
                 algorithms in linear algebra lend themselves to
                 conversion to algorithms-by-blocks. We show how our
                 recently proposed LU factorization with incremental
                 pivoting and a closely related algorithm-by-blocks for
                 the QR factorization, both originally designed for
                 out-of-core computation, overcome this difficulty.
                 Anecdotal evidence regarding the development of
                 routines with a core functionality demonstrates how the
                 methodology supports high productivity while
                 experimental results suggest that high performance is
                 abundantly achievable.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
  keywords =     "high-performance; libraries; Linear algebra;
                 multithreaded architectures",
}

@Article{Raghavan:2009:DLC,
  author =       "P. Raghavan and A. Lambrechts and M. Jayapala and F.
                 Catthoor and D. Verkest",
  title =        "Distributed Loop Controller for Multithreading in
                 Unithreaded {ILP} Architectures",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "58",
  number =       "3",
  pages =        "311--321",
  month =        mar,
  year =         "2009",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2008.168",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Mon Jul 4 11:37:40 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4624249",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Ratanaworabhan:2009:DTA,
  author =       "Paruj Ratanaworabhan and Martin Burtscher and Darko
                 Kirovski and Benjamin Zorn and Rahul Nagpal and Karthik
                 Pattabiraman",
  title =        "Detecting and tolerating asymmetric races",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "4",
  pages =        "173--184",
  month =        apr,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1504176.1504202",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 9 08:40:49 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper introduces ToleRace, a runtime system that
                 allows programs to detect and even tolerate asymmetric
                 data races. Asymmetric races are race conditions where
                 one thread correctly acquires and releases a lock for a
                 shared variable while another thread improperly
                 accesses the same variable. ToleRace provides
                 approximate isolation in the critical sections of
                 lock-based parallel programs by creating a local copy
                 of each shared variable when entering a critical
                 section, operating on the local copies, and propagating
                 the appropriate copies upon leaving the critical
                 section. We start by characterizing all possible
                 interleavings that can cause races and precisely
                 describe the effect of ToleRace in each case. Then, we
                 study the theoretical aspects of an oracle that knows
                 exactly what type of interleaving has occurred.
                 Finally, we present two software implementations of
                 ToleRace and evaluate them on multithreaded
                 applications from the SPLASH2 and PARSEC suites. Our
                 implementation on top of a dynamic instrumentation
                 tool, which works directly on executables and requires
                 no source code modifications, incurs an overhead of a
                 factor of two on average. Manually adding ToleRace to
                 the source code of these applications results in an
                 average overhead of 6.4 percent.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "dynamic instrumentation; race detection and
                 toleration; runtime support",
}

@Article{Riccobene:2009:SCB,
  author =       "Elvinia Riccobene and Patrizia Scandurra and Sara
                 Bocchio and Alberto Rosti and Luigi Lavazza and Luigi
                 Mantellini",
  title =        "{SystemC\slash C-based} model-driven design for
                 embedded systems",
  journal =      j-TECS,
  volume =       "8",
  number =       "4",
  pages =        "30:1--30:??",
  month =        jul,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1550987.1550993",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Jul 23 12:32:49 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This article summarizes our effort, since 2004 up to
                 the present time, for improving the current industrial
                 Systems-on-Chip and Embedded Systems design by joining
                 the capabilities of the unified modeling language (UML)
                 and SystemC/C programming languages to operate at
                 system-level. The proposed approach exploits the OMG
                 model-driven architecture --- a framework for
                 Model-driven Engineering --- capabilities of reducing
                 abstract, coarse-grained and platform-independent
                 system models to fine-grained and platform-specific
                 models. We first defined a design methodology and a
                 development flow for the hardware, based on a SystemC
                 UML profile and encompassing different levels of
                 abstraction. We then included a multithread C UML
                 profile for modelling software applications. Both
                 SystemC/C profiles are consistent sets of modelling
                 constructs designed to lift the programming features
                 (both structural and behavioral) of the two coding
                 languages to the UML modeling level. The new codesign
                 flow is supported by an environment, which allows
                 system modeling at higher abstraction levels (from a
                 functional executable level to a register transfer
                 level) and supports automatic
                 code-generation/back-annotation from/to UML models.",
  acknowledgement = ack-nhfb,
  articleno =    "30",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?&idx=J840",
  keywords =     "C; ES; MDE; SoC; SystemC; UML",
}

@Article{Roy:2009:LPF,
  author =       "Indrajit Roy and Donald E. Porter and Michael D. Bond
                 and Kathryn S. McKinley and Emmett Witchel",
  title =        "{Laminar}: practical fine-grained decentralized
                 information flow control",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "6",
  pages =        "63--74",
  month =        jun,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1543135.1542484",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:41:16 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Decentralized information flow control (DIFC) is a
                 promising model for writing programs with powerful,
                 end-to-end security guarantees. Current DIFC systems
                 that run on commodity hardware can be broadly
                 categorized into two types: language-level and
                 operating system-level DIFC. Language level solutions
                 provide no guarantees against security violations on
                 system resources, like files and sockets. Operating
                 system solutions can mediate accesses to system
                 resources, but are inefficient at monitoring the flow
                 of information through fine-grained program data
                 structures.\par

                 This paper describes Laminar, the first system to
                 implement decentralized information flow control using
                 a single set of abstractions for OS resources and
                 heap-allocated objects. Programmers express security
                 policies by labeling data with secrecy and integrity
                 labels, and then access the labeled data in lexically
                 scoped security regions. Laminar enforces the security
                 policies specified by the labels at runtime. Laminar is
                 implemented using a modified Java virtual machine and a
                 new Linux security module. This paper shows that
                 security regions ease incremental deployment and limit
                 dynamic security checks, allowing us to retrofit DIFC
                 policies on four application case studies. Replacing
                 the applications' ad-hoc security policies changes less
                 than 10\% of the code, and incurs performance overheads
                 from 1\% to 56\%. Whereas prior DIFC systems only
                 support limited types of multithreaded programs,
                 Laminar supports a more general class of multithreaded
                 DIFC programs that can access heterogeneously labeled
                 data.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "information flow control; java virtual machine;
                 operating systems; security region",
}

@Article{Sidiroglou:2009:AAS,
  author =       "Stelios Sidiroglou and Oren Laadan and Carlos Perez
                 and Nicolas Viennot and Jason Nieh and Angelos D.
                 Keromytis",
  title =        "{ASSURE}: automatic software self-healing using rescue
                 points",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "3",
  pages =        "37--48",
  month =        mar,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1508284.1508250",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:39:26 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Software failures in server applications are a
                 significant problem for preserving system availability.
                 We present ASSURE, a system that introduces rescue
                 points that recover software from unknown faults while
                 maintaining both system integrity and availability, by
                 mimicking system behavior under known error conditions.
                 Rescue points are locations in existing application
                 code for handling a given set of programmer-anticipated
                 failures, which are automatically repurposed and tested
                 for safely enabling fault recovery from a larger class
                 of (unanticipated) faults. When a fault occurs at an
                 arbitrary location in the program, ASSURE restores
                 execution to an appropriate rescue point and induces
                 the program to recover execution by virtualizing the
                 program's existing error-handling facilities. Rescue
                 points are identified using fuzzing, implemented using
                 a fast coordinated checkpoint-restart mechanism that
                 handles multi-process and multi-threaded applications,
                 and, after testing, are injected into production code
                 using binary patching. We have implemented an ASSURE
                 Linux prototype that operates without application
                 source code and without base operating system kernel
                 changes. Our experimental results on a set of
                 real-world server applications and bugs show that
                 ASSURE enabled recovery for all of the bugs tested with
                 fast recovery times, has modest performance overhead,
                 and provides automatic self-healing orders of magnitude
                 faster than current human-driven patch deployment
                 methods.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "binary patching; checkpoint restart; error recovery;
                 reliable software; software self-healing",
}

@Article{Son:2009:CDD,
  author =       "Seung Woo Son and Mahmut Kandemir and Mustafa Karakoy
                 and Dhruva Chakrabarti",
  title =        "A compiler-directed data prefetching scheme for chip
                 multiprocessors",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "4",
  pages =        "209--218",
  month =        apr,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1504176.1504208",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 9 08:40:49 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Data prefetching has been widely used in the past as a
                 technique for hiding memory access latencies. However,
                 data prefetching in multi-threaded applications running
                 on chip multiprocessors (CMPs) can be problematic when
                 multiple cores compete for a shared on-chip cache (L2
                 or L3). In this paper, we (i) quantify the impact of
                 conventional data prefetching on shared caches in CMPs.
                 The experimental data collected using multi-threaded
                 applications indicates that, while data prefetching
                 improves performance in small number of cores, its
                 benefits reduce significantly as the number of cores is
                 increased, that is, it is not scalable; (ii) identify
                 harmful prefetches as one of the main contributors for
                 degraded performance with a large number of cores; and
                 (iii) propose and evaluate a compiler-directed data
                 prefetching scheme for shared on-chip cache based CMPs.
                 The proposed scheme first identifies program phases
                 using static compiler analysis, and then divides the
                 threads into groups within each phase and assigns a
                 customized prefetcher thread (helper thread) to each
                 group of threads. This helps to reduce the total number
                 of prefetches issued, prefetch overheads, and negative
                 interactions on the shared cache space due to data
                 prefetches, and more importantly, makes
                 compiler-directed prefetching a scalable optimization
                 for CMPs. Our experiments with the applications from
                 the SPEC OMP benchmark suite indicate that the proposed
                 scheme improves overall parallel execution latency by
                 18.3\% over the no-prefetch case and 6.4\% over the
                 conventional data prefetching scheme (where each core
                 prefetches its data independently), on average, when 12
                 cores are used. The corresponding average performance
                 improvements with 24 cores are 16.4\% (over the
                 no-prefetch case) and 11.7\% (over the conventional
                 prefetching case). We also demonstrate that the
                 proposed scheme is robust under a wide range of values
                 of our major simulation parameters, and the
                 improvements it achieves come very close to those that
                 can be achieved using an optimal scheme.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "chip multiprocessors; compiler; helper thread;
                 prefetching",
}

@Article{Suleman:2009:ACS,
  author =       "M. Aater Suleman and Onur Mutlu and Moinuddin K.
                 Qureshi and Yale N. Patt",
  title =        "Accelerating critical section execution with
                 asymmetric multi-core architectures",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "3",
  pages =        "253--264",
  month =        mar,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1508244.1508274",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:39:26 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "To improve the performance of a single application on
                 Chip Multiprocessors (CMPs), the application must be
                 split into threads which execute concurrently on
                 multiple cores. In multi-threaded applications,
                 critical sections are used to ensure that only one
                 thread accesses shared data at any given time. Critical
                 sections can serialize the execution of threads, which
                 significantly reduces performance and
                 scalability.\par

                 This paper proposes Accelerated Critical Sections
                 (ACS), a technique that leverages the high-performance
                 core(s) of an Asymmetric Chip Multiprocessor (ACMP) to
                 accelerate the execution of critical sections. In ACS,
                 selected critical sections are executed by a
                 high-performance core, which can execute the critical
                 section faster than the other, smaller cores. As a
                 result, ACS reduces serialization: it lowers the
                 likelihood of threads waiting for a critical section to
                 finish. Our evaluation on a set of 12
                 critical-section-intensive workloads shows that ACS
                 reduces the average execution time by 34\% compared to
                  an equal-area 32-core symmetric CMP and by 23\%
                 compared to an equal-area ACMP. Moreover, for 7 out of
                 the 12 workloads, ACS improves scalability by
                 increasing the number of threads at which performance
                 saturates.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "cmp; critical sections; heterogeneous cores; locks;
                 multi-core; parallel programming",
}

@Book{Swinnen:2009:APA,
  author =       "G{\'e}rard Swinnen",
  title =        "Apprendre {\`a} programmer avec Python: objet,
                 multithreading, {\'e}v{\'e}nements, bases de
                 donn{\'e}es, programmation web, programmation
                 r{\'e}seau, Unicode",
  publisher =    pub-EYROLLES,
  address =      pub-EYROLLES:adr,
  pages =        "xviii + 341",
  year =         "2009",
  LCCN =         "????",
  bibdate =      "Thu Apr 16 12:00:29 MDT 2009",
  bibsource =    "carmin.sudoc.abes.fr:210/ABES-Z39-PUBLIC;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  language =     "French",
}

@Article{Tallent:2009:EPM,
  author =       "Nathan R. Tallent and John M. Mellor-Crummey",
  title =        "Effective performance measurement and analysis of
                 multithreaded applications",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "4",
  pages =        "229--240",
  month =        apr,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1504176.1504210",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 9 08:40:49 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Understanding why the performance of a multithreaded
                 program does not improve linearly with the number of
                 cores in a shared-memory node populated with one or
                 more multicore processors is a problem of growing
                 practical importance. This paper makes three
                 contributions to performance analysis of multithreaded
                 programs. First, we describe how to measure and
                 attribute {\em parallel idleness}, namely, where
                 threads are stalled and unable to work. This technique
                 applies broadly to programming models ranging from
                 explicit threading ({\em e.g.}, Pthreads) to
                 higher-level models such as Cilk and OpenMP. Second, we
                 describe how to measure and attribute {\em parallel
                 overhead\/} -- when a thread is performing
                 miscellaneous work other than executing the user's
                 computation. By employing a combination of compiler
                 support and post-mortem analysis, we incur no
                 measurement cost beyond normal profiling to glean this
                 information. Using {\em idleness\/} and {\em
                 overhead\/} metrics enables one to pinpoint areas of an
                 application where concurrency should be increased (to
                 reduce idleness), decreased (to reduce overhead), or
                 where the present parallelization is hopeless (where
                 idleness and overhead are both high). Third, we
                 describe how to measure and attribute arbitrary
                 performance metrics for high-level multithreaded
                 programming models, such as Cilk. This requires
                 bridging the gap between the expression of logical
                 concurrency in programs and its realization at run-time
                 as it is adaptively partitioned and scheduled onto a
                 pool of threads. We have prototyped these ideas in the
                 context of Rice University's HPCToolkit performance
                 tools. We describe our approach, implementation, and
                 experiences applying this approach to measure and
                 attribute work, idleness, and overhead in executions of
                 Cilk programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "call path profiling; hpctoolkit; multithreaded
                 programming models; performance analysis",
}

@Article{Thakur:2009:TSE,
  author =       "Rajeev Thakur and William Gropp",
  title =        "Test suite for evaluating performance of multithreaded
                 {MPI} communication",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "35",
  number =       "12",
  pages =        "608--617",
  month =        dec,
  year =         "2009",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Thu Sep 2 17:51:11 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/01678191",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Vander-Swalmen:2009:CAM,
  author =       "Pascal Vander-Swalmen and Gilles Dequen and
                 Micha{\"e}l Krajecki",
  title =        "A Collaborative Approach for Multi-Threaded {SAT}
                 Solving",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "37",
  number =       "3",
  pages =        "324--342",
  month =        jun,
  year =         "2009",
  CODEN =        "IJPPE5",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Wed Sep 1 16:06:47 MDT 2010",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=37&issue=3;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=37&issue=3&spage=324",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Vera:2009:SRL,
  author =       "Xavier Vera and Jaume Abella and Javier Carretero and
                 Antonio Gonz{\'a}lez",
  title =        "Selective replication: a lightweight technique for
                 soft errors",
  journal =      j-TOCS,
  volume =       "27",
  number =       "4",
  pages =        "8:1--8:30",
  month =        dec,
  year =         "2009",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1658357.1658359",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Mar 15 09:06:46 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Soft errors are an important challenge in contemporary
                 microprocessors. Modern processors have caches and
                 large memory arrays protected by parity or error
                 detection and correction codes. However, today's
                 failure rate is dominated by flip flops, latches, and
                 the increasing sensitivity of combinational logic to
                 particle strikes. Moreover, as Chip Multi-Processors
                 (CMPs) become ubiquitous, meeting the FIT budget for
                 new designs is becoming a major
                 challenge.\par

                 Solutions based on replicating threads have been
                 explored deeply; however, their high cost in
                 performance and energy make them unsuitable for current
                 designs. Moreover, our studies based on a typical
                 configuration for a modern processor show that focusing
                 on the top 5 most vulnerable structures can provide up
                 to 70\% reduction in FIT rate. Therefore, full
                 replication may overprotect the chip by reducing the
                 FIT much below budget.\par

                 We propose {\em Selective Replication}, a
                 lightweight-reconfigurable mechanism that achieves a
                 high FIT reduction by protecting the most vulnerable
                 instructions with minimal performance and energy
                 impact. Low performance degradation is achieved by not
                 requiring additional issue slots and reissuing
                 instructions only during the time window between when
                 they are retirable and they actually retire. Coverage
                 can be reconfigured online by replicating only a subset
                 of the instructions (the most vulnerable ones).
                 Instructions' vulnerability is estimated based on the
                 area they occupy and the time they spend in the issue
                 queue. By changing the vulnerability threshold, we can
                 adjust the trade-off between coverage and performance
                 loss.\par

                 Results for an out-of-order processor configured
                 similarly to Intel{\reg} Core\TM{} Micro-Architecture
                 show that our scheme can achieve over 65\% FIT
                 reduction with less than 4\% performance degradation
                 with small area and complexity overhead.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "AVF prediction; FIT reduction; redundant
                 multithreading; Soft errors",
}

@Article{Wang:2009:TDA,
  author =       "Yin Wang and St{\'e}phane Lafortune and Terence Kelly
                 and Manjunath Kudlur and Scott Mahlke",
  title =        "The theory of deadlock avoidance via discrete
                 control",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "1",
  pages =        "252--263",
  month =        jan,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1480881.1480913",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 9 08:40:38 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Deadlock in multithreaded programs is an increasingly
                 important problem as ubiquitous multicore architectures
                 force parallelization upon an ever wider range of
                 software. This paper presents a theoretical foundation
                 for dynamic deadlock avoidance in concurrent programs
                 that employ conventional mutual exclusion and
                 synchronization primitives (e.g., multithreaded
                 C/Pthreads programs). Beginning with control flow
                 graphs extracted from program source code, we construct
                 a formal model of the program and then apply Discrete
                 Control Theory to automatically synthesize
                 deadlock-avoidance control logic that is implemented by
                 program instrumentation. At run time, the control logic
                 avoids deadlocks by postponing lock acquisitions.
                 Discrete Control Theory guarantees that the program
                 instrumented with our synthesized control logic cannot
                 deadlock. Our method furthermore guarantees that the
                 control logic is maximally permissive: it postpones
                 lock acquisitions only when necessary to prevent
                 deadlocks, and therefore permits maximal runtime
                 concurrency. Our prototype for C/Pthreads scales to
                 real software including Apache, OpenLDAP, and two kinds
                 of benchmarks, automatically avoiding both injected and
                 naturally occurring deadlocks while imposing modest
                 runtime overheads.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "concurrent programming; discrete control theory;
                 dynamic deadlock avoidance; multicore processors;
                 multithreaded programming; parallel programming",
}

@Article{Youseff:2009:PES,
  author =       "Lamia Youseff and Keith Seymour and Haihang You and
                 Dmitrii Zagorodnov and Jack Dongarra and Rich Wolski",
  title =        "Paravirtualization effect on single- and multi-threaded
                 memory-intensive linear algebra software",
  journal =      "The Journal of Networks, Software Tools, and Cluster
                 Computing",
  volume =       "12",
  number =       "2",
  pages =        "101--122",
  month =        "????",
  year =         "2009",
  DOI =          "https://doi.org/10.1007/s10586-009-0080-4",
  ISSN =         "1386-7857",
  bibdate =      "Tue Jun 4 08:20:03 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Yu:2009:CIC,
  author =       "Jie Yu and Satish Narayanasamy",
  title =        "A case for an interleaving constrained shared-memory
                 multi-processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "325--336",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555796",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Shared-memory multi-threaded programming is inherently
                 more difficult than single-threaded programming. The
                 main source of complexity is that, the threads of an
                 application can interleave in so many different ways.
                 To ensure correctness, a programmer has to test all
                 possible thread interleavings, which, however, is
                 impractical.\par

                 Many rare thread interleavings remain untested in
                 production systems, and they are the root cause for a
                 majority of concurrency bugs. We propose a
                 shared-memory multi-processor design that avoids
                 untested interleavings to improve the correctness of a
                 multi-threaded program. Since untested interleavings
                 tend to occur infrequently at runtime, the performance
                 cost of avoiding them is not high.\par

                 We propose to encode the set of tested correct
                 interleavings in a program's binary executable using
                 {\em Predecessor Set (PSet)\/} constraints. These
                 constraints are efficiently enforced at runtime using
                 processor support, which ensures that the runtime
                 follows a tested interleaving. We analyze several bugs
                 in open source applications such as MySQL, Apache,
                 Mozilla, etc., and show that, by enforcing PSet
                 constraints, we can avoid not only data races and
                 atomicity violations, but also other forms of
                 concurrency bugs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  keywords =     "concurrency bugs; multiprocessors; parallel
                 programming; software reliability",
}

@Article{Ziarek:2009:SWB,
  author =       "Lukasz Ziarek and Suresh Jagannathan and Matthew Fluet
                 and Umut A. Acar",
  title =        "Speculative {$N$}-way barriers (abstract only)",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "5",
  pages =        "8--8",
  month =        may,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1629635.1629637",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Jun 21 18:01:41 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Speculative execution is an important technique that
                 has historically been used to extract concurrency from
                 sequential programs. While techniques to support
                 speculation work well when computations perform
                 relatively simple actions (e.g., reads and writes to
                 known locations), understanding speculation for
                 multi-threaded programs in which threads may
                 communicate and synchronize through multiple shared
                 references is significantly more challenging, and is
                 the focus of this paper.\par

                 We use as our reference point a simple higher-order
                 concurrent language extended with an n-way barrier and
                 a fork/join execution model. Our technique permits the
                 expression guarded by the barrier to speculatively
                 proceed before the barrier has been satisfied (i.e.,
                 before all threads that synchronize on that barrier
                 have done so) and to have participating threads that
                 would normally block on the barrier to speculatively
                 proceed as well. Our solution formulates safety
                 properties under which speculation is correct in a
                 fork/join model, and per-synchronization basis.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Agarwal:2010:DDP,
  author =       "R. Agarwal and S. Bensalem and E. Farchi and K.
                 Havelund and Y. Nir-Buchbinder and S. Stoller and S. Ur
                 and L. Wang",
  title =        "Detection of deadlock potentials in multithreaded
                 programs",
  journal =      j-IBM-JRD,
  volume =       "54",
  number =       "5",
  pages =        "3:1--3:15",
  month =        "????",
  year =         "2010",
  CODEN =        "IBMJAE",
  DOI =          "https://doi.org/10.1147/JRD.2010.2060276",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Sun Feb 20 14:29:19 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.research.ibm.com/journal/",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Journal of Research and Development",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
}

@Article{Agrawal:2010:HLF,
  author =       "Kunal Agrawal and Charles E. Leiserson and Jim Sukha",
  title =        "Helper locks for fork-join parallel programming",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "5",
  pages =        "245--256",
  month =        may,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1693453.1693487",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Aug 31 22:39:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Helper locks allow programs with large parallel
                 critical sections, called parallel regions, to execute
                 more efficiently by enlisting processors that might
                 otherwise be waiting on the helper lock to aid in the
                 execution of the parallel region. Suppose that a
                 processor {\em p\/} is executing a parallel region {\em
                 A\/} after having acquired the lock {\em L\/}
                 protecting {\em A}. If another processor {\em p\/} $
                 \prime $ tries to acquire {\em L}, then instead of
                 blocking and waiting for {\em p\/} to complete {\em A},
                 processor {\em p\/} $ \prime $ joins {\em p\/} to help
                 it complete {\em A}. Additional processors not blocked
                 on {\em L\/} may also help to execute {\em A}.\par The
                 HELPER runtime system can execute fork-join
                 computations augmented with helper locks and parallel
                 regions. HELPER supports the unbounded nesting of
                 parallel regions. We provide theoretical
                 completion-time and space-usage bounds for a design of
                 HELPER based on work stealing. Specifically, let {\em
                 V\/} be the number of parallel regions in a
                 computation, let {\em T\/}$_1$ be its work, and let
                 {\em T\/} $ \infty $ be its 'aggregate span' --- the
                 sum of the spans (critical-path lengths) of all its
                 parallel regions. We prove that HELPER completes the
                  computation in expected time {\em O\/} ({\em T\/}$_1$ /
                  {\em P\/} + {\em T\/} $ \infty $ + {\em PV\/}) on
                 {\em P\/} processors. This bound indicates that
                 programs with a small number of highly parallel
                 critical sections can attain linear speedup. For the
                 space bound, we prove that HELPER completes a program
                 using only $O(P S_1)$ stack space, where $S_1$ is the
                 sum, over all regions, of the stack space used by each
                 region in a serial execution. Finally, we describe a
                 prototype of HELPER implemented by modifying the Cilk
                 multithreaded runtime system. We used this prototype to
                 implement a concurrent hash table with a resize
                 operation protected by a helper lock.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "Cilk; dynamic multithreading; helper lock; nested
                 parallelism; parallel region; scheduling; work
                 stealing",
}

@Article{Balaji:2010:FGM,
  author =       "Pavan Balaji and Darius Buntinas and David Goodell and
                 William Gropp and Rajeev Thakur",
  title =        "Fine-Grained Multithreading Support for Hybrid
                 Threaded {MPI} Programming",
  journal =      j-IJHPCA,
  volume =       "24",
  number =       "1",
  pages =        "49--57",
  month =        feb,
  year =         "2010",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342009360206",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Tue Aug 31 09:59:45 MDT 2010",
  bibsource =    "http://hpc.sagepub.com/content/24/1.toc;
                 http://hpc.sagepub.com/content/by/year;
                 https://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://hpc.sagepub.com/content/24/1/49.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
}

@Article{Barthe:2010:SMP,
  author =       "Gilles Barthe and Tamara Rezk and Alejandro Russo and
                 Andrei Sabelfeld",
  title =        "Security of multithreaded programs by compilation",
  journal =      j-TISSEC,
  volume =       "13",
  number =       "3",
  pages =        "21:1--21:??",
  month =        jul,
  year =         "2010",
  CODEN =        "ATISBQ",
  DOI =          "https://doi.org/10.1145/1805974.1805977",
  ISSN =         "1094-9224 (print), 1557-7406 (electronic)",
  ISSN-L =       "1094-9224",
  bibdate =      "Wed Jul 28 14:57:15 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "End-to-End security of mobile code requires that the
                 code neither intentionally nor accidentally propagates
                 sensitive information to an adversary. Although mobile
                 code is commonly multithreaded low-level code, there
                 lack enforcement mechanisms that ensure information
                 security for such programs. The modularity is
                 three-fold: we give modular extensions of sequential
                 semantics, sequential security typing, and sequential
                 security-type preserving compilation that allow us
                 enforcing security for multithreaded programs. Thanks
                 to the modularity, there are no more restrictions on
                 multithreaded source programs than on sequential ones,
                 and yet we guarantee that their compilations are
                 provably secure for a wide class of schedulers.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Information and System Security",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J789",
  keywords =     "compilers; Noninterference; schedulers; type systems",
}

@Article{Bergan:2010:CCRa,
  author =       "Tom Bergan and Owen Anderson and Joseph Devietti and
                 Luis Ceze and Dan Grossman",
  title =        "{CoreDet}: a compiler and runtime system for
                 deterministic multithreaded execution",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "1",
  pages =        "53--64",
  month =        mar,
  year =         "2010",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Mar 17 14:42:04 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Bergan:2010:CCRb,
  author =       "Tom Bergan and Owen Anderson and Joseph Devietti and
                 Luis Ceze and Dan Grossman",
  title =        "{CoreDet}: a compiler and runtime system for
                 deterministic multithreaded execution",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "3",
  pages =        "53--64",
  month =        mar,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1736020.1736029",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Mar 17 13:46:56 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The behavior of a multithreaded program does not
                 depend only on its inputs. Scheduling, memory
                 reordering, timing, and low-level hardware effects all
                 introduce nondeterminism in the execution of
                 multithreaded programs. This severely complicates many
                 tasks, including debugging, testing, and automatic
                 replication. In this work, we avoid these complications
                 by eliminating their root cause: we develop a compiler
                 and runtime system that runs arbitrary multithreaded
                 C/C++ POSIX Threads programs deterministically.\par

                 A trivial nonperformant approach to providing
                 determinism is simply deterministically serializing
                 execution. Instead, we present a compiler and runtime
                 infrastructure that ensures determinism but resorts to
                 serialization rarely, for handling interthread
                 communication and synchronization. We develop two basic
                 approaches, both of which are largely dynamic with
                 performance improved by some static compiler
                 optimizations. First, an ownership-based approach
                 detects interthread communication via an evolving table
                 that tracks ownership of memory regions by threads.
                 Second, a buffering approach uses versioned memory and
                 employs a deterministic commit protocol to make changes
                 visible to other threads. While buffering has larger
                 single-threaded overhead than ownership, it tends to
                 scale better (serializing less often). A hybrid system
                 sometimes performs and scales better than either
                 approach individually.\par

                 Our implementation is based on the LLVM compiler
                 infrastructure. It needs neither programmer annotations
                 nor special hardware. Our empirical evaluation uses the
                 PARSEC and SPLASH2 benchmarks and shows that our
                 approach scales comparably to nondeterministic
                 execution.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "compilers; determinism; multicore; multithreading",
}

@Article{Bokhari:2010:EPM,
  author =       "Shahid Bokhari and Joel Saltz",
  title =        "Exploring the performance of massively multithreaded
                 architectures",
  journal =      j-CCPE,
  volume =       "22",
  number =       "5",
  pages =        "588--616",
  day =          "10",
  month =        apr,
  year =         "2010",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1484",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:42 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "1 Sep 2009",
}

@Article{Bronson:2010:PCB,
  author =       "Nathan G. Bronson and Jared Casper and Hassan Chafi
                 and Kunle Olukotun",
  title =        "A practical concurrent binary search tree",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "5",
  pages =        "257--268",
  month =        may,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1693453.1693488",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Aug 31 22:39:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We propose a concurrent relaxed balance AVL tree
                 algorithm that is fast, scales well, and tolerates
                 contention. It is based on optimistic techniques
                 adapted from software transactional memory, but takes
                 advantage of specific knowledge of the algorithm to
                 reduce overheads and avoid unnecessary retries. We
                 extend our algorithm with a fast linearizable clone
                 operation, which can be used for consistent iteration
                 of the tree. Experimental evidence shows that our
                 algorithm outperforms a highly tuned concurrent skip
                 list for many access patterns, with an average of 39\%
                 higher single-threaded throughput and 32\% higher
                 multi-threaded throughput over a range of contention
                 levels and operation mixes.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "optimistic concurrency; snapshot isolation",
}

@Article{Burnim:2010:ACD,
  author =       "Jacob Burnim and Koushik Sen",
  title =        "Asserting and checking determinism for multithreaded
                 programs",
  journal =      j-CACM,
  volume =       "53",
  number =       "6",
  pages =        "97--105",
  month =        jun,
  year =         "2010",
  CODEN =        "CACMA2",
  DOI =          "https://doi.org/10.1145/1743546.1743572",
  ISSN =         "0001-0782 (print), 1557-7317 (electronic)",
  ISSN-L =       "0001-0782",
  bibdate =      "Mon Jun 21 12:34:55 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/cacm/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Communications of the ACM",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J79",
}

@Article{Chen:2010:CCM,
  author =       "Changbo Chen and Marc Moreno Maza and Yuzhen Xie",
  title =        "Cache complexity and multicore implementation for
                 univariate real root isolation",
  journal =      j-ACM-COMM-COMP-ALGEBRA,
  volume =       "44",
  number =       "3",
  pages =        "97--98",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1940475.1940483",
  ISSN =         "1932-2232 (print), 1932-2240 (electronic)",
  ISSN-L =       "1932-2232",
  bibdate =      "Thu Mar 31 10:24:16 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Isolating the real roots of a univariate polynomial is
                 a driving subject in computer algebra. This problem has
                 been studied under various angles from algebraic
                 algorithms [1, 2, 7] to implementation techniques [3,
                 5]. Today, multicores are the most popular parallel
                 hardware architectures. Beside, understanding the
                 implications of hierarchical memory on performance
                 software engineering has become essential. These
                 observations motivate our study. We analyze the cache
                 complexity of the core routine of many real root
                 isolation algorithms namely, the Taylor shift. Then, we
                 present efficient multithreaded implementation on
                 multicores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Communications in Computer Algebra",
  issue =        "173",
}

@Article{Chetlur:2010:SWM,
  author =       "M. Chetlur and U. Devi and P. Dutta and P. Gupta and
                 L. Chen and Z. Zhu and S. Kalyanaraman and Y. Lin",
  title =        "A software {WiMAX} medium access control layer using
                 massively multithreaded processors",
  journal =      j-IBM-JRD,
  volume =       "54",
  number =       "1",
  pages =        "??--??",
  month =        "????",
  year =         "2010",
  CODEN =        "IBMJAE",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Sat May 1 17:44:14 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.research.ibm.com/journal/",
  URL =          "http://www.research.ibm.com/journal/abstracts/rd/541/chetlur-dutta.html",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "IBM Journal of Research and Development",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
}

@Article{Choi:2010:MDA,
  author =       "Jee W. Choi and Amik Singh and Richard W. Vuduc",
  title =        "Model-driven autotuning of sparse matrix-vector
                 multiply on {GPUs}",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "5",
  pages =        "115--126",
  month =        may,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1693453.1693471",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Aug 31 22:39:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We present a performance model-driven framework for
                 automated performance tuning (autotuning) of sparse
                 matrix-vector multiply (SpMV) on systems accelerated by
                 graphics processing units (GPU). Our study consists of
                 two parts.\par

                 First, we describe several carefully hand-tuned SpMV
                 implementations for GPUs, identifying key GPU-specific
                 performance limitations, enhancements, and tuning
                 opportunities. These implementations, which include
                 variants on classical blocked compressed sparse row
                 (BCSR) and blocked ELLPACK (BELLPACK) storage formats,
                 match or exceed state-of-the-art implementations. For
                 instance, our best BELLPACK implementation achieves up
                 to 29.0 Gflop/s in single-precision and 15.7 Gflop/s in
                 double-precision on the NVIDIA T10P multiprocessor
                 (C1060), enhancing prior state-of-the-art unblocked
                 implementations (Bell and Garland, 2009) by up to
                 $1.8\times$ and $1.5\times$ for single- and double-precision
                 respectively.\par

                 However, achieving this level of performance requires
                 input matrix-dependent parameter tuning. Thus, in the
                 second part of this study, we develop a performance
                 model that can guide tuning. Like prior autotuning
                 models for CPUs (e.g., Im, Yelick, and Vuduc, 2004),
                 this model requires offline measurements and run-time
                 estimation, but more directly models the structure of
                 multithreaded vector processors like GPUs. We show that
                 our model can identify the implementations that achieve
                 within 15\% of those found through exhaustive search.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "GPU; performance modeling; sparse matrix-vector
                 multiplication",
}

@Article{Coons:2010:GEU,
  author =       "Katherine E. Coons and Sebastian Burckhardt and
                 Madanlal Musuvathi",
  title =        "{GAMBIT}: effective unit testing for concurrency
                 libraries",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "5",
  pages =        "15--24",
  month =        may,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1837853.1693458",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Aug 31 22:39:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "As concurrent programming becomes prevalent, software
                 providers are investing in concurrency libraries to
                 improve programmer productivity. Concurrency libraries
                 improve productivity by hiding error-prone, low-level
                 synchronization from programmers and providing
                 higher-level concurrent abstractions. Testing such
                 libraries is difficult, however, because concurrency
                 failures often manifest only under particular
                 scheduling circumstances. Current best testing
                 practices are often inadequate: heuristic-guided
                 fuzzing is not systematic, systematic schedule
                 enumeration does not find bugs quickly, and stress
                 testing is neither systematic nor fast.\par

                 To address these shortcomings, we propose a prioritized
                 search technique called GAMBIT that combines the speed
                 benefits of heuristic-guided fuzzing with the
                 soundness, progress, and reproducibility guarantees of
                 stateless model checking. GAMBIT combines known
                 techniques such as partial-order reduction and
                 preemption-bounding with a generalized best-first
                 search framework that prioritizes schedules likely to
                 expose bugs. We evaluate GAMBIT's effectiveness on
                 newly released concurrency libraries for Microsoft's
                 .NET framework. Our experiments show that GAMBIT finds
                 bugs more quickly than prior stateless model checking
                 techniques without compromising coverage guarantees or
                 reproducibility.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "concurrency; model checking; multithreading;
                 partial-order reduction; preemption bound; software
                 testing",
}

@Article{Dam:2010:PCI,
  author =       "Mads Dam and Bart Jacobs and Andreas Lundblad and
                 Frank Piessens",
  title =        "Provably correct inline monitoring for multithreaded
                 {Java}-like programs",
  journal =      j-J-COMP-SECUR,
  volume =       "18",
  number =       "1",
  pages =        "37--59",
  month =        "????",
  year =         "2010",
  CODEN =        "JCSIET",
  DOI =          "https://doi.org/10.3233/JCS-2010-0365",
  ISSN =         "0926-227X (print), 1875-8924 (electronic)",
  ISSN-L =       "0926-227X",
  bibdate =      "Tue May 24 06:24:34 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/jcompsecur.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computer Security",
  journal-URL =  "http://content.iospress.com/journals/journal-of-computer-security",
}

@Article{Ding:2010:PCM,
  author =       "Jason Jianxun Ding and Abdul Waheed and Jingnan Yao
                 and Laxmi N. Bhuyan",
  title =        "Performance characterization of multi-thread and
                 multi-core processors based {XML} application oriented
                 networking systems",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "70",
  number =       "5",
  pages =        "584--597",
  month =        may,
  year =         "2010",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Wed Sep 1 16:27:28 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Dohi:2010:IPE,
  author =       "Keisuke Dohi and Yuichiro Shibata and Tsuyoshi Hamada
                 and Tomonari Masada and Kiyoshi Oguri and Duncan A.
                 Buell",
  title =        "Implementation of a programming environment with a
                 multithread model for reconfigurable systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "40--45",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926375",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Reconfigurable systems are known to be able to achieve
                 higher performance than traditional microprocessor
                 architecture for many application fields. However, in
                 order to extract a full potential of the reconfigurable
                 systems, programmers often have to design and describe
                 the best suited code for their target architecture with
                 specialized knowledge. The aim of this paper is to
                 assist the users of reconfigurable systems by
                 implementing a translator with a multithread model. The
                 experimental results show our translator automatically
                 generates efficient performance-aware code segments
                 including DMA transfer and shift registers for memory
                 access optimization.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Eggers:2010:AL,
  author =       "Susan Eggers",
  title =        "{2010 Athena} lecture",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "6",
  pages =        "98--98",
  month =        jun,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1809028.1806608",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 8 17:53:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Susan Eggers, a Professor of Computer Science and
                 Engineering at the University of Washington, joined her
                 department in 1989. She received a B.A. in 1965 from
                 Connecticut College and a Ph.D. in 1989 from the
                 University of California, Berkeley. Her research
                 interests are in computer architecture and back-end
                 compiler optimization, with an emphasis on experimental
                 performance analysis. With her colleague Hank Levy and
                 their students, she developed the first commercially
                 viable multithreaded architecture, Simultaneous
                 Multithreading, adopted by Intel (as Hyperthreading),
                 IBM, Sun and others. Her current research is in the
                 areas of distributed dataflow machines, FPGAs and chip
                 multiprocessors. In 1989 Professor Eggers was awarded
                 an IBM Faculty Development Award, in 1990 an NSF
                 Presidential Young Investigator Award, in 1994 the
                 Microsoft Professorship in Computer Science and
                 Engineering, and in 2009 the ACM-W Athena Lecturer. She
                 is a Fellow of the ACM and IEEE, a Fellow of the AAAS,
                 and a member of the National Academy of Engineering.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "invited talk",
}

@Article{Eyerman:2010:PJS,
  author =       "Stijn Eyerman and Lieven Eeckhout",
  title =        "Probabilistic job symbiosis modeling for {SMT}
                 processor scheduling",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "3",
  pages =        "91--102",
  month =        mar,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1736020.1736033",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Mar 17 13:46:56 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Symbiotic job scheduling boosts simultaneous
                 multithreading (SMT) processor performance by
                 co-scheduling jobs that have `compatible' demands on
                 the processor's shared resources. Existing approaches
                 however require a sampling phase, evaluate a limited
                 number of possible co-schedules, use heuristics to
                 gauge symbiosis, are rigid in their optimization
                 target, and do not preserve system-level
                 priorities/shares.\par

                 This paper proposes probabilistic job symbiosis
                 modeling, which predicts whether jobs will create
                 positive or negative symbiosis when co-scheduled
                 without requiring the co-schedule to be evaluated. The
                 model, which uses per-thread cycle stacks computed
                 through a previously proposed cycle accounting
                 architecture, is simple enough to be used in system
                 software. Probabilistic job symbiosis modeling provides
                 six key innovations over prior work in symbiotic job
                 scheduling: (i) it does not require a sampling phase,
                 (ii) it readjusts the job co-schedule continuously,
                 (iii) it evaluates a large number of possible
                 co-schedules at very low overhead, (iv) it is not
                 driven by heuristics, (v) it can optimize a performance
                 target of interest (e.g., system throughput or job
                 turnaround time), and (vi) it preserves system-level
                 priorities/shares. These innovations make symbiotic job
                 scheduling both practical and effective.\par

                 Our experimental evaluation, which assumes a realistic
                 scenario in which jobs come and go, reports an average
                 16\% (and up to 35\%) reduction in job turnaround time
                 compared to the previously proposed SOS (sample,
                 optimize, symbios) approach for a two-thread SMT
                 processor, and an average 19\% (and up to 45\%)
                 reduction in job turnaround time for a four-thread SMT
                 processor.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "performance modeling; simultaneous multi-threading
                 (SMT); symbiotic job scheduling",
}

@Article{Flanagan:2010:AMD,
  author =       "Cormac Flanagan and Stephen N. Freund",
  title =        "Adversarial memory for detecting destructive races",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "6",
  pages =        "244--254",
  month =        jun,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1806596.1806625",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 8 17:53:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Multithreaded programs are notoriously prone to race
                 conditions, a problem exacerbated by the widespread
                 adoption of multi-core processors with complex memory
                 models and cache coherence protocols. Much prior work
                 has focused on static and dynamic analyses for race
                 detection, but these algorithms typically are unable to
                 distinguish destructive races that cause erroneous
                 behavior from benign races that do not. Performing this
                 classification manually is difficult, time consuming,
                 and error prone.\par

                 This paper presents a new dynamic analysis technique
                 that uses {\em adversarial memory\/} to classify race
                 conditions as destructive or benign on systems with
                 relaxed memory models. Unlike a typical language
                 implementation, which may only infrequently exhibit
                 non-sequentially consistent behavior, our adversarial
                 memory implementation exploits the full freedom of the
                 memory model to return older, unexpected, or stale
                 values for memory reads whenever possible, in an
                 attempt to crash the target program (that is, to force
                 the program to behave erroneously). A crashing
                 execution provides concrete evidence of a destructive
                 bug, and this bug can be strongly correlated with a
                 specific race condition in the target
                 program.\par

                 Experimental results with our Jumble prototype for Java
                 demonstrate that adversarial memory is highly effective
                 at identifying destructive race conditions, and in
                 distinguishing them from race conditions that are real
                 but benign. Adversarial memory can also reveal
                 destructive races that would not be detected by
                 traditional testing (even after thousands of runs) or
                 by model checkers that assume sequential consistency.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "concurrency; dynamic analysis; race conditions;
                 relaxed memory models",
}

@Article{Gibson:2010:FSC,
  author =       "Dan Gibson and David A. Wood",
  title =        "{Forwardflow}: a scalable core for power-constrained
                 {CMPs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "14--25",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1815966",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Chip Multiprocessors (CMPs) are now commodity
                 hardware, but commoditization of parallel software
                 remains elusive. In the near term, the current trend of
                 increased core-per-socket count will continue, despite
                 a lack of parallel software to exercise the hardware.
                 Future CMPs must deliver thread-level parallelism when
                 software provides threads to run, but must also
                 continue to deliver performance gains for single
                 threads by exploiting instruction-level parallelism and
                 memory-level parallelism. However, power limitations
                 will prevent conventional cores from exploiting both
                 simultaneously.\par

                 This work presents the Forwardflow Architecture, which
                 can scale its execution logic up to run single threads,
                 or down to run multiple threads in a CMP. Forwardflow
                 dynamically builds an explicit internal dataflow
                 representation from a conventional instruction set
                 architecture, using forward dependence pointers to
                 guide instruction wakeup, selection, and issue.
                 Forwardflow's backend is organized into discrete units
                 that can be individually (de-)activated, allowing each
                 core's performance to be scaled by system software at
                 the architectural level.\par

                 On single threads, Forwardflow core scaling yields a
                 mean runtime reduction of 21\% for a 37\% increase in
                 power consumption. For multithreaded workloads, a
                 Forwardflow-based CMP allows system software to select
                 the performance point that best matches available
                 power.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  keywords =     "chip multiprocessor (cmp); power; scalable core",
}

@Article{Gupta:2010:CSM,
  author =       "M. Gupta and F. Sanchez and J. Llosa",
  title =        "{CSMT}: Simultaneous Multithreading for Clustered
                 {VLIW} Processors",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "59",
  number =       "3",
  pages =        "385--399",
  month =        mar,
  year =         "2010",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2009.96",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Sun Jul 3 11:52:26 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5161255",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Hilton:2010:SDE,
  author =       "Andrew Hilton and Amir Roth",
  title =        "{SMT-Directory}: Efficient Load-Load Ordering for
                 {SMT}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "25--28",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Memory models like SC, TSO, and PC enforce load-load
                 ordering, requiring that loads from any single thread
                 appear to occur in program order to all other threads.
                 Out-of-order execution can violate load-load ordering.
                 Conventional multi-processors with out-of-order cores
                 detect load-load ordering violations by snooping an
                 age-ordered load queue on cache invalidations or
                 evictions --- events that act as proxies for the completion
                 of remote stores. This mechanism becomes less efficient
                 in an SMT processor, as every completing store must
                 search the loads queue segments of all other threads.
                 This inefficiency exists because store completions from
                 other threads in the same core are not filtered by the
                 cache and coherence protocol: thread 0 observes all of
                 thread 1's stores, not only the first store to every
                 cache line. SMT-Directory eliminates this overhead by
                 implementing the filtering traditionally provided by
                 the cache in the cache itself. SMT-Directory adds a
                 per-thread ``read'' bit to every data cache line.
                 When a load executes, it sets the bit corresponding to
                 its thread. When a store completes and writes to the
                 cache, it checks the SMT-Directory bits of its cache
                 line and searches the load queue segments only of those
                 threads whose bits are set. As a result, local store
                 completions trigger searches only for data that is
                 actually shared.",
  acknowledgement = ack-nhfb,
  affiliation =  "Hilton, A (Reprint Author), Univ Penn, Philadelphia,
                 PA 19104 USA. Hilton, Andrew; Roth, Amir, Univ Penn,
                 Philadelphia, PA 19104 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CCF-0541292]",
  funding-text = "We thank Arun Raghavan for the address traces and Milo
                 Martin for comments on early versions of this work. The
                 anonymous reviewers provided valuable feedback. This
                 work was supported by NSF award CCF-0541292.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "consistency models; directory; load queue search;
                 load-load ordering; Simultaneous multithreading",
  keywords-plus = "CONSISTENCY",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Hilton:2010:SDE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Illikkal:2010:PQP,
  author =       "Ramesh Illikkal and Vineet Chadha and Andrew Herdrich
                 and Ravi Iyer and Donald Newell",
  title =        "{PIRATE}: {QoS} and performance management in {CMP}
                 architectures",
  journal =      j-SIGMETRICS,
  volume =       "37",
  number =       "4",
  pages =        "3--10",
  month =        mar,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1773394.1773396",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Wed Aug 25 07:35:13 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "As new multi-threaded usage models such as
                 virtualization and consolidation take advantage of
                 multiple cores in CMP architectures, the impact of
                 shared resource contention between VMs and user-level
                 applications introduces Quality of Service(QoS)
                 concerns and challenges. QoS-aware management of these
                 shared platform resources is therefore becoming
                 increasingly important. Various QoS schemes for
                 resource management have been recently proposed, but
                 most of these prior efforts have been focused on
                 controlling individual resource allocation based on
                 priority information passed down from the OS or
                 Hypervisor to system resources. The complexity of this
                 approach increases when multiple levels of resources
                 are associated with an application's performance and
                 power consumption. In this paper we employ simpler
                 rate-based QoS mechanisms which control the execution
                 rate of competing applications. To enable
                 differentiation between simultaneously running
                 applications' performance and power consumption, these
                 rate mechanisms need to dynamically adjust the
                 execution of application. Our proposed PI-RATE
                 architecture introduces a control-theoretic approach to
                 dynamically adjust the execution rate of each
                 application based on the QoS target and monitored
                 resource utilization. We evaluate three modes of
                 PI-RATE architecture --- cache QoS targets, performance
                 QoS targets and power QoS targets --- to show that the
                 PI-RATE architecture is flexible and effective at
                 enabling QoS in a CMP platform.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
  keywords =     "clock modulation; frequency scaling; integral
                 controller; proportional",
}

@Article{Jang:2010:DTE,
  author =       "Byunghyun Jang and Perhaad Mistry and Dana Schaa and
                 Rodrigo Dominguez and David Kaeli",
  title =        "Data transformations enabling loop vectorization on
                 multithreaded data parallel architectures",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "5",
  pages =        "353--354",
  month =        may,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1837853.1693510",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Aug 31 22:39:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Loop vectorization, a key feature exploited to obtain
                 high performance on Single Instruction Multiple Data
                 (SIMD) vector architectures, is significantly hindered
                 by irregular memory access patterns in the data stream.
                 This paper describes data transformations that allow us
                 to vectorize loops targeting massively multithreaded
                 data parallel architectures. We present a mathematical
                 model that captures loop-based memory access patterns
                 and computes the most appropriate data transformations
                 in order to enable vectorization. Our experimental
                 results show that the proposed data transformations can
                 significantly increase the number of loops that can be
                 vectorized and enhance the data-level parallelism of
                 applications. Our results also show that the overhead
                 associated with our data transformations can be easily
                 amortized as the size of the input data set increases.
                 For the set of high performance benchmark kernels
                 studied, we achieve consistent and significant
                 performance improvements (up to 11.4X) by applying
                 vectorization using our data transformation approach.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "data transformation; GPGPU; loop vectorization",
}

@Article{Laadan:2010:TLA,
  author =       "Oren Laadan and Nicolas Viennot and Jason Nieh",
  title =        "Transparent, lightweight application execution replay
                 on commodity multiprocessor operating systems",
  journal =      j-SIGMETRICS,
  volume =       "38",
  number =       "1",
  pages =        "155--166",
  month =        jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1811039.1811057",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Wed Aug 25 07:35:52 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We present Scribe, the first system to provide
                 transparent, low-overhead application record-replay and
                 the ability to go live from replayed execution. Scribe
                 introduces new lightweight operating system mechanisms,
                 rendezvous and sync points, to efficiently record
                 nondeterministic interactions such as related system
                 calls, signals, and shared memory accesses. Rendezvous
                 points make a partial ordering of execution based on
                 system call dependencies sufficient for replay,
                 avoiding the recording overhead of maintaining an exact
                 execution ordering. Sync points convert asynchronous
                 interactions that can occur at arbitrary times into
                 synchronous events that are much easier to record and
                 replay.\par

                 We have implemented Scribe without changing, relinking,
                 or recompiling applications, libraries, or operating
                 system kernels, and without any specialized hardware
                 support such as hardware performance counters. It works
                 on commodity Linux operating systems, and commodity
                 multi-core and multiprocessor hardware. Our results
                 show for the first time that an operating system
                 mechanism can correctly and transparently record and
                 replay multi-process and multi-threaded applications on
                 commodity multiprocessors. Scribe recording overhead is
                 less than 2.5\% for server applications including
                 Apache and MySQL, and less than 15\% for desktop
                 applications including Firefox, Acrobat, OpenOffice,
                 parallel kernel compilation, and movie playback.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
  keywords =     "debugging; fault-tolerance; record-replay;
                 virtualization",
}

@Article{Lee:2010:REO,
  author =       "Dongyoon Lee and Benjamin Wester and Kaushik
                 Veeraraghavan and Satish Narayanasamy and Peter M. Chen
                 and Jason Flinn",
  title =        "{Respec}: efficient online multiprocessor replay via
                 speculation and external determinism",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "3",
  pages =        "77--90",
  month =        mar,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1736020.1736031",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Mar 17 13:46:56 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Deterministic replay systems record and reproduce the
                 execution of a hardware or software system. While it is
                 well known how to replay uniprocessor systems,
                 replaying shared memory multiprocessor systems at low
                 overhead on commodity hardware is still an open
                 problem. This paper presents Respec, a new way to
                 support deterministic replay of shared memory
                 multithreaded programs on commodity multiprocessor
                 hardware. Respec targets online replay in which the
                 recorded and replayed processes execute
                 concurrently.\par

                 Respec uses two strategies to reduce overhead while
                 still ensuring correctness: speculative logging and
                 externally deterministic replay. Speculative logging
                 optimistically logs less information about shared
                 memory dependencies than is needed to guarantee
                 deterministic replay, then recovers and retries if the
                 replayed process diverges from the recorded process.
                 Externally deterministic replay relaxes the degree to
                 which the two executions must match by requiring only
                 their system output and final program states match. We
                 show that the combination of these two techniques
                 results in low recording and replay overhead for the
                 common case of data-race-free execution intervals and
                 still ensures correct replay for execution intervals
                 that have data races.\par

                 We modified the Linux kernel to implement our
                 techniques. Our software system adds on average about
                 18\% overhead to the execution time for recording and
                 replaying programs with two threads and 55\% overhead
                 for programs with four threads.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "external determinism; replay; speculative execution",
}

@Article{Lin:2010:TAC,
  author =       "Yi-Neng Lin and Ying-Dar Lin and Yuan-Cheng Lai",
  title =        "Thread allocation in {CMP}-based multithreaded network
                 processors",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "36",
  number =       "2--3",
  pages =        "104--116",
  month =        feb # "\slash " # mar,
  year =         "2010",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Thu Sep 2 17:51:12 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/01678191",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Mannarswamy:2010:CAS,
  author =       "Sandya Mannarswamy and Dhruva R. Chakrabarti and
                 Kaushik Rajan and Sujoy Saraswati",
  title =        "Compiler aided selective lock assignment for improving
                 the performance of software transactional memory",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "5",
  pages =        "37--46",
  month =        may,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1693453.1693460",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Aug 31 22:39:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Atomic sections have been recently introduced as a
                 language construct to improve the programmability of
                 concurrent software. They simplify programming by not
                 requiring the explicit specification of locks for
                 shared data. Typically atomic sections are supported in
                 software either through the use of optimistic
                 concurrency by using transactional memory or through
                 the use of pessimistic concurrency using
                 compiler-assigned locks. As a software transactional
                 memory (STM) system does not take advantage of the
                 specific memory access patterns of an application it
                 often suffers from false conflicts and high validation
                 overheads. On the other hand, the compiler usually ends
                 up assigning coarse grain locks as it relies on whole
                 program points-to analysis which is conservative by
                 nature. This adversely affects performance by limiting
                 concurrency. In order to mitigate the disadvantages
                 associated with STM's lock assignment scheme, we
                 propose a hybrid approach which combines STM's lock
                 assignment with a compiler aided selective lock
                 assignment scheme (referred to as SCLA-STM). SCLA-STM
                 overcomes the inefficiencies associated with a purely
                 compile-time lock assignment approach by (i) using the
                 underlying STM for shared variables where only a
                 conservative analysis is possible by the compiler
                 (e.g., in the presence of may-alias points to
                 information) and (ii) being selective about the shared
                 data chosen for the compiler-aided lock assignment. We
                 describe our prototype SCLA-STM scheme implemented in
                 the HP-UX IA-64 C/C++ compiler, using TL2 as our STM
                 implementation. We show that SCLA-STM improves
                 application performance for certain STAMP benchmarks
                 from 1.68\% to 37.13\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "compilers; multithreading; parallelization;
                 performance",
}

@Article{Marino:2010:DSE,
  author =       "Daniel Marino and Abhayendra Singh and Todd Millstein
                 and Madanlal Musuvathi and Satish Narayanasamy",
  title =        "{DRFx}: a simple and efficient memory model for
                 concurrent programming languages",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "6",
  pages =        "351--362",
  month =        jun,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1806596.1806636",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 8 17:53:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The most intuitive memory model for shared-memory
                 multithreaded programming is {\em sequential
                 consistency\/} (SC), but it disallows the use of many
                 compiler and hardware optimizations thereby impacting
                 performance. Data-race-free (DRF) models, such as the
                 proposed C++0x memory model, guarantee SC execution for
                 datarace-free programs. But these models provide no
                 guarantee at all for racy programs, compromising the
                 safety and debuggability of such programs. To address
                 the safety issue, the Java memory model, which is also
                 based on the DRF model, provides a weak semantics for
                 racy executions. However, this semantics is subtle and
                 complex, making it difficult for programmers to reason
                 about their programs and for compiler writers to ensure
                 the correctness of compiler optimizations.\par

                 We present the DRFx memory model, which is simple for
                 programmers to understand and use while still
                 supporting many common optimizations. We introduce a
                 {\em memory model (MM) exception\/} which can be
                 signaled to halt execution. If a program executes
                 without throwing this exception, then DRFx guarantees
                 that the execution is SC. If a program throws an MM
                 exception during an execution, then DRFx guarantees
                 that the program has a data race. We observe that SC
                 violations can be detected in hardware through a
                 lightweight form of conflict detection. Furthermore,
                 our model safely allows aggressive compiler and
                 hardware optimizations within compiler-designated
                 program regions. We formalize our memory model, prove
                 several properties about this model, describe a
                 compiler and hardware design suitable for DRFx, and
                 evaluate the performance overhead due to our compiler
                 and hardware requirements.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "data races; memory model exception; memory models;
                 sequential consistency; soft fences",
}

@Article{McKenney:2010:WGM,
  author =       "Paul E. McKenney and Maged M. Michael and Josh
                 Triplett and Jonathan Walpole",
  title =        "Why the grass may not be greener on the other side: a
                 comparison of locking vs. transactional memory",
  journal =      j-OPER-SYS-REV,
  volume =       "44",
  number =       "3",
  pages =        "93--101",
  month =        jul,
  year =         "2010",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/1842733.1842749",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Thu Aug 19 14:21:54 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The advent of multi-core and multi-threaded processor
                 architectures highlights the need to address the
                 well-known shortcomings of the ubiquitous lock-based
                 synchronization mechanisms. To this end, transactional
                 memory has been viewed by many as a promising
                 alternative to locking. This paper therefore presents a
                 constructive critique of locking and transactional
                 memory: their strengths, weaknesses, and opportunities
                 for improvement.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
}

@Article{Meng:2010:AOS,
  author =       "Lingchuan Meng and Jeremy Johnson and Franz Franchetti
                 and Yevgen Voronenko and Marc Moreno Maza and Yuzhen
                 Xie",
  title =        "Abstract only: {SPIRAL}-generated modular {FFTs}",
  journal =      j-ACM-COMM-COMP-ALGEBRA,
  volume =       "44",
  number =       "2",
  pages =        "25--26",
  month =        jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1838599.1838616",
  ISSN =         "1932-2232 (print), 1932-2240 (electronic)",
  ISSN-L =       "1932-2232",
  bibdate =      "Mon Aug 2 13:47:24 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In this poster we present the use of the SPIRAL system
                 (www.spiral.net) to generate code for modular Fast
                 Fourier Transforms (FFTs). SPIRAL is a library
                 generation system that automatically generates
                 platform-tuned implementations of digital signal
                 processing algorithms with an emphasis on fast
                 transforms. Currently, SPIRAL can generate highly
                 optimized fixed point and floating-point FFTs for a
                 variety of platforms including vectorization,
                 multi-threaded and distributed memory parallelization.
                 The code produced is competitive with the best
                 available code for these platforms and SPIRAL is used
                 by Intel for its IPP (Intel Performance Primitives) and
                 MKL (Math kernel Library) libraries.\par

                 The SPIRAL system uses a mathematical framework for
                 representing and deriving algorithms. Algorithms are
                 derived using rewrite rules and additional rules are
                 used to symbolically manipulate algorithms into forms
                 that take advantage of the underlying hardware. A
                 search engine with a feedback loop is used to tune
                 implementations to particular platforms. New transforms
                 are added by introducing new symbols and their
                 definition and new algorithms can be generated by
                 adding new rules.\par

                 We extended SPIRAL to generate algorithms for FFT
                 computation over finite fields. This addition required
                 adding a new data type, several new rules and a new
                 transform (ModDFT) definition. In addition, the
                 unparser (where code is generated) was extended so that
                 it can generate scalar and vectorized code for modular
                 arithmetic. With these enhancements, the SPIRAL
                 machinery can be applied to modular transforms that are
                 of interest to the computer algebra community. This
                 provides a framework for systematically optimizing
                 these transforms, utilizing vector and parallel
                 computation, and for automatically tuning them to
                 different platforms. In this poster we present
                 preliminary results from this exploration. We show that
                 the code generated by SPIRAL, with improved cache
                 locality and vectorization, is approximately ten times
                 faster than the modular FFT code in the modpn
                 library.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Communications in Computer Algebra",
  issue =        "172",
}

@Article{Meng:2010:DWS,
  author =       "Jiayuan Meng and David Tarjan and Kevin Skadron",
  title =        "Dynamic warp subdivision for integrated branch and
                 memory divergence tolerance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "235--246",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1815992",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "SIMD organizations amortize the area and power of
                 fetch, decode, and issue logic across multiple
                 processing units in order to maximize throughput for a
                 given area and power budget. However, throughput is
                 reduced when a set of threads operating in lockstep (a
                 warp) are stalled due to long latency memory accesses.
                 The resulting idle cycles are extremely costly.
                 Multi-threading can hide latencies by interleaving the
                 execution of multiple warps, but deep multi-threading
                 using many warps dramatically increases the cost of the
                 register files (multi-threading depth $ \times $ SIMD
                 width), and cache contention can make performance
                 worse. Instead, intra-warp latency hiding should first
                 be exploited. This allows threads that are ready but
                 stalled by SIMD restrictions to use these idle cycles
                 and reduces the need for multi-threading among warps.
                 This paper introduces {\em dynamic warp subdivision\/}
                 (DWS), which allows a single warp to occupy more than
                 one slot in the scheduler without requiring extra
                 register file space. Independent scheduling entities
                 allow divergent branch paths to interleave their
                 execution, and allow threads that hit to run ahead. The
                 result is improved latency hiding and memory level
                 parallelism (MLP). We evaluate the technique on a
                 coherent cache hierarchy with private L1 caches and a
                 shared L2 cache. With an area overhead of less than
                 1\%, experiments with eight data-parallel benchmarks
                 show our technique improves performance on average by
                 1.7$ \times $.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  keywords =     "branch divergence; cache; latency hiding; memory
                 divergence; SIMD; warp",
}

@Article{Muralidhara:2010:IAS,
  author =       "Sai Prashanth Muralidhara and Mahmut Kandemir and
                 Padma Raghavan",
  title =        "Intra-application shared cache partitioning for
                 multithreaded applications",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "5",
  pages =        "329--330",
  month =        may,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1837853.1693498",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Aug 31 22:39:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In this paper, we address the problem of partitioning
                 a shared cache when the executing threads belong to the
                 same application.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "cache; multicore; parallel applications",
}

@Article{Nakaike:2010:LER,
  author =       "Takuya Nakaike and Maged M. Michael",
  title =        "Lock elision for read-only critical sections in
                 {Java}",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "6",
  pages =        "269--278",
  month =        jun,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1806596.1806627",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 8 17:53:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "It is not uncommon in parallel workloads to encounter
                 shared data structures with read-mostly access
                 patterns, where operations that update data are
                 infrequent and most operations are read-only.
                 Typically, data consistency is guaranteed using mutual
                 exclusion or read-write locks. The cost of atomic
                 update of lock variables result in high overheads and
                 high cache coherence traffic under active sharing, thus
                 slowing down single thread performance and limiting
                 scalability.\par

                 In this paper, we present {\em SOLERO (Software
                 Optimistic Lock Elision for Read-Only critical
                 sections)}, a new lock implementation for
                 optimizing read-only critical sections in Java based on
                 sequential locks. SOLERO is compatible with the
                 conventional lock implementation of Java. However,
                 unlike the conventional implementation, only critical
                 sections that may write data or have side effects need
                 to update lock variables, while read-only critical
                 sections need only read lock variables without writing
                 them. Each writing critical section changes the lock
                 value to a new value. Hence, a read-only critical
                 section is guaranteed to be consistent if the lock is
                 free and its value does not change from the beginning
                 to the end of the read-only critical section.\par

                 Using Java workloads including SPECjbb2005 and the
                 HashMap and TreeMap Java classes, we evaluate the
                 performance impact of applying SOLERO to read-mostly
                 locks. Our experimental results show performance
                 improvements across the board, often substantial, in
                 both single thread speed and scalability over the
                 conventional lock implementation (mutual exclusion) and
                 read-write locks. SOLERO improves the performance of
                 SPECjbb2005 by 3-5\% on single and multiple threads.
                 The results using the HashMap and TreeMap benchmarks
                 show that SOLERO outperforms the conventional lock
                 implementation and read-write locks by substantial
                 multiples on multi-threads.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "java; just-in-time compiler; lock; lock elision;
                 monitor; optimization; synchronization",
}

@Article{Park:2010:ISP,
  author =       "Jung-Wook Park and Hoon-Mo Yang and Gi-Ho Park and
                 Shin-Dug Kim and Charles C. Weems",
  title =        "An instruction-systolic programmable shader
                 architecture for multi-threaded {$3$D} graphics
                 processing",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "70",
  number =       "11",
  pages =        "1110--1118",
  month =        nov,
  year =         "2010",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Wed Sep 1 16:27:29 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Radojkovic:2010:TSB,
  author =       "Petar Radojkovi{\'c} and Vladimir {\v{C}}akarevi{\'c}
                 and Javier Verd{\'u} and Alex Pajuelo and Francisco J.
                 Cazorla and Mario Nemirovsky and Mateo Valero",
  title =        "Thread to strand binding of parallel network
                 applications in massive multi-threaded systems",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "5",
  pages =        "191--202",
  month =        may,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1837853.1693480",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Aug 31 22:39:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In processors with several levels of hardware resource
                 sharing, like CMPs in which each core is an SMT, the
                 scheduling process becomes more complex than in
                 processors with a single level of resource sharing,
                 such as pure-SMT or pure-CMP processors. Once the
                 operating system selects the set of applications to
                 simultaneously schedule on the processor (workload),
                 each application/thread must be assigned to one of the
                 hardware contexts (strands). We call this last
                 scheduling step the Thread to Strand Binding or TSB. In
                 this paper, we show that the TSB impact on the
                 performance of processors with several levels of shared
                 resources is high. We measure a variation of up to 59\%
                 between different TSBs of real multithreaded network
                 applications running on the UltraSPARC T2 processor
                 which has three levels of resource sharing. In our
                 view, this problem is going to be more acute in future
                 multithreaded architectures comprising more cores, more
                 contexts per core, and more levels of resource
                 sharing.\par

                 We propose a resource-sharing aware TSB algorithm
                 (TSBSched) that significantly facilitates the problem
                 of thread to strand binding for software-pipelined
                 applications, representative of multithreaded network
                 applications. Our systematic approach encapsulates
                 both, the characteristics of multithreaded processors
                 under the study and the structure of the software
                 pipelined applications. Once calibrated for a given
                 processor architecture, our proposal does not require
                 hardware knowledge on the side of the programmer, nor
                 extensive profiling of the application. We validate our
                 algorithm on the UltraSPARC T2 processor running a set
                 of real multithreaded network applications on which we
                 report improvements of up to 46\% compared to the
                 current state-of-the-art dynamic schedulers.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "CMT; process scheduling; simultaneous multithreading;
                 UltraSPARC T2",
}

@Article{Rakvic:2010:TMT,
  author =       "R. Rakvic and Q. Cai and J. Gonz{\'a}lez and G.
                 Magklis and P. Chaparro and A. Gonz{\'a}lez",
  title =        "Thread-management techniques to maximize efficiency in
                 multicore and simultaneous multithreaded
                 microprocessors",
  journal =      j-TACO,
  volume =       "7",
  number =       "2",
  pages =        "9:1--9:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839667.1839671",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 2 18:05:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We provide an analysis of thread-management techniques
                 that increase performance or reduce energy in multicore
                 and Simultaneous Multithreaded (SMT) cores. Thread
                 delaying reduces energy consumption by running the core
                 containing the critical thread at maximum frequency
                 while scaling down the frequency and voltage of the
                 cores containing noncritical threads. In this article,
                 we provide an insightful breakdown of thread delaying
                 on a simulated multi-core microprocessor. Thread
                 balancing improves overall performance by giving higher
                 priority to the critical thread in the issue queue of
                 an SMT core. We provide a detailed breakdown of
                 performance results for thread-balancing, identifying
                 performance benefits and limitations. For those
                 benchmarks where a performance benefit is not possible,
                 we introduce a novel thread-balancing mechanism on an
                 SMT core that can reduce energy consumption. We have
                 performed a detailed study on an Intel microprocessor
                 simulator running parallel applications. Thread
                 delaying can reduce energy consumption by 4\% to 44\%
                 with negligible performance loss. Thread balancing can
                 increase performance by 20\% or can reduce energy
                 consumption by 23\%.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "critical threads; energy-aware; low-power; Meeting
                 point thread characterization; microarchitecture;
                 multi-threaded application; thread balancing; thread
                 delaying",
}

@Article{Raman:2010:SPUa,
  author =       "Arun Raman and Hanjun Kim and Thomas R. Mason and
                 Thomas B. Jablin and David I. August",
  title =        "Speculative parallelization using software
                 multi-threaded transactions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "1",
  pages =        "65--76",
  month =        mar,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1736020.1736030",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Mar 17 14:42:04 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Raman:2010:SPUb,
  author =       "Arun Raman and Hanjun Kim and Thomas R. Mason and
                 Thomas B. Jablin and David I. August",
  title =        "Speculative parallelization using software
                 multi-threaded transactions",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "3",
  pages =        "65--76",
  month =        mar,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1736020.1736030",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Mar 17 13:46:56 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "With the right techniques, multicore architectures may
                 be able to continue the exponential performance trend
                 that elevated the performance of applications of all
                 types for decades. While many scientific programs can
                 be parallelized without speculative techniques,
                 speculative parallelism appears to be the key to
                 continuing this trend for general-purpose applications.
                 Recently-proposed code parallelization techniques, such
                 as those by Bridges et al. and by Thies et al.,
                 demonstrate scalable performance on multiple cores by
                 using speculation to divide code into atomic units
                 (transactions) that span multiple threads in order to
                 expose data parallelism. Unfortunately, most software
                 and hardware Thread-Level Speculation (TLS) memory
                 systems and transactional memories are not sufficient
                 because they only support single-threaded atomic units.
                 Multi-threaded Transactions (MTXs) address this
                 problem, but they require expensive hardware support as
                 currently proposed in the literature. This paper
                 proposes a Software MTX (SMTX) system that captures the
                 {\em applicability\/} and {\em performance\/} of
                 hardware MTX, but on {\em existing multicore machines}.
                 The SMTX system yields a harmonic mean speedup of
                 13.36x on native hardware with four 6-core processors
                 (24 cores in total) running speculatively parallelized
                 applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "automatic parallelization; loop-level parallelism;
                 multi-threaded transactions; pipelined parallelism;
                 software transactional memory; thread-level
                 speculation",
}

@Article{Rashid:2010:AEP,
  author =       "Layali Rashid and Wessam M. Hassanein and Moustafa A.
                 Hammad",
  title =        "Analyzing and enhancing the parallel sort operation on
                 multithreaded architectures",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "53",
  number =       "2",
  pages =        "293--312",
  month =        aug,
  year =         "2010",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Aug 25 08:39:00 MDT 2010",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=53&issue=2;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=53&issue=2&spage=293",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Sanchez:2010:ACI,
  author =       "Daniel Sanchez and George Michelogiannakis and
                 Christos Kozyrakis",
  title =        "An analysis of on-chip interconnection networks for
                 large-scale chip multiprocessors",
  journal =      j-TACO,
  volume =       "7",
  number =       "1",
  pages =        "4:1--4:??",
  month =        apr,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1756065.1736069",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed May 5 15:38:13 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "With the number of cores of chip multiprocessors
                 (CMPs) rapidly growing as technology scales down,
                 connecting the different components of a CMP in a
                 scalable and efficient way becomes increasingly
                 challenging. In this article, we explore the
                 architectural-level implications of interconnection
                 network design for CMPs with up to 128 fine-grain
                 multithreaded cores. We evaluate and compare different
                 network topologies using accurate simulation of the
                 full chip, including the memory hierarchy and
                 interconnect, and using a diverse set of scientific and
                 engineering workloads.\par

                 We find that the interconnect has a large impact on
                 performance, as it is responsible for 60\% to 75\% of
                 the miss latency. Latency, and not bandwidth, is the
                 primary performance constraint, since, even with many
                 threads per core and workloads with high miss rates,
                 networks with enough bandwidth can be efficiently
                 implemented for the system scales we consider. From the
                 topologies we study, the flattened butterfly
                 consistently outperforms the mesh and fat tree on all
                 workloads, leading to performance advantages of up to
                 22\%. We also show that considering interconnect and
                 memory hierarchy together when designing large-scale
                 CMPs is crucial, and neglecting either of the two can
                 lead to incorrect conclusions. Finally, the effect of
                 the interconnect on overall performance becomes more
                 important as the number of cores increases, making
                 interconnection choices especially critical when
                 scaling up.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "chip multiprocessors; hierarchical networks;
                 Networks-on-chip",
}

@Article{Sodan:2010:PMM,
  author =       "Angela C. Sodan and Jacob Machina and Arash Deshmeh
                 and Kevin Macnaughton and Bryan Esbaugh",
  title =        "Parallelism via Multithreaded and Multicore {CPUs}",
  journal =      j-COMPUTER,
  volume =       "43",
  number =       "3",
  pages =        "24--32",
  month =        mar,
  year =         "2010",
  CODEN =        "CPTRB4",
  DOI =          "https://doi.org/10.1109/MC.2010.75",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Wed May 12 22:57:42 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/computer2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
}

@Article{Soundararajan:2010:CSE,
  author =       "Niranjan Soundararajan and Anand Sivasubramaniam and
                 Vijay Narayanan",
  title =        "Characterizing the soft error vulnerability of
                 multicores running multithreaded applications",
  journal =      j-SIGMETRICS,
  volume =       "38",
  number =       "1",
  pages =        "379--380",
  month =        jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1811099.1811096",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Wed Aug 25 07:35:52 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multicores have become the platform of choice across
                 all market segments. Cost-effective protection against
                 soft errors is important in these environments, due to
                 the need to move to lower technology generations and
                 the exploding number of transistors on a chip. While
                 multicores offer the flexibility of varying the number
                 of application threads and the number of cores on which
                 they run, the reliability impact of choosing one
                 configuration over another is unclear. Our study
                 reveals that the reliability costs vary dramatically
                 between configurations and being unaware could lead to
                 a sub-optimal choice.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
  keywords =     "fit rate; multicore; soft errors",
}

@Article{Sutherland:2010:CTC,
  author =       "Dean F. Sutherland and William L. Scherlis",
  title =        "Composable thread coloring",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "5",
  pages =        "233--244",
  month =        may,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1693453.1693485",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Aug 31 22:39:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper introduces the language-independent concept
                 of ``thread usage policy.'' Many multi-threaded
                 software systems contain policies that regulate
                 associations among threads, executable code, and
                 potentially shared state. A system, for example, may
                 constrain which threads are permitted to execute
                 particular code segments, usually as a means to
                 constrain those threads from accessing or writing
                 particular elements of state. These policies ensure
                 properties such as state confinement or reader/writer
                 constraints, often without recourse to locking or
                 transaction discipline.\par

                 Our approach allows developers to concisely document
                 their thread usage policies in a manner that enables
                 the use of sound scalable analysis to assess
                 consistency of policy and as-written code. This paper
                 identifies the key semantic concepts of our thread
                 coloring language and illustrates how to use its
                 succinct source-level annotations to express models of
                 thread usage policies, following established annotation
                 conventions for Java.\par

                 We have built a prototype static analysis tool,
                 implemented as an integrated development environment
                 plug-in (for the Eclipse IDE), that notifies developers
                 of discrepancies between policy annotations and
                 as-written code. Our analysis technique uses several
                 underlying algorithms based on abstract interpretation,
                 call-graphs, and type inference. The resulting overall
                 analysis is both sound and composable. We have used
                 this prototype analysis tool in case studies to model
                 and analyze more than a million lines of code.\par

                 Our validation process included field trials on a wide
                 variety of complex large-scale production code selected
                 by the host organizations. Our in-field experience led
                 us to focus on potential adoptability by real-world
                 developers. We have developed techniques that can
                 reduce annotation density to less than one line per
                 thousand lines of code (KLOC). In addition, the
                 prototype analysis tool supports an incremental and
                 iterative approach to modeling and analysis. This
                 approach enabled field trial partners to directly
                 target areas of greatest concern and to achieve useful
                 results within a few hours.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "annotation; Java; multicore; race conditions; state
                  confinement; state consistency; thread policy",
}

@Article{Tallent:2010:ALC,
  author =       "Nathan R. Tallent and John M. Mellor-Crummey and Allan
                 Porterfield",
  title =        "Analyzing lock contention in multithreaded
                 applications",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "5",
  pages =        "269--280",
  month =        may,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1693453.1693489",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Aug 31 22:39:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Many programs exploit shared-memory parallelism using
                 multithreading. Threaded codes typically use locks to
                 coordinate access to shared data. In many cases,
                 contention for locks reduces parallel efficiency and
                 hurts scalability. Being able to quantify and attribute
                 lock contention is important for understanding where a
                 multithreaded program needs improvement.\par

                 This paper proposes and evaluates three strategies for
                 gaining insight into performance losses due to lock
                 contention. First, we consider using a straightforward
                 strategy based on call stack profiling to attribute
                 idle time and show that it fails to yield insight into
                 lock contention. Second, we consider an approach that
                 builds on a strategy previously used for analyzing
                 idleness in work-stealing computations; we show that
                 this strategy does not yield insight into lock
                 contention. Finally, we propose a new technique for
                 measurement and analysis of lock contention that uses
                 data associated with locks to blame lock holders for
                 the idleness of spinning threads. Our approach incurs $
                 \leq $ 5\% overhead on a quantum chemistry application
                 that makes extensive use of locking (65M distinct
                 locks, a maximum of 340K live locks, and an average of
                 30K lock acquisitions per second per thread) and
                 attributes lock contention to its full static and
                 dynamic calling contexts. Our strategy, implemented in
                 HPCToolkit, is fully distributed and should scale well
                 to systems with large core counts.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "HPCToolkit; lock contention; multithreading;
                 performance analysis",
}

@Article{Tentyukov:2010:MVF,
  author =       "M. Tentyukov and J. A. M. Vermaseren",
  title =        "The multithreaded version of {FORM}",
  journal =      j-COMP-PHYS-COMM,
  volume =       "181",
  number =       "8",
  pages =        "1419--1427",
  month =        aug,
  year =         "2010",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2010.04.009",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Sat Feb 11 09:54:30 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465510001207",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Tian:2010:SPU,
  author =       "Chen Tian and Min Feng and Rajiv Gupta",
  title =        "Speculative parallelization using state separation and
                 multiple value prediction",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "8",
  pages =        "63--72",
  month =        aug,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1806651.1806663",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 8 17:55:48 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "With the availability of chip multiprocessor (CMP) and
                 simultaneous multithreading (SMT) machines, extracting
                 thread level parallelism from a sequential program has
                 become crucial for improving performance. However, many
                 sequential programs cannot be easily parallelized due
                 to the presence of dependences. To solve this problem,
                 different solutions have been proposed. Some of them
                 make the optimistic assumption that such dependences
                 rarely manifest themselves at runtime. However, when
                 this assumption is violated, the recovery causes very
                 large overhead. Other approaches incur large
                 synchronization or computation overhead when resolving
                 the dependences. Consequently, for a loop with
                 frequently arising cross-iteration dependences,
                 previous techniques are not able to speed up the
                 execution. In this paper we propose a compiler
                 technique which uses state separation and multiple
                 value prediction to speculatively parallelize loops in
                 sequential programs that contain frequently arising
                 cross-iteration dependences. The key idea is to
                 generate multiple versions of a loop iteration based on
                 multiple predictions of values of variables involved in
                 cross-iteration dependences (i.e., live-in variables).
                 These speculative versions and the preceding loop
                 iteration are executed in separate memory states
                 simultaneously. After the execution, if one of these
                 versions is correct (i.e., its predicted values are
                 found to be correct), then we merge its state and the
                 state of the preceding iteration because the dependence
                 between the two iterations is correctly resolved. The
                 memory states of other incorrect versions are
                 completely discarded. Based on this idea, we further
                 propose a runtime adaptive scheme that not only gives a
                 good performance but also achieves better CPU
                 utilization. We conducted experiments on 10 benchmark
                 programs on a real machine. The results show that our
                 technique can achieve 1.7x speedup on average across
                 all used benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "multicore processors; speculative parallelization",
}

@Article{Torlak:2010:MCA,
  author =       "Emina Torlak and Mandana Vaziri and Julian Dolby",
  title =        "{MemSAT}: checking axiomatic specifications of memory
                 models",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "6",
  pages =        "341--350",
  month =        jun,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1806596.1806635",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 8 17:53:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Memory models are hard to reason about due to their
                 complexity, which stems from the need to strike a
                 balance between ease-of-programming and allowing
                 compiler and hardware optimizations. In this paper, we
                 present an automated tool, MemSAT, that helps in
                 debugging and reasoning about memory models. Given an
                 axiomatic specification of a memory model and a
                 multi-threaded test program containing assertions,
                 MemSAT outputs a trace of the program in which both the
                 assertions and the memory model axioms are satisfied,
                 if one can be found. The tool is fully automatic and is
                 based on a SAT solver. If it cannot find a trace, it
                 outputs a minimal subset of the memory model and
                 program constraints that are unsatisfiable. We used
                 MemSAT to check several existing memory models against
                 their published test cases, including the current Java
                 Memory Model by Manson et al. and a revised version of
                 it by Sevcik and Aspinall. We found subtle
                 discrepancies between what was expected and the actual
                 results of test programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "axiomatic specifications; bounded model checking;
                 memory models; sat",
}

@Article{Trott:2010:AVI,
  author =       "Oleg Trott and Arthur J. Olson",
  title =        "{AutoDock Vina}: {Improving} the speed and accuracy of
                 docking with a new scoring function, efficient
                 optimization, and multithreading",
  journal =      j-J-COMPUT-CHEM,
  volume =       "31",
  number =       "2",
  pages =        "455--461",
  day =          "30",
  month =        jan,
  year =         "2010",
  CODEN =        "JCCHDD",
  DOI =          "https://doi.org/10.1002/jcc.21334",
  ISSN =         "0192-8651 (print), 1096-987X (electronic)",
  ISSN-L =       "0192-8651",
  bibdate =      "Thu Nov 29 14:55:23 MST 2012",
  bibsource =    "http://www.interscience.wiley.com/jpages/0192-8651;
                 https://www.math.utah.edu/pub/tex/bib/jcomputchem2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computational Chemistry",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1096-987X",
  onlinedate =   "4 Jun 2009",
}

@Article{Vlachos:2010:PEAa,
  author =       "Evangelos Vlachos and Michelle L. Goodstein and
                 Michael A. Kozuch and Shimin Chen and Babak Falsafi and
                 Phillip B. Gibbons and Todd C. Mowry",
  title =        "{ParaLog}: enabling and accelerating online parallel
                 monitoring of multithreaded applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "1",
  pages =        "271--284",
  month =        mar,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1736020.1736051",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Mar 17 14:42:04 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Vlachos:2010:PEAb,
  author =       "Evangelos Vlachos and Michelle L. Goodstein and
                 Michael A. Kozuch and Shimin Chen and Babak Falsafi and
                 Phillip B. Gibbons and Todd C. Mowry",
  title =        "{ParaLog}: enabling and accelerating online parallel
                 monitoring of multithreaded applications",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "3",
  pages =        "271--284",
  month =        mar,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1736020.1736051",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Mar 17 13:46:56 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "{\em Instruction-grain lifeguards\/} monitor the
                 events of a running application at the level of
                 individual instructions in order to identify and help
                 mitigate application bugs and security exploits.
                 Because such lifeguards impose a 10-100X slowdown on
                 existing platforms, previous studies have proposed
                 hardware designs to accelerate lifeguard processing.
                 However, these accelerators are either tailored to a
                 specific class of lifeguards or suitable only for
                 monitoring single-threaded programs.\par

                 We present ParaLog, the first design of a system
                 enabling fast online parallel monitoring of
                 multithreaded parallel applications. ParaLog supports a
                 broad class of software-defined lifeguards. We show how
                 three existing accelerators can be enhanced to support
                 online multithreaded monitoring, dramatically reducing
                 lifeguard overheads. We identify and solve several
                 challenges in monitoring parallel applications and/or
                 parallelizing these accelerators, including (i)
                 enforcing inter-thread data dependences, (ii) dealing
                 with inter-thread effects that are not reflected in
                 coherence traffic, (iii) dealing with unmonitored
                 operating system activity, and (iv) ensuring lifeguards
                 can access shared metadata with negligible
                 synchronization overheads. We present our system design
                 for both Sequentially Consistent and Total Store
                 Ordering processors. We implement and evaluate our
                 design on a 16 core simulated CMP, using benchmarks
                 from SPLASH-2 and PARSEC and two lifeguards: a
                 data-flow tracking lifeguard and a memory-access
                 checker lifeguard. Our results show that (i) our
                 parallel accelerators improve performance by 2-9X and
                 1.13-3.4X for our two lifeguards, respectively, (ii) we
                 are 5-126X faster than the time-slicing approach
                 required by existing techniques, and (iii) our average
                 overheads for applications with eight threads are 51\%
                 and 28\% for the two lifeguards, respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "hardware support for debugging; instruction-grain
                 lifeguards; online parallel monitoring",
}

@Article{Welch:2010:SCF,
  author =       "Peter H. Welch and Jan B. Pedersen",
  title =        "{Santa Claus}: {Formal} analysis of a process-oriented
                 solution",
  journal =      j-TOPLAS,
  volume =       "32",
  number =       "4",
  pages =        "14:1--14:37",
  month =        apr,
  year =         "2010",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/1734206.1734211",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Fri May 21 12:47:03 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "With the commercial development of multicore
                 processors, the challenges of writing multithreaded
                 programs to take advantage of these new hardware
                 architectures are becoming more and more pertinent.
                 Concurrent programming is necessary to achieve the
                 performance that the hardware offers. Traditional
                 approaches present concurrency as an {\em advanced\/}
                 topic: they have proven difficult to use, reason about
                 with confidence, and scale up to high levels of
                 concurrency. This article reviews {\em process-oriented
                 design}, based on Hoare's algebra of Communicating
                 Sequential Processes (CSP), and proposes that this
                 approach to concurrency leads to solutions that are
                 manageable by novice programmers; that is, they are
                 easy to design and maintain, that they are scalable for
                 complexity, {\em obviously correct}, and relatively
                 easy to verify using formal reasoning and/or model
                 checkers. These solutions can be developed in
                 conventional programming languages (through CSP
                  libraries) or specialized ones (such as occam-$\pi$) in a
                 manner that directly reflects their formal expression.
                 Systems can be developed without needing specialist
                 knowledge of the CSP formalism, since the supporting
                 mathematics is burnt into the tools and languages
                 supporting it. We illustrate these concepts with the
                 {\em Santa Claus problem}, which has been used as a
                 challenge for concurrency mechanisms since 1994. We
                 consider this problem as an example control system,
                 producing external signals reporting changes of
                 internal state (that model the external world). We
                  claim our occam-$\pi$ solution is {\em
                 correct-by-design}, but follow this up with formal
                 verification (using the FDR model checker for CSP) that
                 the system is free from deadlock and livelock, that the
                 produced control signals obey crucial ordering
                 constraints, and that the system has key liveness
                 properties.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
  keywords =     "concurrency; CSP; deadlock; event ordering; liveness;
                 novice programmer; occam-pi; Process orientation;
                 verification",
}

@Article{Wendykier:2010:PCH,
  author =       "Piotr Wendykier and James G. Nagy",
  title =        "{Parallel Colt}: a High-Performance {Java} Library for
                 Scientific Computing and Image Processing",
  journal =      j-TOMS,
  volume =       "37",
  number =       "3",
  pages =        "31:1--31:22",
  month =        sep,
  year =         "2010",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/1824801.1824809",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Mon Sep 27 10:15:50 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Major breakthroughs in chip and software design have
                 been observed for the last nine years. In October 2001,
                 IBM released the world's first multicore processor:
                 POWER4. Six years later, in February 2007, NVIDIA made
                 a public release of CUDA SDK, a set of development
                 tools to write algorithms for execution on Graphic
                 Processing Units (GPUs). Although software vendors have
                 started working on parallelizing their products, the
                 vast majority of existing code is still sequential and
                 does not effectively utilize modern multicore CPUs and
                 manycore GPUs.\par

                 This article describes Parallel Colt, a multithreaded
                 Java library for scientific computing and image
                 processing. In addition to describing the design and
                 functionality of Parallel Colt, a comparison to MATLAB
                 is presented. Two ImageJ plugins for iterative image
                 deblurring and motion correction of PET brain images
                 are described as typical applications of this library.
                 Performance comparisons with MATLAB, including GPU
                 computations via AccelerEyes' Jacket toolbox are also
                 given.",
  acknowledgement = ack-nhfb,
  articleno =    "31",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
  keywords =     "Deconvolution; FFT; inverse problems; iterative
                 methods; motion correction; multithreading; PET;
                 regularization",
}

@Article{Wheeler:2010:VMM,
  author =       "Kyle B. Wheeler and Douglas Thain",
  title =        "Visualizing massively multithreaded applications with
                 {ThreadScope}",
  journal =      j-CCPE,
  volume =       "22",
  number =       "1",
  pages =        "45--67",
  month =        jan,
  year =         "2010",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1469",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:40 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "13 Aug 2009",
}

@Article{Yi:2010:NAS,
  author =       "Kyueun Yi and J.-L. Gaudiot",
  title =        "Network Applications on Simultaneous Multithreading
                 Processors",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "59",
  number =       "9",
  pages =        "1200--1209",
  month =        sep,
  year =         "2010",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2009.185",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Sun Jul 3 11:52:32 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5374374",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Zhang:2010:DCS,
  author =       "Eddy Z. Zhang and Yunlian Jiang and Xipeng Shen",
  title =        "Does cache sharing on modern {CMP} matter to the
                 performance of contemporary multithreaded programs?",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "5",
  pages =        "203--212",
  month =        may,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1693453.1693482",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Aug 31 22:39:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Most modern Chip Multiprocessors (CMP) feature shared
                 cache on chip. For multithreaded applications, the
                 sharing reduces communication latency among co-running
                 threads, but also results in cache contention.\par

                 A number of studies have examined the influence of
                 cache sharing on multithreaded applications, but most
                 of them have concentrated on the design or management
                 of shared cache, rather than a systematic measurement
                 of the influence. Consequently, prior measurements have
                 been constrained by the reliance on simulators, the use
                 of out-of-date benchmarks, and the limited coverage of
                 deciding factors. The influence of CMP cache sharing on
                 contemporary multithreaded applications remains
                 preliminarily understood.\par

                 In this work, we conduct a systematic measurement of
                 the influence on two kinds of commodity CMP machines,
                 using a recently released CMP benchmark suite, PARSEC,
                 with a number of potentially important factors on
                 program, OS, and architecture levels considered. The
                 measurement shows some surprising results. Contrary to
                 commonly perceived importance of cache sharing, neither
                 positive nor negative effects from the cache sharing
                 are significant for most of the program executions,
                 regardless of the types of parallelism, input datasets,
                 architectures, numbers of threads, and assignments of
                 threads to cores. After a detailed analysis, we find
                 that the main reason is the mismatch of current
                 development and compilation of multithreaded
                 applications and CMP architectures. By transforming the
                 programs in a cache-sharing-aware manner, we observe up
                 to 36\% performance increase when the threads are
                 placed on cores appropriately.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "chip multiprocessors; parallel program optimizations;
                 shared cache; thread scheduling",
}

@Article{Zhang:2010:FTS,
  author =       "Yao Zhang and Jonathan Cohen and John D. Owens",
  title =        "Fast tridiagonal solvers on the {GPU}",
  journal =      j-SIGPLAN,
  volume =       "45",
  number =       "5",
  pages =        "127--136",
  month =        may,
  year =         "2010",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1693453.1693472",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Aug 31 22:39:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We study the performance of three parallel algorithms
                 and their hybrid variants for solving tridiagonal
                 linear systems on a GPU: cyclic reduction (CR),
                 parallel cyclic reduction (PCR) and recursive doubling
                 (RD). We develop an approach to measure, analyze, and
                 optimize the performance of GPU programs in terms of
                 memory access, computation, and control overhead. We
                 find that CR enjoys linear algorithm complexity but
                 suffers from more algorithmic steps and bank conflicts,
                 while PCR and RD have fewer algorithmic steps but do
                 more work each step. To combine the benefits of the
                 basic algorithms, we propose hybrid CR+PCR and CR+RD
                 algorithms, which improve the performance of PCR, RD
                 and CR by 21\%, 31\% and 61\% respectively. Our GPU
                 solvers achieve up to a 28x speedup over a sequential
                 LAPACK solver, and a 12x speedup over a multi-threaded
                 CPU solver.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "GPGPU; performance optimization; tridiagonal linear
                 system",
}

@Article{Zier:2010:PED,
  author =       "David A. Zier and Ben Lee",
  title =        "Performance Evaluation of Dynamic Speculative
                 Multithreading with the {Cascadia} Architecture",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "21",
  number =       "1",
  pages =        "47--59",
  month =        jan,
  year =         "2010",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2009.47",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu May 13 12:06:56 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Bajaj:2011:FFP,
  author =       "Chandrajit L. Bajaj and Rezaul Chowdhury and Vinay
                 Siddahanavalli",
  title =        "{$ F^2 $Dock}: Fast {Fourier} Protein-Protein
                 Docking",
  journal =      j-TCBB,
  volume =       "8",
  number =       "1",
  pages =        "45--58",
  month =        jan,
  year =         "2011",
  CODEN =        "ITCBCY",
  DOI =          "https://doi.org/10.1109/TCBB.2009.57",
  ISSN =         "1545-5963 (print), 1557-9964 (electronic)",
  ISSN-L =       "1545-5963",
  bibdate =      "Mon Dec 20 18:39:04 MST 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The functions of proteins are often realized through
                 their mutual interactions. Determining a relative
                 transformation for a pair of proteins and their
                 conformations which form a stable complex, reproducible
                 in nature, is known as docking. It is an important step
                 in drug design, structure determination, and
                 understanding function and structure relationships. In
                 this paper, we extend our nonuniform fast Fourier
                 transform-based docking algorithm to include an
                 adaptive search phase (both translational and
                 rotational) and thereby speed up its execution. We have
                 also implemented a multithreaded version of the
                 adaptive docking algorithm for even faster execution on
                 multicore machines. We call this protein-protein
                  docking code {$ F^2 $Dock} ($ F^2 = $ {\rm
                  \underline{F}ast \underline{F}ourier}).",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE/ACM Transactions on Computational Biology and
                 Bioinformatics",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J954",
}

@Article{Ball:2011:PPT,
  author =       "Thomas Ball and Sebastian Burckhardt and Peli de
                 Halleux and Madan Musuvathi and Shaz Qadeer",
  title =        "Predictable and Progressive Testing of Multithreaded
                 Code",
  journal =      j-IEEE-SOFTWARE,
  volume =       "28",
  number =       "3",
  pages =        "75--83",
  month =        may # "\slash " # jun,
  year =         "2011",
  CODEN =        "IESOEG",
  DOI =          "https://doi.org/10.1109/MS.2010.64",
  ISSN =         "0740-7459 (print), 0740-7459 (electronic)",
  ISSN-L =       "0740-7459",
  bibdate =      "Thu Apr 28 08:41:06 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeesoft.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Software",
  journal-URL =  "http://www.computer.org/portal/web/csdl/magazines/software",
}

@Article{Bientinesi:2011:CFS,
  author =       "Paolo Bientinesi and Francisco D. Igual and Daniel
                 Kressner and Matthias Petschow and Enrique S.
                 Quintana-Ort{\'\i}",
  title =        "Condensed forms for the symmetric eigenvalue problem
                 on multi-threaded architectures",
  journal =      j-CCPE,
  volume =       "23",
  number =       "7",
  pages =        "694--707",
  month =        may,
  year =         "2011",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1680",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:55 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "8 Nov 2010",
}

@Article{Burnim:2011:SCSa,
  author =       "Jacob Burnim and George Necula and Koushik Sen",
  title =        "Specifying and checking semantic atomicity for
                 multithreaded programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "79--90",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950377",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Burnim:2011:SCSb,
  author =       "Jacob Burnim and George Necula and Koushik Sen",
  title =        "Specifying and checking semantic atomicity for
                 multithreaded programs",
  journal =      j-SIGPLAN,
  volume =       "46",
  number =       "3",
  pages =        "79--90",
  month =        mar,
  year =         "2011",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1961296.1950377",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue May 24 10:55:08 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ASPLOS '11 conference proceedings",
}

@Article{Butler:2011:BAM,
  author =       "Michael Butler and Leslie Barnes and Debjit {Das Sarma}
                 and Bob Gelinas",
  title =        "{Bulldozer}: An Approach to Multithreaded Compute
                 Performance",
  journal =      j-IEEE-MICRO,
  volume =       "31",
  number =       "2",
  pages =        "6--15",
  month =        mar # "\slash " # apr,
  year =         "2011",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2011.23",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Tue Apr 26 13:50:28 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "AMD's Bulldozer module represents a new direction in
                 microarchitecture and includes a number of firsts for
                  AMD, including AMD's first multithreaded x86 processor,
                  first implementation of a shared Level 2 cache, and
                  first x86 processor to incorporate floating-point
                 multiply-accumulate (FMAC). This article discusses the
                 module's multithreading architecture, power-efficient
                 microarchitecture, and subblocks, including the various
                 microarchitectural latencies, bandwidths, and structure
                 sizes.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
  keywords =     "Hot Chips 22 conference proceedings",
}

@Article{Chen:2011:MJP,
  author =       "Kuo-Yi Chen and J. Morris Chang and Ting-Wei Hou",
  title =        "Multithreading in {Java}: Performance and Scalability
                 on Multicore Systems",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "60",
  number =       "11",
  pages =        "1521--1534",
  month =        nov,
  year =         "2011",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2010.232",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Tue Sep 27 07:57:50 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5661769",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Chinya:2011:BDP,
  author =       "Gautham N. Chinya and Jamison D. Collins and Perry H.
                 Wang and Hong Jiang and Guei-Yuan Lueh and Thomas A.
                 Piazza and Hong Wang",
  title =        "{Bothnia}: a dual-personality extension to the {Intel}
                 integrated graphics driver",
  journal =      j-OPER-SYS-REV,
  volume =       "45",
  number =       "1",
  pages =        "11--20",
  month =        jan,
  year =         "2011",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/1945023.1945027",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Fri Feb 25 16:43:23 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In this paper, we introduce Bothnia, an extension to
                 the Intel production graphics driver to support a
                 shared virtual memory heterogeneous multithreading
                 programming model. With Bothnia, the Intel graphics
                 device driver can support both the traditional 3D
                 graphics rendering software stack and a new class of
                 heterogeneous multithreaded applications, which can use
                 both IA (Intel Architecture) CPU cores and Intel
                 integrated Graphics and Media Accelerator (GMA) cores
                 in the same virtual address space. We describe the
                 necessary architectural supports in both IA CPU and the
                 GMA cores and present a reference Bothnia
                 implementation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
}

@Article{Davis:2011:ASM,
  author =       "Timothy A. Davis",
  title =        "{Algorithm 915}, {SuiteSparseQR}: {Multifrontal}
                 multithreaded rank-revealing sparse {QR}
                 factorization",
  journal =      j-TOMS,
  volume =       "38",
  number =       "1",
  pages =        "8:1--8:22",
  month =        nov,
  year =         "2011",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/2049662.2049670",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Thu Dec 15 08:59:34 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toms.bib",
  abstract =     "SuiteSparseQR is a sparse QR factorization package
                 based on the multifrontal method. Within each frontal
                 matrix, LAPACK and the multithreaded BLAS enable the
                 method to obtain high performance on multicore
                 architectures. Parallelism across different frontal
                 matrices is handled with Intel's Threading Building
                 Blocks library. The symbolic analysis and ordering
                 phase pre-eliminates singletons by permuting the input
                 matrix A into the form [R11 R12; 0 A22] where R11 is
                 upper triangular with diagonal entries above a given
                 tolerance. Next, the fill-reducing ordering, column
                 elimination tree, and frontal matrix structures are
                 found without requiring the formation of the pattern of
                  $A^T A$. Approximate rank-detection is performed within
                 each frontal matrix using Heath's method.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Esparza:2011:CPB,
  author =       "Javier Esparza and Pierre Ganty",
  title =        "Complexity of pattern-based verification for
                 multithreaded programs",
  journal =      j-SIGPLAN,
  volume =       "46",
  number =       "1",
  pages =        "499--510",
  month =        jan,
  year =         "2011",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1925844.1926443",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jan 26 15:06:39 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Feinbube:2011:JFM,
  author =       "Frank Feinbube and Peter Tr{\"o}ger and Andreas Polze",
  title =        "Joint Forces: From Multithreaded Programming to {GPU}
                 Computing",
  journal =      j-IEEE-SOFTWARE,
  volume =       "28",
  number =       "1",
  pages =        "51--57",
  month =        jan # "\slash " # feb,
  year =         "2011",
  CODEN =        "IESOEG",
  DOI =          "https://doi.org/10.1109/MS.2010.134",
  ISSN =         "0740-7459 (print), 0740-7459 (electronic)",
  ISSN-L =       "0740-7459",
  bibdate =      "Thu Dec 23 16:29:15 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeesoft.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Software",
  journal-URL =  "http://www.computer.org/portal/web/csdl/magazines/software",
}

@InProceedings{Ganesan:2011:MMP,
  author =       "Karthik Ganesan and Lizy K. John",
  title =        "{MAximum Multicore POwer (MAMPO)}: an automatic
                 multithreaded synthetic power virus generation
                 framework for multicore systems",
  crossref =     "Lathrop:2011:SPI",
  pages =        "53:1--53:12",
  year =         "2011",
  DOI =          "https://doi.org/10.1145/2063384.2063455",
  bibdate =      "Fri Dec 16 11:05:47 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/supercomputing2011.bib",
  acknowledgement = ack-nhfb,
  articleno =    "53",
}

@Article{Gupta:2011:PAR,
  author =       "Ashutosh Gupta and Corneliu Popeea and Andrey
                 Rybalchenko",
  title =        "Predicate abstraction and refinement for verifying
                 multi-threaded programs",
  journal =      j-SIGPLAN,
  volume =       "46",
  number =       "1",
  pages =        "331--344",
  month =        jan,
  year =         "2011",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1925844.1926424",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jan 26 15:06:39 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Hong:2011:AMA,
  author =       "Bo Hong and Zhengyu He",
  title =        "An Asynchronous Multithreaded Algorithm for the
                 Maximum Network Flow Problem with Nonblocking Global
                 Relabeling Heuristic",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "22",
  number =       "6",
  pages =        "1025--1033",
  month =        jun,
  year =         "2011",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2010.156",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Fri Jul 22 07:53:43 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Hsu:2011:MSS,
  author =       "Chia-Jui Hsu and Jos{\'e} Luis Pino and Shuvra S.
                 Bhattacharyya",
  title =        "Multithreaded Simulation for Synchronous Dataflow
                 Graphs",
  journal =      j-TODAES,
  volume =       "16",
  number =       "3",
  pages =        "25:1--25:??",
  month =        jun,
  year =         "2011",
  CODEN =        "ATASFO",
  DOI =          "https://doi.org/10.1145/1970353.1970358",
  ISSN =         "1084-4309 (print), 1557-7309 (electronic)",
  ISSN-L =       "1084-4309",
  bibdate =      "Tue Jun 14 11:55:50 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/todaes/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "For system simulation, Synchronous DataFlow (SDF) has
                 been widely used as a core model of computation in
                 design tools for digital communication and signal
                 processing systems. The traditional approach for
                 simulating SDF graphs is to compute and execute static
                 schedules in single-processor desktop environments.
                 Nowadays, however, multicore processors are
                 increasingly popular desktop platforms for their
                 potential performance improvements through thread-level
                 parallelism. Without novel scheduling and simulation
                 techniques that explicitly explore thread-level
                 parallelism for executing SDF graphs, current design
                 tools gain only minimal performance improvements on
                 multicore platforms. In this article, we present a new
                 multithreaded simulation scheduler, called MSS, to
                 provide simulation runtime speedup for executing SDF
                 graphs on multicore processors.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Transactions on Design Automation of Electronic
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J776",
}

@Article{Jeffrey:2011:IBM,
  author =       "Dennis Jeffrey and Yan Wang and Chen Tian and Rajiv
                 Gupta",
  title =        "Isolating bugs in multithreaded programs using
                 execution suppression",
  journal =      j-SPE,
  volume =       "41",
  number =       "11",
  pages =        "1259--1288",
  month =        oct,
  year =         "2011",
  CODEN =        "SPEXBL",
  DOI =          "https://doi.org/10.1002/spe.1040",
  ISSN =         "0038-0644 (print), 1097-024X (electronic)",
  ISSN-L =       "0038-0644",
  bibdate =      "Thu Sep 29 14:49:13 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/spe.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Software --- Practice and Experience",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
  onlinedate =   "18 Jan 2011",
}

@Article{Joisha:2011:TEA,
  author =       "Pramod G. Joisha and Robert S. Schreiber and
                 Prithviraj Banerjee and Hans J. Boehm and Dhruva R.
                 Chakrabarti",
  title =        "A technique for the effective and automatic reuse of
                 classical compiler optimizations on multithreaded
                 code",
  journal =      j-SIGPLAN,
  volume =       "46",
  number =       "1",
  pages =        "623--636",
  month =        jan,
  year =         "2011",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1925844.1926457",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jan 26 15:06:39 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Li:2011:FSM,
  author =       "Guodong Li and Robert Palmer and Michael DeLisi and
                 Ganesh Gopalakrishnan and Robert M. Kirby",
  title =        "Formal specification of {MPI 2.0}: {Case} study in
                 specifying a practical concurrent programming {API}",
  journal =      j-SCI-COMPUT-PROGRAM,
  volume =       "76",
  number =       "2",
  pages =        "65--81",
  day =          "1",
  month =        feb,
  year =         "2011",
  CODEN =        "SCPGD4",
  ISSN =         "0167-6423 (print), 1872-7964 (electronic)",
  ISSN-L =       "0167-6423",
  bibdate =      "Fri Apr 1 18:39:40 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/scicomputprogram.bib;
                 http://www.sciencedirect.com/science/journal/01676423",
  acknowledgement = ack-nhfb,
  fjournal =     "Science of Computer Programming",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01676423/",
}

@Article{Li:2011:LCM,
  author =       "Sheng Li and Shannon Kuntz and Jay B. Brockman and
                 Peter M. Kogge",
  title =        "{Lightweight Chip Multi-Threading (LCMT)}: Maximizing
                 Fine-Grained Parallelism On-Chip",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "22",
  number =       "7",
  pages =        "1178--1191",
  month =        jul,
  year =         "2011",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2010.169",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Fri Jul 22 07:54:38 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Liao:2011:AUB,
  author =       "Xiongfei Liao and Thambipillai Srikanthan",
  title =        "Accelerating {UNISIM}-Based Cycle-Level
                 Microarchitectural Simulations on Multicore Platforms",
  journal =      j-TODAES,
  volume =       "16",
  number =       "3",
  pages =        "26:1--26:??",
  month =        jun,
  year =         "2011",
  CODEN =        "ATASFO",
  DOI =          "https://doi.org/10.1145/1970353.1970359",
  ISSN =         "1084-4309 (print), 1557-7309 (electronic)",
  ISSN-L =       "1084-4309",
  bibdate =      "Tue Jun 14 11:55:50 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/todaes/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "UNISIM has been shown to ease the development of
                 simulators for multi-/many-core systems. However,
                 UNISIM cycle-level simulations of large-scale
                 multiprocessor systems could be very time consuming. In
                 this article, we propose a systematic framework for
                 accelerating UNISIM cycle-level simulations on
                 multicore platforms. The proposed framework relies on
                 exploiting the fine-grained parallelism within the
                 simulated cycles using POSIX threads. A multithreaded
                 simulation engine has been devised from the
                 single-threaded UNISIM SystemC engine to facilitate the
                 exploitation of inherent parallelism. An adaptive
                 technique that manages the overall computation workload
                 by adjusting the number of threads employed at any
                 given time is proposed. In addition, we have introduced
                 a technique to balance the workloads of multithreaded
                 executions.",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Transactions on Design Automation of Electronic
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J776",
}

@Article{Ma:2011:SPC,
  author =       "Kai Ma and Xue Li and Ming Chen and Xiaorui Wang",
  title =        "Scalable power control for many-core architectures
                 running multi-threaded applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "3",
  pages =        "449--460",
  month =        jun,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024723.2000117",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 5 17:15:11 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Mahafzah:2011:PMI,
  author =       "Basel A. Mahafzah",
  title =        "Parallel multithreaded {IDA*} heuristic search:
                 algorithm design and performance evaluation",
  journal =      j-INT-J-PAR-EMER-DIST-SYS,
  volume =       "26",
  number =       "1",
  pages =        "61--82",
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1080/17445761003604521",
  ISSN =         "1744-5760 (print), 1744-5779 (electronic)",
  ISSN-L =       "1744-5760",
  bibdate =      "Mon Sep 5 20:33:09 MDT 2011",
  bibsource =    "http://www.informaworld.com/smpp/title~content=t713729127~link=cover;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://www.tandfonline.com/loi/gpaa20",
  onlinedate =   "6 Dec 2010",
}

@Article{Marino:2011:CSP,
  author =       "Daniel Marino and Abhayendra Singh and Todd Millstein
                 and Madanlal Musuvathi and Satish Narayanasamy",
  title =        "A case for an {SC}-preserving compiler",
  journal =      j-SIGPLAN,
  volume =       "46",
  number =       "6",
  pages =        "199--210",
  month =        jun,
  year =         "2011",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/1993316.1993522",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Jun 9 10:23:33 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The most intuitive memory consistency model for
                 shared-memory multi-threaded programming is sequential
                 consistency (SC). However, current concurrent
                 programming languages support a relaxed model, as such
                 relaxations are deemed necessary for enabling important
                 optimizations. This paper demonstrates that an
                 SC-preserving compiler, one that ensures that every SC
                 behavior of a compiler-generated binary is an SC
                 behavior of the source program, retains most of the
                 performance benefits of an optimizing compiler. The key
                 observation is that a large class of optimizations
                 crucial for performance are either already
                 SC-preserving or can be modified to preserve SC while
                 retaining much of their effectiveness. An SC-preserving
                 compiler, obtained by restricting the optimization
                 phases in LLVM, a state-of-the-art C/C++ compiler,
                 incurs an average slowdown of 3.8\% and a maximum
                 slowdown of 34\% on a set of 30 programs from the
                 SPLASH-2, PARSEC, and SPEC CINT2006 benchmark
                 suites.\par

                 While the performance overhead of preserving SC in the
                 compiler is much less than previously assumed, it might
                 still be unacceptable for certain applications. We
                 believe there are several avenues for improving
                 performance without giving up SC-preservation. In this
                 vein, we observe that the overhead of our SC-preserving
                 compiler arises mainly from its inability to
                 aggressively perform a class of optimizations we
                 identify as eager-load optimizations. This class
                 includes common-subexpression elimination, constant
                 propagation, global value numbering, and common cases
                 of loop-invariant code motion. We propose a notion of
                 interference checks in order to enable eager-load
                 optimizations while preserving SC. Interference checks
                 expose to the compiler a commonly used hardware
                 speculation mechanism that can efficiently detect
                 whether a particular variable has changed its value
                 since last read.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  keywords =     "LLVM compiler suite; sequential consistency (SC)",
}

@InProceedings{Preissl:2011:MGA,
  author =       "Robert Preissl and Nathan Wichmann and Bill Long and
                 John Shalf and Stephane Ethier and Alice Koniges",
  title =        "Multithreaded Global Address Space Communication
                 Techniques for Gyrokinetic Fusion Applications on
                 Ultra-Scale Platforms",
  crossref =     "Lathrop:2011:SPI",
  pages =        "12:1--12:11",
  year =         "2011",
  DOI =          "https://doi.org/10.1145/2063384.2071033",
  bibdate =      "Fri Dec 16 11:05:47 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/supercomputing2011.bib",
  acknowledgement = ack-nhfb,
}

@Article{Prieto:2011:MCM,
  author =       "Pablo Prieto and Valentin Puente and Jose-Angel
                 Gregorio",
  title =        "Multilevel Cache Modeling for Chip-Multiprocessor
                 Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "49--52",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.20",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper presents a simple analytical model for
                 predicting on-chip cache hierarchy effectiveness in
                 chip multiprocessors (CMP) for a state-of-the-art
                 architecture. Given the complexity of this type of
                 systems, we use rough approximations, such as the
                 empirical observation that the re-reference timing
                 pattern follows a power law and the assumption of a
                 simplistic delay model for the cache, in order to
                 provide a useful model for the memory hierarchy
                 responsiveness. This model enables the analytical
                 determination of average access time, which makes
                 design space pruning useful before sweeping the vast
                 design space of this class of systems. The model is
                 also useful for predicting cache hierarchy behavior in
                 future systems. The fidelity of the model has been
                 validated using a state-of-the-art, full-system
                 simulation environment, on a system with up to sixteen
                 out-of-order processors with cache-coherent caches and
                 using a broad spectrum of applications, including
                 complex multithread workloads. This simple model can
                 predict a near-to-optimal, on-chip cache distribution
                 while also estimating how future systems running future
                 applications might behave.",
  acknowledgement = ack-nhfb,
  affiliation =  "Prieto, P (Reprint Author), Univ Cantabria, Cantabria,
                 Spain. Prieto, Pablo; Puente, Valentin; Gregorio,
                 Jose-Angel, Univ Cantabria, Cantabria, Spain.",
  author-email = "prietop@unican.es vpuente@unican.es
                 monaster@unican.es",
  da =           "2019-06-20",
  doc-delivery-number = "855NW",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Spanish Ministry of Science and Innovation
                 [TIN2010-18159]; HiPEAC2 European Network of
                 Excellence",
  funding-text = "This work has been supported by the Spanish Ministry
                 of Science and Innovation, under contracts
                 TIN2010-18159, and by the HiPEAC2 European Network of
                 Excellence. The authors would like to thank the
                 reviewers for their valuable comments.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  number-of-cited-references = "13",
  ORCID-numbers = "Prieto, Pablo/0000-0002-5818-1188 Puente,
                 Valentin/0000-0002-6904-3282 Gregorio, Jose
                 Angel/0000-0003-2214-303X",
  research-areas = "Computer Science",
  times-cited =  "3",
  unique-id =    "Prieto:2011:MCM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Reddy:2011:BFH,
  author =       "Dheeraj Reddy and David Koufaty and Paul Brett and
                 Scott Hahn",
  title =        "Bridging functional heterogeneity in multicore
                 architectures",
  journal =      j-OPER-SYS-REV,
  volume =       "45",
  number =       "1",
  pages =        "21--33",
  month =        jan,
  year =         "2011",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/1945023.1945028",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Fri Feb 25 16:43:23 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Heterogeneous processors that mix big high performance
                 cores with small low power cores promise excellent
                 single-threaded performance coupled with high
                 multi-threaded throughput and higher
                 performance-per-watt. A significant portion of the
                 commercial multicore heterogeneous processors are
                  likely to have a common instruction set architecture
                  (ISA). However, due to limited design resources and
                 goals, each core is likely to contain ISA extensions
                 not yet implemented in the other core. Therefore, such
                 heterogeneous processors will have inherent functional
                 asymmetry at the ISA level and face significant
                 software challenges. This paper analyzes the software
                 challenges to the operating system and the application
                 layer software on a heterogeneous system with
                 functional asymmetry, where the ISA of the small and
                 big cores overlaps.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
}

@Article{Roy:2011:SRP,
  author =       "Soumyaroop Roy and Nagarajan Ranganathan and Srinivas
                 Katkoori",
  title =        "State-Retentive Power Gating of Register Files in
                 Multicore Processors Featuring Multithreaded In-Order
                 Cores",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "60",
  number =       "11",
  pages =        "1547--1560",
  month =        nov,
  year =         "2011",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2010.249",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Tue Sep 27 07:57:50 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5669257",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Schonherr:2011:MTI,
  author =       "M. Sch{\"o}nherr and K. Kucher and M. Geier and M.
                 Stiebler and S. Freudiger and M. Krafczyk",
  title =        "Multi-thread implementations of the lattice
                 {Boltzmann} method on non-uniform grids for {CPUs} and
                 {GPUs}",
  journal =      j-COMPUT-MATH-APPL,
  volume =       "61",
  number =       "12",
  pages =        "3730--3743",
  month =        jun,
  year =         "2011",
  CODEN =        "CMAPDK",
  ISSN =         "0898-1221 (print), 1873-7668 (electronic)",
  ISSN-L =       "0898-1221",
  bibdate =      "Wed Mar 1 21:50:48 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/computmathappl2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0898122111002999",
  acknowledgement = ack-nhfb,
  fjournal =     "Computers and Mathematics with Applications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/08981221",
}

@Article{Tu:2011:MBM,
  author =       "Xuping Tu and Hai Jin and Zhibin Yu and Jie Chen and
                  Yabin Hu and Xia Xie",
  title =        "{MT-BTRIMER}: A master-slave multi-threaded dynamic
                 binary translator",
  journal =      j-INT-J-COMPUT-SYST-SCI-ENG,
  volume =       "26",
  number =       "5",
  pages =        "??--??",
  month =        sep,
  year =         "2011",
  CODEN =        "CSSEEI",
  ISSN =         "0267-6192",
  ISSN-L =       "0267-6192",
  bibdate =      "Tue Dec 3 12:04:33 MST 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/computsystscieng.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Computer Systems Science and
                 Engineering",
}

@Article{VanDeGeijn:2011:HPD,
  author =       "Robert A. {van de Geijn} and Field G. {Van Zee}",
  title =        "High-performance up-and-downdating via
                 {Householder}-like transformations",
  journal =      j-TOMS,
  volume =       "38",
  number =       "1",
  pages =        "4:1--4:17",
  month =        nov,
  year =         "2011",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/2049662.2049666",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Thu Dec 15 08:59:34 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toms.bib",
  abstract =     "We present high-performance algorithms for
                 up-and-downdating a Cholesky factor or QR
                 factorization. The method uses Householder-like
                 transformations, sometimes called hyperbolic
                 Householder transformations, that are accumulated so
                 that most computation can be cast in terms of
                 high-performance matrix-matrix operations. The
                 resulting algorithms can then be used as building
                 blocks for an algorithm-by-blocks that allows
                 computation to be conveniently scheduled to
                 multithreaded architectures like multicore processors.
                 Performance is shown to be similar to that achieved by
                 a blocked QR factorization via Householder
                 transformations.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Vandierendonck:2011:FMM,
  author =       "Hans Vandierendonck and Andr{\'e} Seznec",
  title =        "Fairness Metrics for Multi-Threaded Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "4--7",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multi-threaded processors execute multiple threads
                 concurrently in order to increase overall throughput.
                 It is well documented that multi-threading affects
                 per-thread performance but, more importantly, some
                 threads are affected more than others. This is
                 especially troublesome for multi-programmed workloads.
                 Fairness metrics measure whether all threads are
                 affected equally. However defining equal treatment is
                 not straightforward. Several fairness metrics for
                 multi-threaded processors have been utilized in the
                 literature, although there does not seem to be a
                 consensus on what metric does the best job of measuring
                 fairness. This paper reviews the prevalent fairness
                 metrics and analyzes their main properties. Each metric
                 strikes a different trade-off between fairness in the
                 strict sense and throughput. We categorize the metrics
                 with respect to this property. Based on experimental
                 data for SMT processors, we suggest using the minimum
                 fairness metric in order to balance fairness and
                 throughput.",
  acknowledgement = ack-nhfb,
  affiliation =  "Vandierendonck, H (Reprint Author), Univ Ghent, Dept
                 Elect \& Informat Syst, Ghent, Belgium. Vandierendonck,
                 Hans, Univ Ghent, Dept Elect \& Informat Syst, Ghent,
                 Belgium. Seznec, Andre, INRIA Rennes, Rennes, France.",
  author-email = "hans.vandierendonck@elis.ugent.be
                 Andre.Seznec@inria.fr",
  da =           "2019-06-20",
  doc-delivery-number = "773ZN",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "fairness; measurement; multi-programming;
                 Multi-threaded processors; quality-of-service",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "13",
  unique-id =    "Vandierendonck:2011:FMM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Vandierendonck:2011:MSR,
  author =       "Hans Vandierendonck and Andr{\'e} Seznec",
  title =        "Managing {SMT} resource usage through speculative
                 instruction window weighting",
  journal =      j-TACO,
  volume =       "8",
  number =       "3",
  pages =        "12:1--12:??",
  month =        oct,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2019608.2019611",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 22 09:15:12 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Simultaneous multithreading processors dynamically
                 share processor resources between multiple threads. In
                 general, shared SMT resources may be managed
                 explicitly, for instance, by dynamically setting queue
                 occupation bounds for each thread as in the DCRA and
                 Hill-Climbing policies. Alternatively, resources may be
                 managed implicitly; that is, resource usage is
                 controlled by placing the desired instruction mix in
                 the resources. In this case, the main resource
                 management tool is the instruction fetch policy which
                 must predict the behavior of each thread (branch
                 mispredictions, long-latency loads, etc.) as it fetches
                 instructions. In this article, we present the use of
                 Speculative Instruction Window Weighting (SIWW) to
                 bridge the gap between implicit and explicit SMT fetch
                 policies.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Yu:2011:SDH,
  author =       "Wing-kei S. Yu and Ruirui Huang and Sarah Q. Xu and
                 Sung-En Wang and Edwin Kan and G. Edward Suh",
  title =        "{SRAM--DRAM} hybrid memory with applications to
                 efficient register files in fine-grained
                 multi-threading",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "3",
  pages =        "247--258",
  month =        jun,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024723.2000094",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 5 17:15:11 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Zhao:2011:DCC,
  author =       "Qin Zhao and David Koh and Syed Raza and Derek
                 Bruening and Weng-Fai Wong and Saman Amarasinghe",
  title =        "Dynamic cache contention detection in multi-threaded
                 applications",
  journal =      j-SIGPLAN,
  volume =       "46",
  number =       "7",
  pages =        "27--38",
  month =        jul,
  year =         "2011",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2007477.1952688",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Sep 16 10:02:34 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Zhu:2011:TPS,
  author =       "David (Yu) Zhu and Jaeyeon Jung and Dawn Song and
                 Tadayoshi Kohno and David Wetherall",
  title =        "{TaintEraser}: protecting sensitive data leaks using
                 application-level taint tracking",
  journal =      j-OPER-SYS-REV,
  volume =       "45",
  number =       "1",
  pages =        "142--154",
  month =        jan,
  year =         "2011",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/1945023.1945039",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Fri Feb 25 16:43:23 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We present TaintEraser, a new tool that tracks the
                 movement of sensitive user data as it flows through
                 off-the-shelf applications. TaintEraser uses
                 application-level dynamic taint analysis to let users
                 run applications in their own environment while
                 preventing unwanted information exposure. It is made
                 possible by techniques we developed for accurate and
                 efficient tainting: (1) Semantic-aware
                 instruction-level tainting is critical to track taint
                 accurately, without explosion or loss. (2) Function
                 summaries provide an interface to handle taint
                 propagation within the kernel and reduce the overhead
                 of instruction-level tracking. (3) On-demand
                 instrumentation enables fast loading of large
                 applications. Together, these techniques let us analyze
                 large, multi-threaded, networked applications in near
                 real-time.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
}

@Article{Zhuang:2011:CST,
  author =       "Xiaotong Zhuang and Santosh Pande",
  title =        "Compiler-Supported Thread Management for Multithreaded
                 Network Processors",
  journal =      j-TECS,
  volume =       "10",
  number =       "4",
  pages =        "44:1--44:??",
  month =        nov,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2043662.2043668",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Mon Dec 19 15:49:06 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tecs.bib",
  abstract =     "Traditionally, runtime management involving CPU
                 sharing, real-time scheduling, etc., is provided by the
                 runtime environment (typically an operating system)
                 using hardware support such as timers and interrupts.
                 However, due to stringent performance requirements on
                 network processors, neither OS nor hardware mechanisms
                 are typically feasible/available. Mapping packet
                 processing tasks on network processors involves complex
                 trade-offs to maximize parallelism and pipelining. Due
                 to an increase in the size of the code store and
                 complexity of application requirements, network
                 processors are being programmed with heterogeneous
                 threads that may execute code belonging to different
                 tasks on a given micro-engine. Also, most network
                 applications are streaming applications that are
                 typically processed in a pipelined fashion.",
  acknowledgement = ack-nhfb,
  articleno =    "44",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
}

@Article{Ahn:2012:ISE,
  author =       "Jung Ho Ahn and Norman P. Jouppi and Christos
                 Kozyrakis and Jacob Leverich and Robert S. Schreiber",
  title =        "Improving System Energy Efficiency with Memory Rank
                 Subsetting",
  journal =      j-TACO,
  volume =       "9",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133382.2133386",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 30 17:45:35 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "VLSI process technology scaling has enabled dramatic
                 improvements in the capacity and peak bandwidth of DRAM
                 devices. However, current standard DDR x DIMM memory
                 interfaces are not well tailored to achieve high energy
                 efficiency and performance in modern
                 chip-multiprocessor-based computer systems. Their
                 suboptimal performance and energy inefficiency can have
                 a significant impact on system-wide efficiency since
                 much of the system power dissipation is due to memory
                 power. New memory interfaces, better suited for future
                 many-core systems, are needed. In response, there are
                 recent proposals to enhance the energy efficiency of
                 main-memory systems by dividing a memory rank into
                 subsets, and making a subset rather than a whole rank
                 serve a memory request. We holistically assess the
                 effectiveness of rank subsetting from system-wide
                 performance, energy-efficiency, and reliability
                 perspectives. We identify the impact of rank subsetting
                 on memory power and processor performance analytically,
                 compare two promising rank-subsetting proposals,
                 Multicore DIMM and mini-rank, and verify our analysis
                 by simulating a chip-multiprocessor system using
                 multithreaded and consolidated workloads. We extend the
                 design of Multicore DIMM for high-reliability systems
                 and show that compared with conventional chipkill
                 approaches, rank subsetting can lead to much higher
                 system-level energy efficiency and performance at the
                 cost of additional DRAM devices. This holistic
                 assessment shows that rank subsetting offers compelling
                 alternatives to existing processor-memory interfaces
                 for future DDR systems.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Journal article: dense generalized eigenproblem solvers on
%%% multi-threaded architectures; Appl. Math. Comput. 218(22), 2012.
@Article{Aliaga:2012:SDG,
  author =       "Jos{\'e} I. Aliaga and Paolo Bientinesi and Davor
                 Davidovi{\'c} and Edoardo {Di Napoli} and Francisco D.
                 Igual and Enrique S. Quintana-Ort{\'\i}",
  title =        "Solving dense generalized eigenproblems on
                 multi-threaded architectures",
  journal =      j-APPL-MATH-COMP,
  volume =       "218",
  number =       "22",
  pages =        "11279--11289",
  day =          "15",
  month =        jul,
  year =         "2012",
  CODEN =        "AMHCBQ",
  DOI =          "https://doi.org/10.1016/j.amc.2012.05.020",
  ISSN =         "0096-3003 (print), 1873-5649 (electronic)",
  ISSN-L =       "0096-3003",
  bibdate =      "Mon Jun 25 12:18:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/applmathcomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/00963003",
  URL =          "http://www.sciencedirect.com/science/article/pii/S009630031200505X",
  acknowledgement = ack-nhfb,
  fjournal =     "Applied Mathematics and Computation",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00963003",
}

%%% Journal article (ISCA '12 proceedings reissued in SIGARCH CAN
%%% 40(3)): decoupled access/execute fragment processing for mobile
%%% GPU energy efficiency.
@Article{Arnau:2012:BMG,
  author =       "Jos{\'e}-Mar{\'\i}a Arnau and Joan-Manuel Parcerisa
                 and Polychronis Xekalakis",
  title =        "Boosting mobile {GPU} performance with a decoupled
                 access\slash execute fragment processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "84--93",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337169",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Smartphones represent one of the fastest growing
                 markets, providing significant hardware/software
                 improvements every few months. However, supporting
                 these capabilities reduces the operating time per
                 battery charge. The CPU/GPU component is only left with
                 a shrinking fraction of the power budget, since most of
                 the energy is consumed by the screen and the antenna.
                 In this paper, we focus on improving the energy
                 efficiency of the GPU since graphical applications
                 consist an important part of the existing market.
                 Moreover, the trend towards better screens will
                 inevitably lead to a higher demand for improved
                 graphics rendering. We show that the main bottleneck
                 for these applications is the texture cache and that
                 traditional techniques for hiding memory latency
                 (prefetching, multithreading) do not work well or come
                 at a high energy cost. We thus propose the migration of
                 GPU designs towards the decoupled access-execute
                 concept. Furthermore, we significantly reduce bandwidth
                 usage in the decoupled architecture by exploiting
                 inter-core data sharing. Using commercial Android
                 applications, we show that the end design can achieve
                 93\% of the performance of a heavily multithreaded GPU
                 while providing energy savings of 34\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

%%% Journal article (PPOPP '12 proceedings reissued in SIGPLAN Notices
%%% 47(8)): trace-based Monte Carlo evaluation of GPU memory-hierarchy
%%% performance.
@Article{Baghsorkhi:2012:EPE,
  author =       "Sara S. Baghsorkhi and Isaac Gelado and Matthieu
                 Delahaye and Wen-mei W. Hwu",
  title =        "Efficient performance evaluation of memory hierarchy
                 for highly multithreaded graphics processors",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "8",
  pages =        "23--34",
  month =        aug,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2370036.2145820",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Sep 12 12:11:57 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "PPOPP '12 conference proceedings.",
  abstract =     "With the emergence of highly multithreaded
                 architectures, performance monitoring techniques face
                 new challenges in efficiently locating sources of
                 performance discrepancies in the program source code.
                 For example, the state-of-the-art performance counters
                 in highly multithreaded graphics processing units
                 (GPUs) report only the overall occurrences of
                 microarchitecture events at the end of program
                 execution. Furthermore, even if supported, any
                 fine-grained sampling of performance counters will
                 distort the actual program behavior and will make the
                 sampled values inaccurate. On the other hand, it is
                 difficult to achieve high resolution performance
                 information at low sampling rates in the presence of
                 thousands of concurrently running threads. In this
                 paper, we present a novel software-based approach for
                 monitoring the memory hierarchy performance in highly
                 multithreaded general-purpose graphics processors. The
                 proposed analysis is based on memory traces collected
                 for snapshots of an application execution. A
                 trace-based memory hierarchy model with a Monte Carlo
                 experimental methodology generates statistical bounds
                 of performance measures without being concerned about
                 the exact inter-thread ordering of individual events
                 but rather studying the behavior of the overall system.
                 The statistical approach overcomes the classical
                 problem of disturbed execution timing due to
                 fine-grained instrumentation. The approach scales well
                 as we deploy an efficient parallel trace collection
                 technique to reduce the trace generation overhead and a
                 simple memory hierarchy model to reduce the simulation
                 time. The proposed scheme also keeps track of
                 individual memory operations in the source code and can
                 quantify their efficiency with respect to the memory
                 system. A cross-validation of our results shows close
                 agreement with the values read from the hardware
                 performance counters on an NVIDIA Tesla C2050 GPU.
                 Based on the high resolution profile data produced by
                 our model we optimized memory accesses in the sparse
                 matrix vector multiply kernel and achieved speedups
                 ranging from 2.4 to 14.8 depending on the
                 characteristics of the input matrices.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

%%% Journal article (POPL '12 proceedings reissued in SIGPLAN Notices
%%% 47(1)): decidability and complexity of state-reachability for
%%% recursively parallel programs.
@Article{Bouajjani:2012:ARP,
  author =       "Ahmed Bouajjani and Michael Emmi",
  title =        "Analysis of recursively parallel programs",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "1",
  pages =        "203--214",
  month =        jan,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2103621.2103681",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Mar 15 18:16:55 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We propose a general formal model of isolated
                 hierarchical parallel computations, and identify
                 several fragments to match the concurrency constructs
                 present in real-world programming languages such as
                 Cilk and X10. By associating fundamental formal models
                 (vector addition systems with recursive transitions) to
                 each fragment, we provide a common platform for
                 exposing the relative difficulties of algorithmic
                 reasoning. For each case we measure the complexity of
                 deciding state-reachability for finite-data recursive
                 programs, and propose algorithms for the decidable
                 cases. The complexities which include PTIME, NP,
                 EXPSPACE, and 2EXPTIME contrast with undecidable
                 state-reachability for recursive multi-threaded
                 programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "POPL '12 conference proceedings.",
}

%%% Journal article: Freescale e6500 multithreaded embedded processor;
%%% IEEE Micro 32(5), September/October 2012.
@Article{Burgess:2012:EFL,
  author =       "David Burgess and Edmund Gieske and James Holt and
                 Thomas Hoy and Gary Whisenhunt",
  title =        "{e6500}: {Freescale}'s Low-Power, High-Performance
                 Multithreaded Embedded Processor",
  journal =      j-IEEE-MICRO,
  volume =       "32",
  number =       "5",
  pages =        "26--36",
  month =        sep # "\slash " # oct,
  year =         "2012",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2012.55",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Thu Nov 15 05:59:33 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

%%% Journal article (ASPLOS '12 proceedings reissued in SIGPLAN Notices
%%% 47(4)): semantic atomicity specification and checking for
%%% multithreaded programs.
@Article{Burnim:2012:SCS,
  author =       "Jacob Burnim and George Necula and Koushik Sen",
  title =        "Specifying and checking semantic atomicity for
                 multithreaded programs",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "4",
  pages =        "79--90",
  month =        apr,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2248487.1950377",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Jun 7 08:15:03 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "In practice, it is quite difficult to write correct
                 multithreaded programs due to the potential for
                 unintended and nondeterministic interference between
                 parallel threads. A fundamental correctness property
                 for such programs is atomicity---a block of code in a
                 program is atomic if, for any parallel execution of the
                 program, there is an execution with the same overall
                 program behavior in which the block is executed
                 serially. We propose semantic atomicity, a
                 generalization of atomicity with respect to a
                 programmer-defined notion of equivalent behavior. We
                 propose an assertion framework in which a programmer
                 can use bridge predicates to specify noninterference
                 properties at the level of abstraction of their
                 application. Further, we propose a novel algorithm for
                 systematically testing atomicity specifications on
                 parallel executions with a bounded number of
                 interruptions---i.e. atomic blocks whose execution is
                 interleaved with that of other threads. We further
                 propose a set of sound heuristics and optional user
                 annotations that increase the efficiency of checking
                 atomicity specifications in the common case where the
                 specifications hold. We have implemented our assertion
                 framework for specifying and checking semantic
                 atomicity for parallel Java programs, and we have
                 written semantic atomicity specifications for a number
                 of benchmarks. We found that using bridge predicates
                 allowed us to specify the natural and intended atomic
                 behavior of a wider range of programs than did previous
                 approaches. Further, in checking our specifications, we
                 found several previously unknown bugs, including in the
                 widely-used java.util.concurrent library.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ASPLOS '12 conference proceedings.",
}

%%% Journal article: graph coloring algorithms for multi-core and
%%% massively multithreaded architectures; Parallel Computing
%%% 38(10--11), 2012.
@Article{Catalyurek:2012:GCA,
  author =       "{\"U}mit V. {\c{C}}ataly{\"u}rek and John Feo and
                 Assefaw H. Gebremedhin and Mahantesh Halappanavar and
                 Alex Pothen",
  title =        "Graph coloring algorithms for multi-core and massively
                 multithreaded architectures",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "38",
  number =       "10--11",
  pages =        "576--594",
  month =        oct # "\slash " # nov,
  year =         "2012",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2012.07.001",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Thu Oct 25 09:00:31 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.sciencedirect.com/science/journal/01678191",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819112000592",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

%%% Conference paper (SC '12): critical lock analysis for diagnosing
%%% critical-section bottlenecks in multithreaded applications.
%%% NOTE(review): author surname corrected from ``Stenstrom'' to
%%% Stenstr{\"o}m (Swedish umlaut), using a BibTeX special character
%%% as done for accented names elsewhere in this file.
@InProceedings{Chen:2012:CLA,
  author =       "Guancheng Chen and Per Stenstr{\"o}m",
  title =        "Critical lock analysis: diagnosing critical section
                 bottlenecks in multithreaded applications",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "71:1--71:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "http://conferences.computer.org/sc/2012/papers/1000a099.pdf",
  abstract =     "Critical sections are well known potential performance
                 bottlenecks in multithreaded applications and
                 identifying the ones that inhibit scalability are
                 important for performance optimizations. While previous
                 approaches use idle time as a key measure, we show such
                 a measure is not reliable. The reason is that idleness
                 does not necessarily mean the critical section is on
                 the critical path. We introduce critical lock analysis,
                 a new method for diagnosing critical section
                 bottlenecks in multithreaded applications. Our method
                 firstly identifies the critical sections appearing on
                 the critical path, and then quantifies the impact of
                 such critical sections on the overall performance by
                 using quantitative performance metrics. Case studies
                 show that our method can successfully identify critical
                 sections that are most beneficial for improving overall
                 performance as well as quantify their performance
                 impact on the critical path, which results in a more
                 reliable establishment of the inherent critical section
                 bottlenecks than previous approaches.",
  acknowledgement = ack-nhfb,
  articleno =    "71",
}

%%% Journal article: SMT scheduling to reduce register-file
%%% temperature; Concurrency and Computation: Practice and Experience
%%% 24(12), 2012.
@Article{Chen:2012:MLS,
  author =       "Chih-Yuan Chen and Jhong-Yi Ciou and Rong-Guey Chang",
  title =        "Multi-level simultaneous multithreading scheduling to
                 reduce the temperature of register files",
  journal =      j-CCPE,
  volume =       "24",
  number =       "12",
  pages =        "1296--1316",
  day =          "25",
  month =        aug,
  year =         "2012",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1831",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Nov 5 07:44:51 MST 2012",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "22 Sep 2011",
}

%%% Journal article (ASPLOS '12 proceedings reissued in SIGARCH CAN
%%% 40(1)): RCU-based balanced trees for scalable address spaces.
%%% NOTE(review): inserted the missing space in ``cores, suggesting''
%%% in the abstract (text-extraction artifact).
@Article{Clements:2012:SAS,
  author =       "Austin T. Clements and M. Frans Kaashoek and Nickolai
                 Zeldovich",
  title =        "Scalable address spaces using {RCU} balanced trees",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "199--210",
  month =        mar,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2189750.2150998",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "Software developers commonly exploit multicore
                 processors by building multithreaded software in which
                 all threads of an application share a single address
                 space. This shared address space has a cost: kernel
                 virtual memory operations such as handling soft page
                 faults, growing the address space, mapping files, etc.
                 can limit the scalability of these applications. In
                 widely-used operating systems, all of these operations
                 are synchronized by a single per-process lock. This
                 paper contributes a new design for increasing the
                 concurrency of kernel operations on a shared address
                 space by exploiting read-copy-update (RCU) so that soft
                 page faults can both run in parallel with operations
                 that mutate the same address space and avoid contending
                 with other page faults on shared cache lines. To enable
                 such parallelism, this paper also introduces an
                 RCU-based binary balanced tree for storing memory
                 mappings. An experimental evaluation using three
                 multithreaded applications shows performance
                 improvements on 80 cores ranging from 1.7x to 3.4x for
                 an implementation of this design in the Linux 2.6.37
                 kernel. The RCU-based binary tree enables soft page
                 faults to run at a constant cost with an increasing
                 number of cores, suggesting that the design will scale
                 well beyond 80 cores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

%%% Journal article (ISCA '12 proceedings reissued in SIGARCH CAN
%%% 40(3)): RADISH, a hybrid hardware/software always-on precise data
%%% race detector.
@Article{Devietti:2012:RAS,
  author =       "Joseph Devietti and Benjamin P. Wood and Karin Strauss
                 and Luis Ceze and Dan Grossman and Shaz Qadeer",
  title =        "{RADISH}: always-on sound and complete
                 {{\underline{Ra}ce \underline{D}etection \underline{i}n
                 \underline{S}oftware and \underline{H}ardware}}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "201--212",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337182",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Data-race freedom is a valuable safety property for
                 multithreaded programs that helps with catching bugs,
                 simplifying memory consistency model semantics, and
                 verifying and enforcing both atomicity and determinism.
                 Unfortunately, existing software-only dynamic race
                 detectors are precise but slow; proposals with hardware
                 support offer higher performance but are imprecise.
                 Both precision and performance are necessary to achieve
                 the many advantages always-on dynamic race detection
                 could provide. To resolve this trade-off, we propose
                 Radish, a hybrid hardware-software dynamic race
                 detector that is always-on and fully precise. In
                 Radish, hardware caches a principled subset of the
                 metadata necessary for race detection; this subset
                 allows the vast majority of race checks to occur
                 completely in hardware. A flexible software layer
                 handles persistence of race detection metadata on cache
                 evictions and occasional queries to this expanded set
                 of metadata. We show that Radish is correct by proving
                 equivalence to a conventional happens-before race
                 detector. Our design has modest hardware complexity:
                 caches are completely unmodified and we piggy-back on
                 existing coherence messages but do not otherwise modify
                 the protocol. Furthermore, Radish can leverage
                 type-safe languages to reduce overheads substantially.
                 Our evaluation of a simulated 8-core Radish processor
                 using PARSEC benchmarks shows runtime overheads from
                 negligible to 2x, outperforming the leading
                 software-only race detector by 2x-37x.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

%%% Journal article (ASPLOS '12 proceedings reissued in SIGPLAN Notices
%%% 47(4)): RCDC, a relaxed-consistency deterministic multiprocessor
%%% architecture.
@Article{Devietti:2012:RRC,
  author =       "Joseph Devietti and Jacob Nelson and Tom Bergan and
                 Luis Ceze and Dan Grossman",
  title =        "{RCDC}: a relaxed consistency deterministic computer",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "4",
  pages =        "67--78",
  month =        apr,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2248487.1950376",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Jun 7 08:15:03 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Providing deterministic execution significantly
                 simplifies the debugging, testing, replication, and
                 deployment of multithreaded programs. Recent work has
                 developed deterministic multiprocessor architectures as
                 well as compiler and runtime systems that enforce
                 determinism in current hardware. Such work has
                 incidentally imposed strong memory-ordering properties.
                 Historically, memory ordering has been relaxed in favor
                 of higher performance in shared memory multiprocessors
                 and, interestingly, determinism exacerbates the cost of
                 strong memory ordering. Consequently, we argue that
                 relaxed memory ordering is vital to achieving faster
                 deterministic execution. This paper introduces RCDC, a
                 deterministic multiprocessor architecture that takes
                 advantage of relaxed memory orderings to provide
                 high-performance deterministic execution with low
                 hardware complexity. RCDC has two key innovations: a
                 hybrid HW/SW approach to enforcing determinism; and a
                 new deterministic execution strategy that leverages
                 data-race-free-based memory models (e.g., the models
                 for Java and C++) to improve performance and
                 scalability without sacrificing determinism, even in
                 the presence of races. In our hybrid HW/SW approach,
                 the only hardware mechanisms required are
                 software-controlled store buffering and support for
                 precise instruction counting; we do not require
                 speculation. A runtime system uses these mechanisms to
                 enforce determinism for arbitrary programs. We evaluate
                 RCDC using PARSEC benchmarks and show that relaxing
                 memory ordering leads to performance and scalability
                 close to nondeterministic execution without requiring
                 any form of speculation. We also compare our new
                 execution strategy to one based on TSO
                 (total-store-ordering) and show that some applications
                 benefit significantly from the extra relaxation. We
                 also evaluate a software-only implementation of our new
                 deterministic execution strategy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ASPLOS '12 conference proceedings.",
}

%%% Conference paper (SC '12): compiler-directed file layout
%%% optimization for hierarchical storage caches.
%%% NOTE(review): abstract is reproduced verbatim from the paper,
%%% including the phrases ``effects the behavior'' and ``taken not
%%% much attention'' -- do not ``correct'' without checking the
%%% published abstract.
@InProceedings{Ding:2012:CDF,
  author =       "Wei Ding and Yuanrui Zhang and Mahmut Kandemir and
                 Seung Woo Son",
  title =        "Compiler-directed file layout optimization for
                 hierarchical storage systems",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "41:1--41:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "http://conferences.computer.org/sc/2012/papers/1000a030.pdf",
  abstract =     "File layout of array data is a critical factor that
                 effects the behavior of storage caches, and has so far
                 taken not much attention in the context of hierarchical
                 storage systems. The main contribution of this paper is
                 a compiler-driven file layout optimization scheme for
                 hierarchical storage caches. This approach, fully
                 automated within an optimizing compiler, analyzes a
                 multi-threaded application code and determines a file
                 layout for each disk-resident array referenced by the
                 code, such that the performance of the target storage
                 cache hierarchy is maximized. We tested our approach
                 using 16 I/O intensive application programs and
                 compared its performance against two previously
                 proposed approaches under different cache space
                 management schemes. Our experimental results show that
                 the proposed approach improves the execution time of
                 these parallel applications by 23.7\% on average.",
  acknowledgement = ack-nhfb,
  articleno =    "41",
}

%%% Journal article: data-centric synchronization via atomic sets (the
%%% AJ language); ACM TOPLAS 34(1), article 4, April 2012.
@Article{Dolby:2012:DCA,
  author =       "Julian Dolby and Christian Hammer and Daniel Marino
                 and Frank Tip and Mandana Vaziri and Jan Vitek",
  title =        "A data-centric approach to synchronization",
  journal =      j-TOPLAS,
  volume =       "34",
  number =       "1",
  pages =        "4:1--4:48",
  month =        apr,
  year =         "2012",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/2160910.2160913",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Mon Apr 30 17:20:50 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toplas.bib",
  abstract =     "Concurrency-related errors, such as data races, are
                 frustratingly difficult to track down and eliminate in
                 large object-oriented programs. Traditional approaches
                 to preventing data races rely on protecting instruction
                 sequences with synchronization operations. Such
                 control-centric approaches are inherently brittle, as
                 the burden is on the programmer to ensure that all
                 concurrently accessed memory locations are consistently
                 protected. Data-centric synchronization is an
                 alternative approach that offloads some of the work on
                 the language implementation. Data-centric
                 synchronization groups fields of objects into atomic
                 sets to indicate that these fields must always be
                 updated atomically. Each atomic set has associated
                 units of work, that is, code fragments that preserve
                 the consistency of that atomic set. Synchronization
                 operations are added automatically by the compiler. We
                 present an extension to the Java programming language
                 that integrates annotations for data-centric
                 concurrency control. The resulting language, called AJ,
                 relies on a type system that enables separate
                 compilation and supports atomic sets that span multiple
                 objects and that also supports full encapsulation for
                 more efficient code generation. We evaluate our
                 proposal by refactoring classes from standard
                 libraries, as well as a number of multithreaded
                 benchmarks, to use atomic sets. Our results suggest
                 that data-centric synchronization is easy to use and
                 enjoys low annotation overhead, while successfully
                 preventing data races. Moreover, experiments on the
                 SPECjbb benchmark suggest that acceptable performance
                 can be achieved with a modest amount of tuning.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

@Article{Esmaeilzadeh:2012:LBL,
  author =       "Hadi Esmaeilzadeh and Ting Cao and Xi Yang and Stephen
                 M. Blackburn and Kathryn S. McKinley",
  title =        "Looking back on the language and hardware revolutions:
                 measured power, performance, and scaling",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "4",
  pages =        "319--332",
  month =        apr,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2248487.1950402",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Jun 7 08:15:03 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "This paper reports and analyzes measured chip power
                 and performance on five process technology generations
                 executing 61 diverse benchmarks with a rigorous
                 methodology. We measure representative Intel IA32
                 processors with technologies ranging from 130nm to 32nm
                 while they execute sequential and parallel benchmarks
                 written in native and managed languages. During this
                 period, hardware and software changed substantially:
                 (1) hardware vendors delivered chip multiprocessors
                 instead of uniprocessors, and independently (2)
                 software developers increasingly chose managed
                 languages instead of native languages. This
                 quantitative data reveals the extent of some known and
                 previously unobserved hardware and software trends. Two
                 themes emerge. (I) Workload: The power, performance,
                 and energy trends of native workloads do not
                 approximate managed workloads. For example, (a) the
                 SPEC CPU2006 native benchmarks on the i7 (45) and i5
                 (32) draw significantly less power than managed or
                 scalable native benchmarks; and (b) managed runtimes
                 exploit parallelism even when running single-threaded
                 applications. The results recommend architects always
                 include native and managed workloads when designing and
                 evaluating energy efficient hardware. (II)
                 Architecture: Clock scaling, microarchitecture,
                 simultaneous multithreading, and chip multiprocessors
                 each elicit a huge variety of power, performance, and
                 energy responses. This variety and the difficulty of
                 obtaining power measurements recommends exposing
                 on-chip power meters and when possible structure
                 specific power meters for cores, caches, and other
                 structures. Just as hardware event counters provide a
                 quantitative grounding for performance innovations,
                 power meters are necessary for optimizing energy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ASPLOS '12 conference proceedings.",
}

@Article{Eyerman:2012:PMJ,
  author =       "Stijn Eyerman and Lieven Eeckhout",
  title =        "Probabilistic modeling for job symbiosis scheduling on
                 {SMT} processors",
  journal =      j-TACO,
  volume =       "9",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2207222.2207223",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Jun 13 17:20:51 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Symbiotic job scheduling improves simultaneous
                 multithreading (SMT) processor performance by
                 coscheduling jobs that have ``compatible'' demands on
                 the processor's shared resources. Existing approaches
                 however require a sampling phase, evaluate a limited
                 number of possible coschedules, use heuristics to gauge
                 symbiosis, are rigid in their optimization target, and
                 do not preserve system-level priorities/shares. This
                 article proposes probabilistic job symbiosis modeling,
                 which predicts whether jobs will create positive or
                 negative symbiosis when coscheduled without requiring
                 the coschedule to be evaluated. The model, which uses
                 per-thread cycle stacks computed through a previously
                 proposed cycle accounting architecture, is simple
                 enough to be used in system software. Probabilistic job
                 symbiosis modeling provides six key innovations over
                 prior work in symbiotic job scheduling: (i) it does not
                 require a sampling phase, (ii) it readjusts the job
                 coschedule continuously, (iii) it evaluates a large
                 number of possible coschedules at very low overhead,
                 (iv) it is not driven by heuristics, (v) it can
                 optimize a performance target of interest (e.g., system
                 throughput or job turnaround time), and (vi) it
                 preserves system-level priorities/shares. These
                 innovations make symbiotic job scheduling both
                 practical and effective. Our experimental evaluation,
                 which assumes a realistic scenario in which jobs come
                 and go, reports an average 16\% (and up to 35\%)
                 reduction in job turnaround time compared to the
                 previously proposed SOS (sample, optimize, symbios)
                 approach for a two-thread SMT processor, and an average
                 19\% (and up to 45\%) reduction in job turnaround time
                 for a four-thread SMT processor.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Farzan:2012:VPC,
  author =       "Azadeh Farzan and Zachary Kincaid",
  title =        "Verification of parameterized concurrent programs by
                 modular reasoning about data and control",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "1",
  pages =        "297--308",
  month =        jan,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2103621.2103693",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Mar 15 18:16:55 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In this paper, we consider the problem of verifying
                 thread-state properties of multithreaded programs in
                 which the number of active threads cannot be statically
                 bounded. Our approach is based on decomposing the task
                 into two modules, where one reasons about data and the
                 other reasons about control. The data module computes
                 thread-state invariants (e.g., linear constraints over
                 global variables and local variables of one thread)
                 using the thread interference information computed by
                 the control module. The control module computes a
                 representation of thread interference, as an
                 incrementally constructed data flow graph, using the
                 data invariants provided by the data module. These
                 invariants are used to rule out patterns of thread
                 interference that can not occur in a real program
                 execution. The two modules are incorporated into a
                 feedback loop, so that the abstractions of data and
                 interference are iteratively coarsened as the algorithm
                 progresses (that is, they become weaker) until a fixed
                 point is reached. Our approach is sound and
                 terminating, and applicable to programs with infinite
                 state (e.g., unbounded integers) and unboundedly many
                 threads. The verification method presented in this
                 paper has been implemented into a tool, called Duet. We
                 demonstrate the effectiveness of our technique by
                 verifying properties of a selection of Linux device
                 drivers using Duet, and also compare Duet with previous
                 work on verification of parameterized Boolean program
                 using the Boolean abstractions of these drivers.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "POPL '12 conference proceedings.",
}

@Article{Foltzer:2012:MSP,
  author =       "Adam Foltzer and Abhishek Kulkarni and Rebecca Swords
                 and Sajith Sasidharan and Eric Jiang and Ryan Newton",
  title =        "A meta-scheduler for the par-monad: composable
                 scheduling for the heterogeneous cloud",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "9",
  pages =        "235--246",
  month =        sep,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2398856.2364562",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Nov 15 16:40:19 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Modern parallel computing hardware demands
                 increasingly specialized attention to the details of
                 scheduling and load balancing across heterogeneous
                 execution resources that may include GPU and cloud
                 environments, in addition to traditional CPUs. Many
                 existing solutions address the challenges of particular
                 resources, but do so in isolation, and in general do
                 not compose within larger systems. We propose a
                 general, composable abstraction for execution
                 resources, along with a continuation-based
                 meta-scheduler that harnesses those resources in the
                 context of a deterministic parallel programming library
                 for Haskell. We demonstrate performance benefits of
                 combined CPU/GPU scheduling over either alone, and of
                 combined multithreaded/distributed scheduling over
                 existing distributed programming approaches for
                 Haskell.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ICFP '12 conference proceedings.",
}

@InProceedings{Garland:2012:DUP,
  author =       "Michael Garland and Manjunath Kudlur and Yili Zheng",
  title =        "Designing a unified programming model for
                 heterogeneous machines",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "67:1--67:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "http://conferences.computer.org/sc/2012/papers/1000a064.pdf",
  abstract =     "While high-efficiency machines are increasingly
                 embracing heterogeneous architectures and massive
                 multithreading, contemporary mainstream programming
                 languages reflect a mental model in which processing
                 elements are homogeneous, concurrency is limited, and
                 memory is a flat undifferentiated pool of storage.
                 Moreover, the current state of the art in programming
                 heterogeneous machines tends towards using separate
                 programming models, such as OpenMP and CUDA, for
                 different portions of the machine. Both of these
                 factors make programming emerging heterogeneous
                 machines unnecessarily difficult. We describe the
                 design of the Phalanx programming model, which seeks to
                 provide a unified programming model for heterogeneous
                 machines. It provides constructs for bulk parallelism,
                 synchronization, and data placement which operate
                 across the entire machine. Our prototype implementation
                 is able to launch and coordinate work on both CPU and
                 GPU processors within a single node, and by leveraging
                 the GASNet runtime, is able to run across all the nodes
                 of a distributed-memory machine.",
  acknowledgement = ack-nhfb,
  articleno =    "67",
}

@Article{Gebhart:2012:HTS,
  author =       "Mark Gebhart and Daniel R. Johnson and David Tarjan
                 and Stephen W. Keckler and William J. Dally and Erik
                 Lindholm and Kevin Skadron",
  title =        "A Hierarchical Thread Scheduler and Register File for
                 Energy-Efficient Throughput Processors",
  journal =      j-TOCS,
  volume =       "30",
  number =       "2",
  pages =        "8:1--8:??",
  month =        apr,
  year =         "2012",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2166879.2166882",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Apr 27 12:10:22 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Modern graphics processing units (GPUs) employ a large
                 number of hardware threads to hide both function unit
                 and memory access latency. Extreme multithreading
                 requires a complex thread scheduler as well as a large
                 register file, which is expensive to access both in
                 terms of energy and latency. We present two
                 complementary techniques for reducing energy on
                 massively-threaded processors such as GPUs. First, we
                 investigate a two-level thread scheduler that maintains
                 a small set of active threads to hide ALU and local
                 memory access latency and a larger set of pending
                 threads to hide main memory latency. Reducing the
                 number of threads that the scheduler must consider each
                 cycle improves the scheduler's energy efficiency.
                 Second, we propose replacing the monolithic register
                 file found on modern designs with a hierarchical
                 register file. We explore various trade-offs for the
                 hierarchy including the number of levels in the
                 hierarchy and the number of entries at each level. We
                 consider both a hardware-managed caching scheme and a
                 software-managed scheme, where the compiler is
                 responsible for orchestrating all data movement within
                 the register file hierarchy. Combined with a
                 hierarchical register file, our two-level thread
                 scheduler provides a further reduction in energy by
                 only allocating entries in the upper levels of the
                 register file hierarchy for active threads. Averaging
                 across a variety of real world graphics and compute
                 workloads, the active thread count can be reduced by a
                 factor of 4 with minimal impact on performance and our
                 most efficient three-level software-managed register
                 file hierarchy reduces register file energy by 54\%.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Grebenshchikov:2012:SSV,
  author =       "Sergey Grebenshchikov and Nuno P. Lopes and Corneliu
                 Popeea and Andrey Rybalchenko",
  title =        "Synthesizing software verifiers from proof rules",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "6",
  pages =        "405--416",
  month =        jun,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2345156.2254112",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 6 16:31:49 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "PLDI '12 proceedings.",
  abstract =     "Automatically generated tools can significantly
                 improve programmer productivity. For example, parsers
                 and dataflow analyzers can be automatically generated
                 from declarative specifications in the form of
                 grammars, which tremendously simplifies the task of
                 implementing a compiler. In this paper, we present a
                 method for the automatic synthesis of software
                 verification tools. Our synthesis procedure takes as
                 input a description of the employed proof rule, e.g.,
                 program safety checking via inductive invariants, and
                 produces a tool that automatically discovers the
                 auxiliary assertions required by the proof rule, e.g.,
                 inductive loop invariants and procedure summaries. We
                 rely on a (standard) representation of proof rules
                 using recursive equations over the auxiliary
                 assertions. The discovery of auxiliary assertions,
                 i.e., solving the equations, is based on an iterative
                 process that extrapolates solutions obtained for
                 finitary unrollings of equations. We show how our
                 method synthesizes automatic safety and liveness
                 verifiers for programs with procedures, multi-threaded
                 programs, and functional programs. Our experimental
                 comparison of the resulting verifiers with existing
                 state-of-the-art verification tools confirms the
                 practicality of the approach.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Halappanavar:2012:AWM,
  author =       "Mahantesh Halappanavar and John Feo and Oreste Villa
                 and Antonino Tumeo and Alex Pothen",
  title =        "Approximate weighted matching on emerging manycore and
                 multithreaded architectures",
  journal =      j-IJHPCA,
  volume =       "26",
  number =       "4",
  pages =        "413--430",
  month =        nov,
  year =         "2012",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342012452893",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Thu Nov 8 11:31:16 MST 2012",
  bibsource =    "http://hpc.sagepub.com/content/26/4.toc;
                 https://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://hpc.sagepub.com/content/26/4/413.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
  onlinedate =   "August 9, 2012",
}

@Article{Hayden:2012:KEG,
  author =       "Christopher M. Hayden and Edward K. Smith and Michail
                 Denchev and Michael Hicks and Jeffrey S. Foster",
  title =        "{Kitsune}: efficient, general-purpose dynamic software
                 updating for {C}",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "10",
  pages =        "249--264",
  month =        oct,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2398857.2384635",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Nov 15 16:40:23 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Dynamic software updating (DSU) systems allow programs
                 to be updated while running, thereby permitting
                 developers to add features and fix bugs without
                 downtime. This paper introduces Kitsune, a new DSU
                 system for C whose design has three notable features.
                 First, Kitsune's updating mechanism updates the whole
                 program, not individual functions. This mechanism is
                 more flexible than most prior approaches and places no
                 restrictions on data representations or allowed
                 compiler optimizations. Second, Kitsune makes the
                 important aspects of updating explicit in the program
                 text, making the program's semantics easy to understand
                 while minimizing programmer effort. Finally, the
                 programmer can write simple specifications to direct
                 Kitsune to generate code that traverses and transforms
                 old-version state for use by new code; such state
                 transformation is often necessary, and is significantly
                 more difficult in prior DSU systems. We have used
                 Kitsune to update five popular, open-source, single-
                 and multi-threaded programs, and find that few program
                 changes are required to use Kitsune, and that it incurs
                 essentially no performance overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "OOPSLA '12 conference proceedings.",
}

@Article{Huang:2012:EPS,
  author =       "Jeff Huang and Charles Zhang",
  title =        "Execution privatization for scheduler-oblivious
                 concurrent programs",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "10",
  pages =        "737--752",
  month =        oct,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2398857.2384670",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Nov 15 16:40:23 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Making multithreaded execution less non-deterministic
                 is a promising solution to address the difficulty of
                 concurrent programming plagued by the non-deterministic
                 thread scheduling. In fact, a vast category of
                 concurrent programs are scheduler-oblivious: their
                 execution is deterministic, regardless of the
                 scheduling behavior. We present and formally prove a
                 fundamental observation of the privatizability property
                 for scheduler-oblivious programs, that paves the
                 theoretical foundation for privatizing shared data
                 accesses on a path segment. With privatization, the
                 non-deterministic thread interleavings on the
                 privatized accesses are isolated and as the consequence
                 many concurrency problems are alleviated. We further
                 present a path and context sensitive privatization
                 algorithm that safely privatizes the program without
                 introducing any additional program behavior. Our
                 evaluation results show that the privatization
                 opportunity pervasively exists in real world large
                 complex concurrent systems. Through privatization,
                 several real concurrency bugs are fixed and notable
                 performance improvements are also achieved on
                 benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "OOPSLA '12 conference proceedings.",
}

@Article{Joao:2012:BIS,
  author =       "Jos{\'e} A. Joao and M. Aater Suleman and Onur Mutlu
                 and Yale N. Patt",
  title =        "Bottleneck identification and scheduling in
                 multithreaded applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "223--234",
  month =        mar,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2189750.2151001",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "Performance of multithreaded applications is limited
                 by a variety of bottlenecks, e.g. critical sections,
                 barriers and slow pipeline stages. These bottlenecks
                 serialize execution, waste valuable execution cycles,
                 and limit scalability of applications. This paper
                 proposes Bottleneck Identification and Scheduling in
                 Multithreaded Applications (BIS), a cooperative
                 software-hardware mechanism to identify and accelerate
                 the most critical bottlenecks. BIS identifies which
                 bottlenecks are likely to reduce performance by
                 measuring the number of cycles threads have to wait for
                 each bottleneck, and accelerates those bottlenecks
                 using one or more fast cores on an Asymmetric Chip
                 Multi-Processor (ACMP). Unlike previous work that
                 targets specific bottlenecks, BIS can identify and
                 accelerate bottlenecks regardless of their type. We
                 compare BIS to four previous approaches and show that
                 it outperforms the best of them by 15\% on average.
                 BIS' performance improvement increases as the number of
                 cores and the number of fast cores in the system
                 increase.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Joisha:2012:TTE,
  author =       "Pramod G. Joisha and Robert S. Schreiber and
                 Prithviraj Banerjee and Hans-J. Boehm and Dhruva R.
                 Chakrabarti",
  title =        "On a Technique for Transparently Empowering Classical
                 Compiler Optimizations on Multithreaded Code",
  journal =      j-TOPLAS,
  volume =       "34",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jun,
  year =         "2012",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/2220365.2220368",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Fri Jun 29 17:33:40 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toplas.bib",
  abstract =     "A large body of data-flow analyses exists for
                 analyzing and optimizing sequential code.
                 Unfortunately, much of it cannot be directly applied on
                 parallel code, for reasons of correctness. This article
                 presents a technique to automatically, aggressively,
                 yet safely apply sequentially-sound data-flow
                 transformations, without change, on shared-memory
                 programs. The technique is founded on the notion of
                 program references being ``siloed'' on certain
                 control-flow paths. Intuitively, siloed references are
                 free of interference from other threads within the
                 confines of such paths. Data-flow transformations can,
                 in general, be unblocked on siloed references. The
                 solution has been implemented in a widely used
                 compiler. Results on benchmarks from SPLASH-2 show that
                 performance improvements of up to 41\% are possible,
                 with an average improvement of 6\% across all the
                 tested programs over all thread counts.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

@Article{Kambadur:2012:HCA,
  author =       "Melanie Kambadur and Kui Tang and Martha A. Kim",
  title =        "{Harmony}: collection and analysis of parallel block
                 vectors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "452--463",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337211",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Efficient execution of well-parallelized applications
                 is central to performance in the multicore era. Program
                 analysis tools support the hardware and software sides
                 of this effort by exposing relevant features of
                 multithreaded applications. This paper describes
                 parallel block vectors, which uncover previously unseen
                 characteristics of parallel programs. Parallel block
                 vectors provide block execution profiles per
                 concurrency phase (e.g., the block execution profile of
                 all serial regions of a program). This information
                 provides a direct and fine-grained mapping between an
                 application's runtime parallel phases and the static
                 code that makes up those phases. This paper also
                 demonstrates how to collect parallel block vectors with
                 minimal application perturbation using Harmony. Harmony
                 is an instrumentation pass for the LLVM compiler that
                 introduces just 16-21\% overhead on average across
                 eight Parsec benchmarks. We apply parallel block
                 vectors to uncover several novel insights about
                 parallel applications with direct consequences for
                 architectural design. First, that the serial and
                 parallel phases of execution used in Amdahl's Law are
                 often composed of many of the same basic blocks.
                 Second, that program features, such as instruction mix,
                 vary based on the degree of parallelism, with serial
                 phases in particular displaying different instruction
                 mixes from the program as a whole. Third, that dynamic
                 execution frequencies do not necessarily correlate with
                 a block's parallelism.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Kawaguchi:2012:DPL,
  author =       "Ming Kawaguchi and Patrick Rondon and Alexander Bakst
                 and Ranjit Jhala",
  title =        "Deterministic parallelism via liquid effects",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "6",
  pages =        "45--54",
  month =        jun,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2345156.2254071",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 6 16:31:49 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "PLDI '12 proceedings.",
  abstract =     "Shared memory multithreading is a popular approach to
                 parallel programming, but also fiendishly hard to get
                 right. We present Liquid Effects, a type-and-effect
                 system based on refinement types which allows for
                 fine-grained, low-level, shared memory multi-threading
                 while statically guaranteeing that a program is
                 deterministic. Liquid Effects records the effect of an
                  expression as a formula in first-order logic, making
                 our type-and-effect system highly expressive. Further,
                 effects like Read and Write are recorded in Liquid
                 Effects as ordinary uninterpreted predicates, leaving
                 the effect system open to extension by the user. By
                 building our system as an extension to an existing
                 dependent refinement type system, our system gains
                 precise value- and branch-sensitive reasoning about
                 effects. Finally, our system exploits the Liquid Types
                 refinement type inference technique to automatically
                 infer refinement types and effects. We have implemented
                 our type-and-effect checking techniques in CSOLVE, a
                 refinement type inference system for C programs. We
                 demonstrate how CSOLVE uses Liquid Effects to prove the
                 determinism of a variety of benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Keckler:2012:MMC,
  author =       "Stephen W. Keckler and Steven K. Reinhardt",
  title =        "Massively Multithreaded Computing Systems",
  journal =      j-COMPUTER,
  volume =       "45",
  number =       "8",
  pages =        "24--25",
  month =        aug,
  year =         "2012",
  CODEN =        "CPTRB4",
  DOI =          "https://doi.org/10.1109/MC.2012.270",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Wed Aug 29 16:38:07 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/computer2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
}

@InProceedings{Khan:2012:MAN,
  author =       "Arif M. Khan and David F. Gleich and Alex Pothen and
                 Mahantesh Halappanavar",
  title =        "A multithreaded algorithm for network alignment via
                 approximate matching",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "64:1--64:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "http://conferences.computer.org/sc/2012/papers/1000a054.pdf",
  abstract =     "Network alignment is an optimization problem to find
                 the best one-to-one map between the vertices of a pair
                 of graphs that overlaps as many edges as possible. It
                 is a relaxation of the graph isomorphism problem and is
                 closely related to the subgraph isomorphism problem.
                 The best current approaches are entirely heuristic and
                 iterative in nature. They generate real-valued
                 heuristic weights that must be rounded to find integer
                 solutions. This rounding requires solving a bipartite
                 maximum weight matching problem at each iteration in
                 order to avoid missing high quality solutions. We
                 investigate substituting a parallel, half-approximation
                 for maximum weight matching instead of an exact
                 computation. Our experiments show that the resulting
                 difference in solution quality is negligible. We
                 demonstrate almost a 20-fold speedup using 40 threads
                 on an 8 processor Intel Xeon E7-8870 system and now
                 solve real-world problems in 36 seconds instead of 10
                 minutes.",
  acknowledgement = ack-nhfb,
  articleno =    "64",
}

@Article{Khyzha:2012:AP,
  author =       "Artem Khyzha and Pavel Par{\'\i}zek and Corina S.
                 P{\u{a}}s{\u{a}}reanu",
  title =        "Abstract pathfinder",
  journal =      j-SIGSOFT,
  volume =       "37",
  number =       "6",
  pages =        "1--5",
  month =        nov,
  year =         "2012",
  CODEN =        "SFENDP",
  DOI =          "https://doi.org/10.1145/2382756.2382794",
  ISSN =         "0163-5948 (print), 1943-5843 (electronic)",
  ISSN-L =       "0163-5948",
  bibdate =      "Wed Aug 1 17:16:18 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigsoft2010.bib",
  abstract =     "We present Abstract Pathfinder, an extension to the
                 Java Pathfinder (JPF) verification tool-set that
                 supports data abstraction to reduce the large data
                 domains of a Java program to small, finite abstract
                 domains, making the program more amenable to
                 verification. We use data abstraction to compute an
                 over-approximation of the original program in such a
                 way that if a (safety) property is true in the
                 abstracted program the property is also true in the
                 original program. Our approach enhances JPF with an
                 abstract interpreter and abstract state-matching
                 mechanisms, together with a library of abstractions
                 from which the user can pick which abstractions to use
                 for a particular application. We discuss the details of
                 our implementation together with some preliminary
                 experiments with analyzing multi-threaded Java
                 programs, where Abstract Pathfinder achieves
                 significant time and memory savings as compared with
                 plain JPF.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGSOFT Software Engineering Notes",
  journal-URL =  "https://dl.acm.org/citation.cfm?id=J728",
}

@Article{Kyle:2012:EPI,
  author =       "Stephen Kyle and Igor B{\"o}hm and Bj{\"o}rn Franke
                 and Hugh Leather and Nigel Topham",
  title =        "Efficiently parallelizing instruction set simulation
                 of embedded multi-core processors using region-based
                 just-in-time dynamic binary translation",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "5",
  pages =        "21--30",
  month =        may,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2345141.2248422",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 6 16:31:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "LCTES '12 proceedings.",
  abstract =     "Embedded systems, as typified by modern mobile phones,
                 are already seeing a drive toward using multi-core
                 processors. The number of cores will likely increase
                 rapidly in the future. Engineers and researchers need
                 to be able to simulate systems, as they are expected to
                 be in a few generations time, running simulations of
                 many-core devices on today's multi-core machines. These
                 requirements place heavy demands on the scalability of
                 simulation engines, the fastest of which have typically
                 evolved from just-in-time (Jit) dynamic binary
                 translators (Dbt). Existing work aimed at parallelizing
                 Dbt simulators has focused exclusively on trace-based
                 Dbt, wherein linear execution traces or perhaps trees
                 thereof are the units of translation. Region-based Dbt
                 simulators have not received the same attention and
                 require different techniques than their trace-based
                 cousins. In this paper we develop an innovative
                 approach to scaling multi-core, embedded simulation
                 through region-based Dbt. We initially modify the Jit
                 code generator of such a simulator to emit code that
                 does not depend on a particular thread with its
                 thread-specific context and is, therefore,
                 thread-agnostic. We then demonstrate that this
                 thread-agnostic code generation is comparable to
                 thread-specific code with respect to performance, but
                 also enables the sharing of JIT-compiled regions
                 between different threads. This sharing optimisation,
                 in turn, leads to significant performance improvements
                 for multi-threaded applications. In fact, our results
                 confirm that an average of 76\% of all JIT-compiled
                 regions can be shared between 128 threads in
                 representative, parallel workloads. We demonstrate that
                 this translates into an overall performance improvement
                 by 1.44x on average and up to 2.40x across 12
                 multi-threaded benchmarks taken from the Splash-2
                 benchmark suite, targeting our high-performance
                 multi-core Dbt simulator for embedded Arc processors
                 running on a 4-core Intel host machine.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Lakshminarayana:2012:DSP,
  author =       "Nagesh B. Lakshminarayana and Jaekyu Lee and Hyesoon
                 Kim and Jinwoo Shin",
  title =        "{DRAM} Scheduling Policy for {GPGPU} Architectures
                 Based on a Potential Function",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "33--36",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.32",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "GPGPU architectures (applications) have several
                 different characteristics compared to traditional CPU
                 architectures (applications): highly multithreaded
                 architectures and SIMD-execution behavior are the two
                 important characteristics of GPGPU computing. In this
                 paper, we propose a potential function that models the
                 DRAM behavior in GPGPU architectures and a DRAM
                 scheduling policy, alpha-SJF policy to minimize the
                 potential function. The scheduling policy essentially
                 chooses between SJF and FR-FCFS at run-time based on
                 the number of requests from each thread and whether the
                 thread has a row buffer hit.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lakshminarayana, NB (Reprint Author), Georgia Inst
                 Technol, Sch Comp Sci, Atlanta, GA 30332 USA.
                 Lakshminarayana, Nagesh B.; Lee, Jaekyu; Kim, Hyesoon;
                 Shin, Jinwoo, Georgia Inst Technol, Sch Comp Sci,
                 Atlanta, GA 30332 USA.",
  author-email = "nageshbl@cc.gatech.edu jaekyu.lee@cc.gatech.edu
                 hyesoon.kim@cc.gatech.edu jshin72@cc.gatech.edu",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "DRAM scheduling; GPGPU; Potential function",
  number-of-cited-references = "5",
  research-areas = "Computer Science",
  researcherid-numbers = "Shin, Jinwoo/M-5389-2013",
  times-cited =  "7",
  unique-id =    "Lakshminarayana:2012:DSP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Leiserson:2012:DPR,
  author =       "Charles E. Leiserson and Tao B. Schardl and Jim
                 Sukha",
  title =        "Deterministic parallel random-number generation for
                 dynamic-multithreading platforms",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "8",
  pages =        "193--204",
  month =        aug,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2370036.2145841",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Sep 12 12:11:57 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "PPOPP '12 conference proceedings.",
  abstract =     "Existing concurrency platforms for dynamic
                 multithreading do not provide repeatable parallel
                 random-number generators. This paper proposes that a
                 mechanism called pedigrees be built into the runtime
                 system to enable efficient deterministic parallel
                 random-number generation. Experiments with the
                 open-source MIT Cilk runtime system show that the
                 overhead for maintaining pedigrees is negligible.
                 Specifically, on a suite of 10 benchmarks, the relative
                 overhead of Cilk with pedigrees to the original Cilk
                 has a geometric mean of less than 1\%. We persuaded
                 Intel to modify its commercial C/C++ compiler, which
                 provides the Cilk Plus concurrency platform, to include
                 pedigrees, and we built a library implementation of a
                 deterministic parallel random-number generator called
                 DotMix that compresses the pedigree and then
                 ``RC6-mixes'' the result. The statistical quality of
                 DotMix is comparable to that of the popular Mersenne
                 twister, but somewhat slower than a nondeterministic
                 parallel version of this efficient and high-quality
                 serial random-number generator. The cost of calling
                 DotMix depends on the ``spawn depth'' of the
                 invocation. For a naive Fibonacci calculation with n=40
                 that calls DotMix in every node of the computation,
                 this ``price of determinism'' is a factor of 2.65 in
                 running time, but for more realistic applications with
                 less intense use of random numbers --- such as a
                 maximal-independent-set algorithm, a practical
                 samplesort program, and a Monte Carlo discrete-hedging
                 application from QuantLib --- the observed ``price''
                 was less than 5\%. Moreover, even if overheads were
                 several times greater, applications using DotMix should
                 be amply fast for debugging purposes, which is a major
                 reason for desiring repeatability.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Li:2012:MRP,
  author =       "Xin Li and Reinhard von Hanxleden",
  title =        "Multithreaded Reactive Programming --- the {Kiel
                 Esterel} Processor",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "61",
  number =       "3",
  pages =        "337--349",
  month =        mar,
  year =         "2012",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2010.246",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Fri Feb 3 07:35:03 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Ling:2012:HPP,
  author =       "Cheng Ling and Khaled Benkrid and Tsuyoshi Hamada",
  title =        "High performance phylogenetic analysis on
                 {CUDA}-compatible {GPUs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "5",
  pages =        "52--57",
  month =        dec,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2460216.2460226",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sun May 5 09:49:56 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "HEART '12 conference proceedings.",
  abstract =     "The operation of phylogenetic analysis aims to
                 investigate the evolution and relationships among
                 species. It is widely used in the fields of system
                 biology and comparative genomics. However, phylogenetic
                 analysis is also a computationally intensive operation
                 as the number of tree topology grows in a factorial way
                 with the number of species involved. Therefore, due to
                 the large number of species in the real world, the
                 computational burden has largely thwarted phylogenetic
                 reconstruction. In this paper, we describe the detailed
                 GPU-based multi-threaded design and implementation of a
                 Markov Chain Monte Carlo (MCMC) maximum likelihood
                 algorithm for phylogenetic analysis on a set of aligned
                 nucleotide sequences. The implementation is based on
                 the framework of the most widely used phylogenetic
                 analysis tool, namely MrBayes. The proposed approach
                 resulted in 6x-8x speed-up on an NVidia Geforce 460 GTX
                 GPU compared to an optimized GPP-based software
                 implementation running on a desktop computer with a
                 single Intel Xeon 2.53 GHz CPU and 6.0 GB RAM.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Liu:2012:FPA,
  author =       "Gu Liu and Hong An and Wenting Han and Xiaoqiang Li
                 and Tao Sun and Wei Zhou and Xuechao Wei and Xulong
                 Tang",
  title =        "{FlexBFS}: a parallelism-aware implementation of
                 breadth-first search on {GPU}",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "8",
  pages =        "279--280",
  month =        aug,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2370036.2145853",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Sep 12 12:11:57 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPOPP '12 conference proceedings.",
  abstract =     "In this paper, we present FlexBFS, a parallelism-aware
                 implementation for breadth-first search on GPU. Our
                 implementation can adjust the computation resources
                 according to the feedback of available parallelism
                 dynamically. We also optimized our program in three
                  ways: (1) a simplified two-level queue management, (2) a
                  combined kernel strategy and (3) a high-degree vertices
                 specialization approach. Our experimental results show
                  that it can achieve 3--20 times speedup against the
                 fastest serial version, and can outperform the TBB
                 based multi-threading CPU version and the previous most
                 effective GPU version on all types of input graphs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Mars:2012:BDS,
  author =       "Jason Mars and Naveen Kumar",
  title =        "{BlockChop}: dynamic squash elimination for hybrid
                 processor architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "536--547",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337221",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Hybrid processors are HW/SW co-designed processors
                 that leverage blocked-execution, the execution of
                 regions of instructions as atomic blocks, to facilitate
                 aggressive speculative optimization. As we move to a
                 multicore hybrid design, fine grained conflicts for
                 shared data can violate the atomicity requirement of
                 these blocks and lead to expensive squashes and
                 rollbacks. However, as these atomic regions differ from
                 those used in checkpointing and transactional memory
                 systems, the extent of this potentially prohibitive
                 problem remains unclear, and mechanisms to mitigate
                 these squashes dynamically may be critical to enable a
                  highly performant multicore hybrid design. In this
                 work, we investigate how multithreaded applications,
                 both benchmark and commercial workloads, are affected
                 by squashes, and present dynamic mechanisms for
                 mitigating these squashes in hybrid processors. While
                 the current wisdom is that there is not a significant
                 number of squashes for smaller atomic regions, we
                 observe this is not the case for many multithreaded
                 workloads. With region sizes of just 200--500
                 instructions, we observe a performance degradation
                 ranging from 10\% to more than 50\% for workloads with
                 a mixture of shared reads and writes. By harnessing the
                 unique flexibility provided by the software subsystem
                 of hybrid processor design, we present BlockChop, a
                 framework for dynamically mitigating squashes on
                 multicore hybrid processors. We present a range of
                 squash handling mechanisms leveraging retrials,
                 interpretation, and retranslation, and find that
                 BlockChop is quite effective. Over the current response
                 to exceptions and squashes in a hybrid design, we are
                 able to improve the performance of benchmark and
                 commercial workloads by 1.4x and 1.2x on average for
                 large and small region sizes respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Miller:2012:VCE,
  author =       "Timothy N. Miller and Renji Thomas and Xiang Pan and
                 Radu Teodorescu",
  title =        "{VRSync}: characterizing and eliminating
                 synchronization-induced voltage emergencies in
                 many-core processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "249--260",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337188",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Power consumption is a primary concern for
                 microprocessor designers. Lowering the supply voltage
                 of processors is one of the most effective techniques
                 for improving their energy efficiency. Unfortunately,
                 low-voltage operation faces multiple challenges going
                 forward. One such challenge is increased sensitivity to
                 voltage fluctuations, which can trigger so-called
                 ``voltage emergencies'' that can lead to errors. These
                 fluctuations are caused by abrupt changes in power
                 demand, triggered by processor activity variation as a
                 function of workload. This paper examines the effects
                 of voltage fluctuations on future many-core processors.
                 With the increase in the number of cores in a chip, the
                 effects of chip-wide activity fluctuation --- such as
                 that caused by global synchronization in multithreaded
                 applications --- overshadow the effects of core-level
                 workload variability. Starting from this observation,
                 we developed VRSync, a novel synchronization
                 methodology that uses emergency-aware scheduling
                 policies that reduce the slope of load fluctuations,
                 eliminating emergencies. We show that VRSync is very
                 effective at eliminating emergencies, allowing voltage
                 guardbands to be significantly lowered, which reduces
                 energy consumption by an average of 33\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Nagarakatte:2012:MAP,
  author =       "Santosh Nagarakatte and Sebastian Burckhardt and Milo
                 M. K. Martin and Madanlal Musuvathi",
  title =        "Multicore acceleration of priority-based schedulers
                 for concurrency bug detection",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "6",
  pages =        "543--554",
  month =        jun,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2345156.2254128",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 6 16:31:49 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "PLDI '12 proceedings.",
  abstract =     "Testing multithreaded programs is difficult as threads
                 can interleave in a nondeterministic fashion. Untested
                 interleavings can cause failures, but testing all
                 interleavings is infeasible. Many interleaving
                 exploration strategies for bug detection have been
                 proposed, but their relative effectiveness and
                 performance remains unclear as they often lack publicly
                 available implementations and have not been evaluated
                 using common benchmarks. We describe NeedlePoint, an
                 open-source framework that allows selection and
                 comparison of a wide range of interleaving exploration
                 policies for bug detection proposed by prior work. Our
                 experience with NeedlePoint indicates that
                 priority-based probabilistic concurrency testing (the
                 PCT algorithm) finds bugs quickly, but it runs only one
                 thread at a time, which destroys parallelism by
                 serializing executions. To address this problem we
                 propose a parallel version of the PCT algorithm~(PPCT).
                 We show that the new algorithm outperforms the original
                 by a factor of 5x when testing parallel programs on an
                 eight-core machine. We formally prove that parallel PCT
                 provides the same probabilistic coverage guarantees as
                 PCT. Moreover, PPCT is the first algorithm that runs
                 multiple threads while providing coverage guarantees.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Nagpal:2012:CGE,
  author =       "Rahul Nagpal and Anasua Bhowmik",
  title =        "Criticality guided energy aware speculation for
                 speculative multithreaded processors",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "38",
  number =       "6--7",
  pages =        "329--341",
  month =        jun # "\slash " # jul,
  year =         "2012",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2012.03.002",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Sun May 20 09:14:24 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 http://www.sciencedirect.com/science/journal/01678191",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819112000191",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Oh:2012:MTS,
  author =       "Doohwan Oh and Won W. Ro",
  title =        "Multi-Threading and Suffix Grouping on Massive
                 Multiple Pattern Matching Algorithm",
  journal =      j-COMP-J,
  volume =       "55",
  number =       "11",
  pages =        "1331--1346",
  month =        nov,
  year =         "2012",
  CODEN =        "CMPJA6",
  DOI =          "https://doi.org/10.1093/comjnl/bxs002",
  ISSN =         "0010-4620 (print), 1460-2067 (electronic)",
  ISSN-L =       "0010-4620",
  bibdate =      "Thu Nov 1 11:25:36 MDT 2012",
  bibsource =    "http://comjnl.oxfordjournals.org/content/55/11.toc;
                 https://www.math.utah.edu/pub/tex/bib/compj2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://comjnl.oxfordjournals.org/content/55/11/1331.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "The Computer Journal",
  journal-URL =  "http://comjnl.oxfordjournals.org/",
  onlinedate =   "February 2, 2012",
}

@InProceedings{Olivier:2012:CMW,
  author =       "Stephen L. Olivier and Bronis R. de Supinski and
                 Martin Schulz and Jan F. Prins",
  title =        "Characterizing and mitigating work time inflation in
                 task parallel programs",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "65:1--65:12",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "http://conferences.computer.org/sc/2012/papers/1000a066.pdf",
  abstract =     "Task parallelism raises the level of abstraction in
                 shared memory parallel programming to simplify the
                 development of complex applications. However, task
                 parallel applications can exhibit poor performance due
                 to thread idleness, scheduling overheads, and work time
                 inflation --- additional time spent by threads in a
                 multithreaded computation beyond the time required to
                 perform the same work in a sequential computation. We
                 identify the contributions of each factor to lost
                 efficiency in various task parallel OpenMP applications
                 and diagnose the causes of work time inflation in those
                 applications. Increased data access latency can cause
                 significant work time inflation in NUMA systems. Our
                 locality framework for task parallel OpenMP programs
                 mitigates this cause of work time inflation. Our
                 extensions to the Qthreads library demonstrate that
                 locality-aware scheduling can improve performance up to
                 3X compared to the Intel OpenMP task scheduler.",
  acknowledgement = ack-nhfb,
  articleno =    "65",
}

@InProceedings{Preissl:2012:CSS,
  author =       "Robert Preissl and Theodore M. Wong and Pallab Datta
                 and Myron Flickner and Raghavendra Singh and Steven K.
                 Esser and William P. Risk and Horst D. Simon and
                 Dharmendra S. Modha",
  title =        "{Compass}: a scalable simulator for an architecture
                 for cognitive computing",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "54:1--54:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "http://conferences.computer.org/sc/2012/papers/1000a085.pdf",
  abstract =     "Inspired by the function, power, and volume of the
                 organic brain, we are developing TrueNorth, a novel
                 modular, non-von Neumann, ultra-low power, compact
                 architecture. TrueNorth consists of a scalable network
                 of neurosynaptic cores, with each core containing
                 neurons, dendrites, synapses, and axons. To set sail
                 for TrueNorth, we developed Compass, a multi-threaded,
                 massively parallel functional simulator and a parallel
                 compiler that maps a network of long-distance pathways
                 in the macaque monkey brain to TrueNorth. We
                 demonstrate near-perfect weak scaling on a 16 rack
                 IBM\reg{} Blue Gene\reg{}/Q (262144 CPUs, 256 TB
                 memory), achieving an unprecedented scale of 256
                 million neurosynaptic cores containing 65 billion
                 neurons and 16 trillion synapses running only 388X
                 slower than real time with an average spiking rate of
                 8.1 Hz. By using emerging PGAS communication
                 primitives, we also demonstrate 2X better real-time
                 performance over MPI primitives on a 4 rack Blue Gene/P
                 (16384 CPUs, 16 TB memory).",
  acknowledgement = ack-nhfb,
  articleno =    "54",
}

@Article{Pusukuri:2012:TTD,
  author =       "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N.
                 Bhuyan",
  title =        "Thread Tranquilizer: Dynamically reducing performance
                 variation",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "46:1--46:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086725",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "To realize the performance potential of multicore
                 systems, we must effectively manage the interactions
                 between memory reference behavior and the operating
                 system policies for thread scheduling and migration
                 decisions. We observe that these interactions lead to
                 significant variations in the performance of a given
                 application, from one execution to the next, even when
                 the program input remains unchanged and no other
                 applications are being run on the system. Our
                 experiments with multithreaded programs, including the
                 TATP database application, SPECjbb2005, and a subset of
                 PARSEC and SPEC OMP programs, on a 24-core Dell
                 PowerEdge R905 server running OpenSolaris confirm the
                 above observation.",
  acknowledgement = ack-nhfb,
  articleno =    "46",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Quintana-Orti:2012:RSP,
  author =       "Gregorio Quintana-Ort{\'\i} and Francisco D. Igual and
                 Mercedes Marqu{\'e}s and Enrique S. Quintana-Ort{\'\i}
                 and Robert A. van de Geijn",
  title =        "A Runtime System for Programming Out-of-Core Matrix
                 Algorithms-by-Tiles on Multithreaded Architectures",
  journal =      j-TOMS,
  volume =       "38",
  number =       "4",
  pages =        "25:1--25:25",
  month =        aug,
  year =         "2012",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/2331130.2331133",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Thu Aug 30 18:55:10 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toms.bib",
  abstract =     "Out-of-core implementations of algorithms for dense
                 matrix computations have traditionally focused on
                 optimal use of memory so as to minimize I/O, often
                 trading programmability for performance. In this
                 article we show how the current state of hardware and
                 software allows the programmability problem to be
                 addressed without sacrificing performance. This comes
                 from the realizations that memory is cheap and large,
                 making it less necessary to optimally orchestrate I/O,
                 and that new algorithms view matrices as collections of
                 submatrices and computation as operations with those
                 submatrices. This enables libraries to be coded at a
                 high level of abstraction, leaving the tasks of
                 scheduling the computations and data movement in the
                 hands of a runtime system. This is in sharp contrast to
                 more traditional approaches that leverage optimal use
                 of in-core memory and, at the expense of introducing
                 considerable programming complexity, explicit overlap
                 of I/O with computation. Performance is demonstrated
                 for this approach on multicore architectures as well as
                 platforms equipped with hardware accelerators.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Radojkovic:2012:EIS,
  author =       "Petar Radojkovi{\'c} and Sylvain Girbal and Arnaud
                 Grasset and Eduardo Qui{\~n}ones and Sami Yehia and
                 Francisco J. Cazorla",
  title =        "On the evaluation of the impact of shared resources in
                 multithreaded {COTS} processors in time-critical
                 environments",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "34:1--34:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086696.2086713",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Commercial Off-The-Shelf (COTS) processors are now
                 commonly used in real-time embedded systems. The
                 characteristics of these processors fulfill system
                 requirements in terms of time-to-market, low cost, and
                 high performance-per-watt ratio. However, multithreaded
                 (MT) processors are still not widely used in real-time
                 systems because the timing analysis is too complex. In
                 MT processors, simultaneously-running tasks share and
                 compete for processor resources, so the timing analysis
                 has to estimate the possible impact that the inter-task
                 interferences have on the execution time of the
                 applications. In this paper, we propose a method that
                 quantifies the slowdown that simultaneously-running
                 tasks may experience due to collision in shared
                 processor resources.",
  acknowledgement = ack-nhfb,
  articleno =    "34",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Radojkovic:2012:OTA,
  author =       "Petar Radojkovi{\'c} and Vladimir Cakarevi{\'c} and
                 Miquel Moret{\'o} and Javier Verd{\'u} and Alex Pajuelo
                 and Francisco J. Cazorla and Mario Nemirovsky and Mateo
                 Valero",
  title =        "Optimal task assignment in multithreaded processors: a
                 statistical approach",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "235--248",
  month =        mar,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2189750.2151002",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "The introduction of massively multithreaded (MMT)
                 processors, comprised of a large number of cores with
                 many shared resources, has made task scheduling, in
                 particular task to hardware thread assignment, one of
                 the most promising ways to improve system performance.
                 However, finding an optimal task assignment for a
                 workload running on MMT processors is an NP-complete
                 problem. Due to the fact that the performance of the
                 best possible task assignment is unknown, the room for
                 improvement of current task-assignment algorithms
                 cannot be determined. This is a major problem for the
                 industry because it could lead to: (1)~A waste of
                 resources if excessive effort is devoted to improving a
                 task assignment algorithm that already provides a
                 performance that is close to the optimal one, or
                 (2)~significant performance loss if insufficient effort
                 is devoted to improving poorly-performing task
                 assignment algorithms. In this paper, we present a
                 method based on Extreme Value Theory that allows the
                 prediction of the performance of the optimal task
                 assignment in MMT processors. We further show that
                 executing a sample of several hundred or several
                 thousand random task assignments is enough to obtain,
                 with very high confidence, an assignment with a
                 performance that is close to the optimal one. We
                 validate our method with an industrial case study for a
                 set of multithreaded network applications running on an
                 UltraSPARC~T2 processor.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Reda:2012:APC,
  author =       "Sherief Reda and Ryan Cochran and Ayse K. Coskun",
  title =        "Adaptive Power Capping for Servers with Multithreaded
                 Workloads",
  journal =      j-IEEE-MICRO,
  volume =       "32",
  number =       "5",
  pages =        "64--75",
  month =        sep # "\slash " # oct,
  year =         "2012",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2012.59",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Thu Nov 15 05:59:33 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

@Article{Rivara:2012:MPL,
  author =       "Maria-Cecilia Rivara and Pedro Rodriguez and Rafael
                 Montenegro and Gaston Jorquera",
  title =        "Multithread parallelization of {Lepp}-bisection
                 algorithms",
  journal =      j-APPL-NUM-MATH,
  volume =       "62",
  number =       "4",
  pages =        "473--488",
  month =        apr,
  year =         "2012",
  CODEN =        "ANMAEL",
  DOI =          "https://doi.org/10.1016/j.apnum.2011.07.011",
  ISSN =         "0168-9274 (print), 1873-5460 (electronic)",
  ISSN-L =       "0168-9274",
  bibdate =      "Thu Mar 8 07:24:47 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/applnummath.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/01689274",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0168927411001292",
  acknowledgement = ack-nhfb,
  fjournal =     "Applied Numerical Mathematics",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01689274",
}

@Article{Sartor:2012:EMT,
  author =       "Jennifer B. Sartor and Lieven Eeckhout",
  title =        "Exploring multi-threaded {Java} application
                 performance on multicore hardware",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "10",
  pages =        "281--296",
  month =        oct,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2398857.2384638",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Nov 15 16:40:23 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "While there have been many studies of how to schedule
                 applications to take advantage of increasing numbers of
                 cores in modern-day multicore processors, few have
                 focused on multi-threaded managed language applications
                 which are prevalent from the embedded to the server
                 domain. Managed languages complicate performance
                 studies because they have additional virtual machine
                 threads that collect garbage and dynamically compile,
                 closely interacting with application threads. Further
                 complexity is introduced as modern multicore machines
                 have multiple sockets and dynamic frequency scaling
                 options, broadening opportunities to reduce both power
                 and running time. In this paper, we explore the
                 performance of Java applications, studying how best to
                 map application and virtual machine (JVM) threads to a
                 multicore, multi-socket environment. We explore both
                 the cost of separating JVM threads from application
                 threads, and the opportunity to speed up or slow down
                 the clock frequency of isolated threads. We perform
                 experiments with the multi-threaded DaCapo benchmarks
                 and pseudojbb2005 running on the Jikes Research Virtual
                 Machine, on a dual-socket, 8-core Intel Nehalem machine
                 to reveal several novel, and sometimes
                 counter-intuitive, findings. We believe these insights
                 are a first but important step towards understanding
                 and optimizing managed language performance on modern
                 hardware.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "OOPSLA '12 conference proceedings.",
}

@Article{Sharafeddine:2012:DOE,
  author =       "Mageda Sharafeddine and Komal Jothi and Haitham
                 Akkary",
  title =        "Disjoint out-of-order execution processor",
  journal =      j-TACO,
  volume =       "9",
  number =       "3",
  pages =        "19:1--19:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2355585.2355592",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 22 10:48:53 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "High-performance superscalar architectures used to
                 exploit instruction level parallelism in single-thread
                 applications have become too complex and power hungry
                 for the multicore processors era. We propose a new
                 architecture that uses multiple small latency-tolerant
                 out-of-order cores to improve single-thread
                 performance. Improving single-thread performance with
                 multiple small out-of-order cores allows designers to
                 place more of these cores on the same die.
                 Consequently, emerging highly parallel applications can
                 take full advantage of the multicore parallel hardware
                 without sacrificing performance of inherently serial
                 and hard to parallelize applications. Our architecture
                 combines speculative multithreading (SpMT) with
                 checkpoint recovery and continual flow pipeline
                 architectures. It splits single-thread program
                 execution into disjoint control and data threads that
                 execute concurrently on multiple cooperating small and
                 latency-tolerant out-of-order cores. Hence we call this
                 style of execution Disjoint Out-of-Order Execution
                 (DOE). DOE uses latency tolerance to overcome
                 performance issues of SpMT caused by interthread data
                 dependences. To evaluate this architecture, we have
                 developed a microarchitecture performance model of DOE
                 based on PTLSim, a simulation infrastructure of the x86
                 instruction set architecture. We evaluate the potential
                 performance of DOE processor architecture using a
                 simple heuristic to fork control independent threads in
                 hardware at the target addresses of future procedure
                 return instructions. Using applications from SpecInt
                 2000, we study DOE under ideal as well as realistic
                 architectural constraints. We discuss the performance
                 impact of key DOE architecture and application
                 variables such as number of cores, interthread data
                 dependences, intercore data communication delay,
                 buffers capacity, and branch mispredictions. Without
                 any DOE specific compiler optimizations, our results
                 show that DOE outperforms conventional SpMT
                 architectures by 15\%, on average. We also show that
                 DOE with four small cores can perform on average
                 equally well to a large superscalar core, consuming
                 about the same power. Most importantly, DOE improves
                 throughput performance by a significant amount over a
                 large superscalar core, up to 2.5 times, when running
                 multitasking applications.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Shirole:2012:TCU,
  author =       "Mahesh Shirole and Rajeev Kumar",
  title =        "Testing for concurrency in {UML} diagrams",
  journal =      j-SIGSOFT,
  volume =       "37",
  number =       "5",
  pages =        "1--8",
  month =        sep,
  year =         "2012",
  CODEN =        "SFENDP",
  DOI =          "https://doi.org/10.1145/2347696.2347712",
  ISSN =         "0163-5948 (print), 1943-5843 (electronic)",
  ISSN-L =       "0163-5948",
  bibdate =      "Wed Aug 1 17:16:16 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigsoft2010.bib",
  abstract =     "Concurrent programming is increasingly being used in
                 many applications with the advent of multi-cores. The
                 necessary support for execution of multi-threading is
                 getting richer. Notwithstanding, a concurrent program
                 may behave nondeterministically, it may result in
                 different outputs with the same input in different
                 runs. The aim of this study is to generate test
                 sequences for concurrency from unified modelling
                 language (UML) behavioral models such as sequence and
                 activity diagrams. Generating exhaustive test cases for
                 all concurrent interleaving sequences is exponential in
                 size. Therefore, it is necessary to find adequate test
                 cases in presence of concurrency to uncover errors due
                 to, e.g., data race, synchronization and deadlocks. In
                 order to generate adequate test cases a novel search
                 algorithm, which we call concurrent queue search (CQS)
                 is proposed. The CQS handles random nature of
                 concurrent tasks. To generate test scenarios, a
                 sequence diagram is converted into an activity diagram.
                 An activity diagram encapsulates sequential,
                 conditional, iterative and concurrent flows of the
                 control. By the experimental results, it was observed
                 that test sequences generated by CQS algorithm are
                 superior as compared to DFS and BFS search
                 algorithms.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGSOFT Software Engineering Notes",
  journal-URL =  "https://dl.acm.org/citation.cfm?id=J728",
}

@Article{Singh:2012:EES,
  author =       "Abhayendra Singh and Satish Narayanasamy and Daniel
                 Marino and Todd Millstein and Madanlal Musuvathi",
  title =        "End-to-end sequential consistency",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "524--535",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337220",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Sequential consistency (SC) is arguably the most
                 intuitive behavior for a shared-memory multithreaded
                 program. It is widely accepted that language-level SC
                 could significantly improve programmability of a
                 multiprocessor system. However, efficiently supporting
                 end-to-end SC remains a challenge as it requires that
                 both compiler and hardware optimizations preserve SC
                 semantics. While a recent study has shown that a
                 compiler can preserve SC semantics for a small
                 performance cost, an efficient and complexity-effective
                 SC hardware remains elusive. Past hardware solutions
                 relied on aggressive speculation techniques, which has
                 not yet been realized in a practical implementation.
                 This paper exploits the observation that hardware need
                 not enforce any memory model constraints on accesses to
                 thread-local and shared read-only locations. A
                 processor can easily determine a large fraction of
                 these safe accesses with assistance from static
                 compiler analysis and the hardware memory management
                 unit. We discuss a low-complexity hardware design that
                 exploits this information to reduce the overhead in
                 ensuring SC. Our design employs an additional unordered
                 store buffer for fast-tracking thread-local stores and
                 allowing later memory accesses to proceed without a
                 memory ordering related stall. Our experimental study
                 shows that the cost of guaranteeing end-to-end SC is
                 only 6.2\% on average when compared to a system with
                 TSO hardware executing a stock compiler's output.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Suito:2012:DRM,
  author =       "Kazutoshi Suito and Rikuhei Ueda and Kei Fujii and
                 Takuma Kogo and Hiroki Matsutani and Nobuyuki
                 Yamasaki",
  title =        "The Dependable Responsive Multithreaded Processor for
                 Distributed Real-Time Systems",
  journal =      j-IEEE-MICRO,
  volume =       "32",
  number =       "6",
  pages =        "52--61",
  month =        nov # "\slash " # dec,
  year =         "2012",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2012.88",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Thu Dec 13 15:52:22 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

@Article{Terechko:2012:BPS,
  author =       "Andrei Terechko and Jan Hoogerbrugge and Ghiath Alkadi
                 and Surendra Guntur and Anirban Lahiri and Marc
                 Duranton and Clemens W{\"u}st and Phillip Christie and
                 Axel Nackaerts and Aatish Kumar",
  title =        "Balancing Programmability and Silicon Efficiency of
                 Heterogeneous Multicore Architectures",
  journal =      j-TECS,
  volume =       "11S",
  number =       "1",
  pages =        "14:1--14:??",
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2180887.2180890",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Jun 7 16:18:52 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tecs.bib",
  abstract =     "Multicore architectures provide scalable performance
                 with a lower hardware design effort than single core
                 processors. Our article presents a design methodology
                 and an embedded multicore architecture, focusing on
                 reducing the software design complexity and boosting
                 the performance density. First, we analyze
                 characteristics of the Task-Level Parallelism in modern
                 multimedia workloads. These characteristics are used to
                 formulate requirements for the programming model. Then
                 we translate the programming model requirements to an
                 architecture specification, including a novel
                 low-complexity implementation of cache coherence and a
                 hardware synchronization unit. Our evaluation
                 demonstrates that the novel coherence mechanism
                 substantially simplifies hardware design, while
                 reducing the performance by less than 18\% relative to
                 a complex snooping technique. Compared to a single
                 processor core, the multicores have already proven to
                 be more area- and energy-efficient. However, the
                 multicore architectures in embedded systems still
                 compete with highly efficient function-specific
                 hardware accelerators. In this article we identify five
                 architectural methods to boost performance density of
                 multicores; microarchitectural downscaling, asymmetric
                 multicore architectures, multithreading, generic
                 accelerators, and conjoining. Then, we present a novel
                 methodology to explore multicore design spaces,
                 including the architectural methods improving the
                 performance density. The methodology is based on a
                 complex formula computing performances of heterogeneous
                 multicore systems. Using this design space exploration
                 methodology for HD and QuadHD H.264 video decoding, we
                 estimate that the required areas of multicores in CMOS
                 45 nm are 2.5 mm$^2$ and 8.6 mm$^2$, respectively.
                 These results suggest that heterogeneous multicores are
                 cost-effective for embedded applications and can
                 provide a good programmability support.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?&idx=J840",
}

@Article{Tumeo:2012:DNG,
  author =       "Antonino Tumeo and Simone Secchi and Oreste Villa",
  title =        "Designing Next-Generation Massively Multithreaded
                 Architectures for Irregular Applications",
  journal =      j-COMPUTER,
  volume =       "45",
  number =       "8",
  pages =        "53--61",
  month =        aug,
  year =         "2012",
  CODEN =        "CPTRB4",
  DOI =          "https://doi.org/10.1109/MC.2012.193",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Wed Aug 29 16:38:07 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/computer2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
}

@Article{Villa:2012:FAS,
  author =       "Oreste Villa and Antonino Tumeo and Simone Secchi and
                 Joseph B. Manzano",
  title =        "Fast and Accurate Simulation of the {Cray XMT}
                 Multithreaded Supercomputer",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "23",
  number =       "12",
  pages =        "2266--2279",
  month =        dec,
  year =         "2012",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2012.70",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Nov 15 06:27:40 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/super.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Vitali:2012:LSO,
  author =       "Roberto Vitali and Alessandro Pellegrini and Francesco
                 Quaglia",
  title =        "Load sharing for optimistic parallel simulations on
                 multi core machines",
  journal =      j-SIGMETRICS,
  volume =       "40",
  number =       "3",
  pages =        "2--11",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2425248.2425250",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Sun May 5 09:58:20 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigmetrics.bib",
  abstract =     "Parallel Discrete Event Simulation (PDES) is based on
                 the partitioning of the simulation model into distinct
                 Logical Processes (LPs), each one modeling a portion of
                 the entire system, which are allowed to execute
                 simulation events concurrently. This allows exploiting
                 parallel computing architectures to speedup model
                 execution, and to make very large models tractable. In
                 this article we cope with the optimistic approach to
                 PDES, where LPs are allowed to concurrently process
                 their events in a speculative fashion, and
                 rollback/recovery techniques are used to guarantee state
                 consistency in case of causality violations along the
                 speculative execution path. Particularly, we present an
                 innovative load sharing approach targeted at optimizing
                 resource usage for fruitful simulation work when
                 running an optimistic PDES environment on top of
                 multi-processor/multi-core machines. Beyond providing
                 the load sharing model, we also define a load sharing
                 oriented architectural scheme, based on a symmetric
                 multi-threaded organization of the simulation platform.
                 Finally, we present a real implementation of the load
                 sharing architecture within the open source ROme
                 OpTimistic Simulator (ROOT-Sim) package. Experimental
                 data for an assessment of both viability and
                 effectiveness of our proposal are presented as well.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
}

@Article{Volos:2012:ATM,
  author =       "Haris Volos and Andres Jaan Tack and Michael M. Swift
                 and Shan Lu",
  title =        "Applying transactional memory to concurrency bugs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "211--222",
  month =        mar,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2189750.2150999",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "Multithreaded programs often suffer from
                 synchronization bugs such as atomicity violations and
                 deadlocks. These bugs arise from complicated locking
                 strategies and ad hoc synchronization methods to avoid
                 the use of locks. A survey of the bug databases of
                 major open-source applications shows that concurrency
                 bugs often take multiple fix attempts, and that fixes
                 often introduce yet more concurrency bugs.
                 Transactional memory (TM) enables programmers to
                 declare regions of code atomic without specifying a
                 lock and has the potential to avoid these bugs. Where
                 most previous studies have focused on using TM to write
                 new programs from scratch, we consider its utility in
                 fixing existing programs with concurrency bugs. We
                 therefore investigate four methods of using TM on three
                 concurrent programs. Overall, we find that 29\% of the
                 bugs are not fixable by transactional memory, showing
                 that TM does not address many important types of
                 concurrency bugs. In particular, TM works poorly with
                 extremely long critical sections and with deadlocks
                 involving both condition variables and I/O. Conversely,
                 we find that for 56\% of the bugs, transactional memory
                 offers demonstrable value by simplifying the reasoning
                 behind a fix or the effort to implement a fix, and
                 using transactions in the first place would have
                 avoided 71\% of the bugs examined. We also find that ad
                 hoc synchronization put in place to avoid the overhead
                 of locking can be greatly simplified with TM, but
                 requires hardware support to perform well.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Wei:2012:OLL,
  author =       "Zheng Wei and Joseph JaJa",
  title =        "Optimization of Linked List Prefix Computations on
                 Multithreaded {GPUs} Using {CUDA}",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "22",
  number =       "4",
  pages =        "1250012",
  month =        dec,
  year =         "2012",
  CODEN =        "PPLTEE",
  DOI =          "https://doi.org/10.1142/S0129626412500120",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Sat Jun 22 15:54:17 MDT 2013",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Wu:2012:SPA,
  author =       "Jingyue Wu and Yang Tang and Gang Hu and Heming Cui
                 and Junfeng Yang",
  title =        "Sound and precise analysis of parallel programs
                 through schedule specialization",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "6",
  pages =        "205--216",
  month =        jun,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2345156.2254090",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 6 16:31:49 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "PLDI '12 proceedings.",
  abstract =     "Parallel programs are known to be difficult to
                 analyze. A key reason is that they typically have an
                 enormous number of execution interleavings, or
                 schedules. Static analysis over all schedules requires
                 over-approximations, resulting in poor precision;
                 dynamic analysis rarely covers more than a tiny
                 fraction of all schedules. We propose an approach
                 called schedule specialization to analyze a parallel
                 program over only a small set of schedules for
                 precision, and then enforce these schedules at runtime
                 for soundness of the static analysis results. We build
                 a schedule specialization framework for C/C++
                 multithreaded programs that use Pthreads. Our framework
                 avoids the need to modify every analysis to be
                 schedule-aware by specializing a program into a simpler
                 program based on a schedule, so that the resultant
                 program can be analyzed with stock analyses for
                 improved precision. Moreover, our framework provides a
                 precise schedule-aware def-use analysis on memory
                 locations, enabling us to build three highly precise
                 analyses: an alias analyzer, a data-race detector, and
                 a path slicer. Evaluation on 17 programs, including 2
                 real-world programs and 15 popular benchmarks, shows
                 that analyses using our framework reduced may-aliases
                 by 61.9\%, false race reports by 69\%, and path slices
                 by 48.7\%; and detected 7 unknown bugs in well-checked
                 programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Xekalakis:2012:MSM,
  author =       "Polychronis Xekalakis and Nikolas Ioannou and Marcelo
                 Cintra",
  title =        "Mixed speculative multithreaded execution models",
  journal =      j-TACO,
  volume =       "9",
  number =       "3",
  pages =        "18:1--18:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2355585.2355591",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 22 10:48:53 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The current trend toward multicore architectures has
                 placed great pressure on programmers and compilers to
                 generate thread-parallel programs. Improved execution
                 performance can no longer be obtained via traditional
                 single-thread instruction level parallelism (ILP), but,
                 instead, via multithreaded execution. One notable
                 technique that facilitates the extraction of parallel
                 threads from sequential applications is thread-level
                 speculation (TLS). This technique allows
                 programmers/compilers to generate threads without
                 checking for inter-thread data and control dependences,
                 which are then transparently enforced by the hardware.
                 Most prior work on TLS has concentrated on thread
                 selection and mechanisms to efficiently support the
                 main TLS operations, such as squashes, data versioning,
                 and commits. This article seeks to enhance TLS
                 functionality by combining it with other speculative
                 multithreaded execution models. The main idea is that
                 TLS already requires extensive hardware support, which
                 when slightly augmented can accommodate other
                 speculative multithreaded techniques. Recognizing that
                 for different applications, or even program phases, the
                 application bottlenecks may be different, it is
                 reasonable to assume that the more versatile a system
                 is, the more efficiently it will be able to execute the
                 given program. Toward this direction, we first show
                 that mixed execution models that combine TLS with
                 Helper Threads (HT), RunAhead execution (RA) and
                 MultiPath execution (MP) perform better than any of the
                 models alone. Based on a simple model that we propose,
                 we show that benefits come from being able to extract
                 additional ILP without harming the TLP extracted by
                 TLS. We then show that by combining all the execution
                 models in a unified one that combines all these
                 speculative multithreaded models, ILP can be further
                 enhanced with only minimal additional cost in
                 hardware.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Xue:2012:RJC,
  author =       "Jingling Xue",
  title =        "Rethinking {Java} call stack design for tiny embedded
                 devices",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "5",
  pages =        "1--10",
  month =        may,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2345141.2248420",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 6 16:31:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "LCTES '12 proceedings.",
  abstract =     "The ability of tiny embedded devices to run large
                 feature-rich programs is typically constrained by the
                 amount of memory installed on such devices.
                 Furthermore, the useful operation of these devices in
                 wireless sensor applications is limited by their
                 battery life. This paper presents a call stack redesign
                 targeted at an efficient use of RAM storage and CPU
                 cycles by a Java program running on a wireless sensor
                 mote. Without compromising the application programs,
                 our call stack redesign saves 30\% of RAM, on average,
                 evaluated over a large number of benchmarks. On the
                 same set of benchmarks, our design also avoids
                 frequent RAM allocations and deallocations, resulting
                 in average 80\% fewer memory operations and 23\% faster
                 program execution. These may be critical improvements
                 for tiny embedded devices that are equipped with small
                 amount of RAM and limited battery life. However, our
                 call stack redesign is equally effective for any
                 complex multi-threaded object oriented program
                 developed for desktop computers. We describe the
                 redesign, measure its performance and report the
                 resulting savings in RAM and execution time for a wide
                 variety of programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Yamashita:2012:APS,
  author =       "Makoto Yamashita and Katsuki Fujisawa and Mituhiro
                 Fukuda and Kazuhide Nakata and Maho Nakata",
  title =        "{Algorithm 925}: Parallel Solver for Semidefinite
                 Programming Problem having Sparse {Schur} Complement
                 Matrix",
  journal =      j-TOMS,
  volume =       "39",
  number =       "1",
  pages =        "6:1--6:22",
  month =        nov,
  year =         "2012",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/2382585.2382591",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Thu Dec 6 07:36:30 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toms.bib",
  abstract =     "A SemiDefinite Programming (SDP) problem is one of the
                 most central problems in mathematical optimization. SDP
                 provides an effective computation framework for many
                 research fields. Some applications, however, require
                 solving a large-scale SDP whose size exceeds the
                 capacity of a single processor both in terms of
                 computation time and available memory. SDPARA
                 (SemiDefinite Programming Algorithm paRAllel package)
                 [Yamashita et al. 2003b] was designed to solve such
                 large-scale SDPs. Its parallel performance is
                 outstanding for general SDPs in most cases. However,
                 the parallel implementation is less successful for some
                 sparse SDPs obtained from applications such as
                 Polynomial Optimization Problems (POPs) or Sensor
                 Network Localization (SNL) problems, since this version
                 of SDPARA cannot directly handle sparse Schur
                 Complement Matrices (SCMs). In this article we improve
                 SDPARA by focusing on the sparsity of the SCM and we
                 propose a new parallel implementation using the
                 formula-cost-based distribution along with a
                 replacement of the dense Cholesky factorization. We
                 verify numerically that these features are key to
                 solving SDPs with sparse SCMs more quickly on parallel
                 computing systems. The performance is further enhanced
                 by multithreading and the new SDPARA attains
                 considerable scalability in general. It also finds
                 solutions for extremely large-scale SDPs arising from
                 POPs which cannot be obtained by other solvers.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Yu:2012:MCD,
  author =       "Jie Yu and Satish Narayanasamy and Cristiano Pereira
                 and Gilles Pokam",
  title =        "{Maple}: a coverage-driven testing tool for
                 multithreaded programs",
  journal =      j-SIGPLAN,
  volume =       "47",
  number =       "10",
  pages =        "485--502",
  month =        oct,
  year =         "2012",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2398857.2384651",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Nov 15 16:40:23 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Testing multithreaded programs is a hard problem,
                 because it is challenging to expose those rare
                 interleavings that can trigger a concurrency bug. We
                 propose a new thread interleaving coverage-driven
                 testing tool called Maple that seeks to expose untested
                 thread interleavings as much as possible. It memoizes
                 tested interleavings and actively seeks to expose
                 untested interleavings for a given test input to
                 increase interleaving coverage. We discuss several
                 solutions to realize the above goal. First, we discuss
                 a coverage metric based on a set of interleaving
                 idioms. Second, we discuss an online technique to
                 predict untested interleavings that can potentially be
                 exposed for a given test input. Finally, the predicted
                 untested interleavings are exposed by actively
                 controlling the thread schedule while executing for the
                 test input. We discuss our experiences in using the
                 tool to expose several known and unknown bugs in
                 real-world applications such as Apache and MySQL.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "OOPSLA '12 conference proceedings.",
}

@Article{Zhang:2012:SCC,
  author =       "Eddy Zheng Zhang and Yunlian Jiang and Xipeng Shen",
  title =        "The Significance of {CMP} Cache Sharing on
                 Contemporary Multithreaded Applications",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "23",
  number =       "2",
  pages =        "367--374",
  month =        feb,
  year =         "2012",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2011.130",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Mar 01 14:47:13 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Zhuravlev:2012:SST,
  author =       "Sergey Zhuravlev and Juan Carlos Saez and Sergey
                 Blagodurov and Alexandra Fedorova and Manuel Prieto",
  title =        "Survey of scheduling techniques for addressing shared
                 resources in multicore processors",
  journal =      j-COMP-SURV,
  volume =       "45",
  number =       "1",
  pages =        "4:1--4:??",
  month =        nov,
  year =         "2012",
  CODEN =        "CMSVAN",
  DOI =          "https://doi.org/10.1145/2379776.2379780",
  ISSN =         "0360-0300 (print), 1557-7341 (electronic)",
  ISSN-L =       "0360-0300",
  bibdate =      "Thu Dec 6 10:55:59 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/surveys/;
                 https://www.math.utah.edu/pub/tex/bib/compsurv.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Chip multicore processors (CMPs) have emerged as the
                 dominant architecture choice for modern computing
                 platforms and will most likely continue to be dominant
                 well into the foreseeable future. As with any system,
                 CMPs offer a unique set of challenges. Chief among them
                 is the shared resource contention that results because
                 CMP cores are not independent processors but rather
                 share common resources among cores such as the last
                 level cache (LLC). Shared resource contention can lead
                 to severe and unpredictable performance impact on the
                 threads running on the CMP. Conversely, CMPs offer
                 tremendous opportunities for multithreaded
                 applications, which can take advantage of simultaneous
                 thread execution as well as fast inter-thread data
                 sharing. Many solutions have been proposed to deal with
                 the negative aspects of CMPs and take advantage of the
                 positive. This survey focuses on the subset of these
                 solutions that exclusively make use of OS thread-level
                 scheduling to achieve their goals. These solutions are
                 particularly attractive as they require no changes to
                 hardware and minimal or no changes to the OS. The OS
                 scheduler has expanded well beyond its original role of
                 time-multiplexing threads on a single core into a
                 complex and effective resource manager. This article
                 surveys a multitude of new and exciting work that
                 explores the diverse new roles the OS scheduler can
                 successfully take on.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Computing Surveys",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J204",
}

@Article{Beckert:2013:DLD,
  author =       "Bernhard Beckert and Vladimir Klebanov",
  title =        "A {Dynamic Logic} for deductive verification of
                 multi-threaded programs",
  journal =      j-FORM-ASP-COMPUT,
  volume =       "25",
  number =       "3",
  pages =        "405--437",
  month =        may,
  year =         "2013",
  CODEN =        "FACME5",
  DOI =          "https://doi.org/10.1007/s00165-012-0261-4",
  ISSN =         "0934-5043 (print), 1433-299X (electronic)",
  ISSN-L =       "0934-5043",
  bibdate =      "Wed Mar 18 05:35:14 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/formaspcomput.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer.com/article/10.1007/s00165-012-0261-4",
  acknowledgement = ack-nhfb,
  fjournal =     "Formal Aspects of Computing",
  journal-URL =  "http://link.springer.com/journal/165",
}

@Article{Bergan:2013:ICS,
  author =       "Tom Bergan and Luis Ceze and Dan Grossman",
  title =        "Input-covering schedules for multithreaded programs",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "10",
  pages =        "677--692",
  month =        oct,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2544173.2509508",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Dec 9 09:19:33 MST 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "OOPSLA '13 conference proceedings.",
  abstract =     "We propose constraining multithreaded execution to
                 small sets of input-covering schedules, which we define
                 as follows: given a program $P$, we say that a set of
                 schedules $ \Sigma $ covers all inputs of program $P$
                 if, when given any input, $P$'s execution can be
                 constrained to some schedule in $ \Sigma $ and still
                 produce a semantically valid result. Our approach is to
                 first compute a small $ \Sigma $ for a given program
                 $P$, and then, at runtime, constrain $P$'s execution to
                 always follow some schedule in $ \Sigma $, and never
                 deviate. We have designed an algorithm that uses
                 symbolic execution to systematically enumerate a set of
                 input-covering schedules, $ \Sigma $. To deal with
                 programs that run for an unbounded length of time, we
                 partition execution into bounded epochs, find
                 input-covering schedules for each epoch in isolation,
                 and then piece the schedules together at runtime. We
                 have implemented this algorithm along with a
                 constrained execution runtime for pthreads programs,
                 and we report results. Our approach has the following
                 advantage: because all possible runtime schedules are
                 known a priori, we can seek to validate the program by
                 thoroughly verifying each schedule in $ \Sigma $, in
                 isolation, without needing to reason about the huge
                 space of thread interleavings that arises due to
                 conventional nondeterministic execution.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Bois:2013:BGV,
  author =       "Kristof {Du Bois} and Jennifer B. Sartor and Stijn
                 Eyerman and Lieven Eeckhout",
  title =        "Bottle graphs: visualizing scalability bottlenecks in
                 multi-threaded applications",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "10",
  pages =        "355--372",
  month =        oct,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2544173.2509529",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Dec 9 09:19:33 MST 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "OOPSLA '13 conference proceedings.",
  abstract =     "Understanding and analyzing multi-threaded program
                 performance and scalability is far from trivial, which
                 severely complicates parallel software development and
                 optimization. In this paper, we present bottle graphs,
                 a powerful analysis tool that visualizes multi-threaded
                 program performance, in regards to both per-thread
                 parallelism and execution time. Each thread is
                 represented as a box, with its height equal to the
                 share of that thread in the total program execution
                 time, its width equal to its parallelism, and its area
                 equal to its total running time. The boxes of all
                 threads are stacked upon each other, leading to a stack
                 with height equal to the total program execution time.
                 Bottle graphs show exactly how scalable each thread is,
                 and thus guide optimization towards those threads that
                 have a smaller parallel component (narrower), and a
                 larger share of the total execution time (taller), i.e.
                 to the 'neck' of the bottle. Using light-weight OS
                 modules, we calculate bottle graphs for unmodified
                 multi-threaded programs running on real processors with
                 an average overhead of 0.68\%. To demonstrate their
                 utility, we do an extensive analysis of 12 Java
                 benchmarks running on top of the Jikes JVM, which
                 introduces many JVM service threads. We not only reveal
                 and explain scalability limitations of several
                 well-known Java benchmarks; we also analyze the reasons
                 why the garbage collector itself does not scale, and in
                 fact performs optimally with two collector threads for
                 all benchmarks, regardless of the number of application
                 threads. Finally, we compare the scalability of Jikes
                 versus the OpenJDK JVM. We demonstrate how useful and
                 intuitive bottle graphs are as a tool to analyze
                 scalability and help optimize multi-threaded
                 applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Jooybar:2013:GDG,
  author =       "Hadi Jooybar and Wilson W. L. Fung and Mike O'Connor
                 and Joseph Devietti and Tor M. Aamodt",
  title =        "{GPUDet}: a deterministic {GPU} architecture",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "4",
  pages =        "1--12",
  month =        apr,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2499368.2451118",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Jul 1 17:15:23 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Nondeterminism is a key challenge in developing
                 multithreaded applications. Even with the same input,
                 each execution of a multithreaded program may produce a
                 different output. This behavior complicates debugging
                 and limits one's ability to test for correctness. This
                 non-reproducibility situation is aggravated on
                 massively parallel architectures like graphics
                 processing units (GPUs) with thousands of concurrent
                 threads. We believe providing a deterministic
                 environment to ease debugging and testing of GPU
                 applications is essential to enable a broader class of
                 software to use GPUs. Many hardware and software
                 techniques have been proposed for providing determinism
                 on general-purpose multi-core processors. However,
                 these techniques are designed for small numbers of
                 threads. Scaling them to thousands of threads on a GPU
                 is a major challenge. This paper proposes a scalable
                 hardware mechanism, GPUDet, to provide determinism in
                 GPU architectures. In this paper we characterize the
                 existing deterministic and nondeterministic aspects of
                 current GPU execution models, and we use these
                 observations to inform GPUDet's design. For example,
                 GPUDet leverages the inherent determinism of the SIMD
                 hardware in GPUs to provide determinism within a
                 wavefront at no cost. GPUDet also exploits the Z-Buffer
                 Unit, an existing GPU hardware unit for graphics
                 rendering, to allow parallel out-of-order memory writes
                 to produce a deterministic output. Other optimizations
                 in GPUDet include deterministic parallel execution of
                 atomic operations and a workgroup-aware algorithm that
                 eliminates unnecessary global synchronizations. Our
                 simulation results indicate that GPUDet incurs only 2X
                 slowdown on average over a baseline nondeterministic
                 architecture, with runtime overheads as low as 4\% for
                 compute-bound applications, despite running GPU kernels
                 with thousands of threads. We also characterize the
                 sources of overhead for deterministic execution on GPUs
                 to provide insights for further optimizations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Bond:2013:OCC,
  author =       "Michael D. Bond and Milind Kulkarni and Man Cao and
                 Minjia Zhang and Meisam Fathi Salmi and Swarnendu
                 Biswas and Aritra Sengupta and Jipeng Huang",
  title =        "{OCTET}: capturing and controlling cross-thread
                 dependences efficiently",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "10",
  pages =        "693--712",
  month =        oct,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2544173.2509519",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Dec 9 09:19:33 MST 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  note =         "OOPSLA '13 conference proceedings.",
  abstract =     "Parallel programming is essential for reaping the
                 benefits of parallel hardware, but it is notoriously
                 difficult to develop and debug reliable, scalable
                 software systems. One key challenge is that modern
                 languages and systems provide poor support for ensuring
                 concurrency correctness properties --- atomicity,
                 sequential consistency, and multithreaded determinism
                 --- because all existing approaches are impractical.
                 Dynamic, software-based approaches slow programs by up
                 to an order of magnitude because capturing and
                 controlling cross-thread dependences (i.e., conflicting
                 accesses to shared memory) requires synchronization at
                 virtually every access to potentially shared memory.
                 This paper introduces a new software-based concurrency
                 control mechanism called OCTET that soundly captures
                 cross-thread dependences and can be used to build
                 dynamic analyses for concurrency correctness. OCTET
                 achieves low overheads by tracking the locality state
                 of each potentially shared object. Non-conflicting
                 accesses conform to the locality state and require no
                 synchronization; only conflicting accesses require a
                 state change and heavyweight synchronization. This
                 optimistic tradeoff leads to significant efficiency
                 gains in capturing cross-thread dependences: a
                 prototype implementation of OCTET in a high-performance
                 Java virtual machine slows real-world concurrent
                 programs by only 26\% on average. A dependence
                 recorder, suitable for record {\&} replay, built on top
                 of OCTET adds an additional 5\% overhead on average.
                 These results suggest that OCTET can provide a
                 foundation for developing low-overhead analyses that
                 check and enforce concurrency correctness.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Bouajjani:2013:ARP,
  author =       "Ahmed Bouajjani and Michael Emmi",
  title =        "Analysis of Recursively Parallel Programs",
  journal =      j-TOPLAS,
  volume =       "35",
  number =       "3",
  pages =        "10:1--10:??",
  month =        nov,
  year =         "2013",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/2518188",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Fri Nov 8 17:09:04 MST 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toplas.bib",
  abstract =     "We propose a general formal model of isolated
                 hierarchical parallel computations, and identify
                 several fragments to match the concurrency constructs
                 present in real-world programming languages such as
                 Cilk and X10. By associating fundamental formal models
                 (vector addition systems with recursive transitions) to
                 each fragment, we provide a common platform for
                 exposing the relative difficulties of algorithmic
                 reasoning. For each case we measure the complexity of
                 deciding state reachability for finite-data recursive
                 programs, and propose algorithms for the decidable
                 cases. The complexities which include PTIME, NP,
                 EXPSPACE, and 2EXPTIME contrast with undecidable state
                 reachability for recursive multithreaded programs.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

@Article{Buttari:2013:FGM,
  author =       "Alfredo Buttari",
  title =        "Fine-Grained Multithreading for the Multifrontal {$ Q
                 R $} Factorization of Sparse Matrices",
  journal =      j-SIAM-J-SCI-COMP,
  volume =       "35",
  number =       "4",
  pages =        "C323--C345",
  month =        "????",
  year =         "2013",
  CODEN =        "SJOCE3",
  DOI =          "https://doi.org/10.1137/110846427",
  ISSN =         "1064-8275 (print), 1095-7197 (electronic)",
  ISSN-L =       "1064-8275",
  bibdate =      "Fri Jul 19 07:44:01 MDT 2013",
  bibsource =    "http://epubs.siam.org/sam-bin/dbq/toc/SISC/35/4;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/siamjscicomput.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "SIAM Journal on Scientific Computing",
  journal-URL =  "http://epubs.siam.org/sisc",
  onlinedate =   "January 2013",
}

@Article{Cabodi:2013:TBM,
  author =       "Gianpiero Cabodi and Sergio Nocco and Stefano Quer",
  title =        "Thread-based multi-engine model checking for multicore
                 platforms",
  journal =      j-TODAES,
  volume =       "18",
  number =       "3",
  pages =        "36:1--36:??",
  month =        jul,
  year =         "2013",
  CODEN =        "ATASFO",
  DOI =          "https://doi.org/10.1145/2491477.2491480",
  ISSN =         "1084-4309 (print), 1557-7309 (electronic)",
  ISSN-L =       "1084-4309",
  bibdate =      "Sat Jul 27 08:09:07 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/todaes/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/todaes.bib",
  abstract =     "This article describes a multithreaded,
                 portfolio-based approach to model checking, where
                 multiple cores are exploited as the underlying
                 computing framework to support concurrent execution of
                 cooperative engines. We introduce a portfolio-based
                 approach to model checking. Our portfolio is first
                 driven by an approximate runtime predictor that
                 provides a heuristic approximation to a perfect oracle
                 and suggests which engines are more suitable for each
                 verification instance. Scalability and robustness of
                 the overall model-checking effort highly rely on a
                 concurrent, multithreaded model of execution. Following
                 similar approaches in related application fields, we
                 dovetail data partitioning, focused on proving several
                 properties in parallel, and engine partitioning, based
                 on concurrent runs of different model-checking engines
                 competing for completion of the same problem. We
                 investigate concurrency not only to effectively exploit
                 several available engines, which operate independently,
                 but also to show that a cooperative effort is possible.
                 In this case, we adopt a straightforward, light-weight,
                 model of inter-engine communication and data sharing.
                 We provide a detailed description of the ideas,
                 algorithms, and experimental results obtained on the
                 benchmarks from the Hardware Model Checking Competition
                 suites (HWMCC'10 and HWMCC'11).",
  acknowledgement = ack-nhfb,
  articleno =    "36",
  fjournal =     "ACM Transactions on Design Automation of Electronic
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J776",
}

@Article{Cai:2013:TST,
  author =       "Yan Cai and Ke Zhai and Shangru Wu and W. K. Chan",
  title =        "{TeamWork}: synchronizing threads globally to detect
                 real deadlocks for multithreaded programs",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "8",
  pages =        "311--312",
  month =        aug,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2517327.2442560",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 26 13:48:51 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPoPP '13 Conference proceedings.",
  abstract =     "This paper presents the aim of TeamWork, our ongoing
                 effort to develop a comprehensive dynamic deadlock
                 confirmation tool for multithreaded programs. It also
                 presents a refined object abstraction algorithm that
                 refines the existing stack hash abstraction.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Cain:2013:RAS,
  author =       "Harold W. Cain and Maged M. Michael and Brad Frey and
                 Cathy May and Derek Williams and Hung Le",
  title =        "Robust architectural support for transactional memory
                 in the {Power} architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "225--236",
  month =        jun,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2508148.2485942",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "On the twentieth anniversary of the original
                 publication [10], following ten years of intense
                 activity in the research literature, hardware support
                 for transactional memory (TM) has finally become a
                 commercial reality, with HTM-enabled chips currently or
                 soon-to-be available from many hardware vendors. In
                 this paper we describe architectural support for TM
                 added to a future version of the Power ISA{\TM}. Two
                 imperatives drove the development: the desire to
                 complement our weakly-consistent memory model with a
                 more friendly interface to simplify the development and
                 porting of multithreaded applications, and the need for
                 robustness beyond that of some early implementations.
                 In the process of commercializing the feature, we had
                 to resolve some previously unexplored interactions
                 between TM and existing features of the ISA, for
                 example translation shootdown, interrupt handling,
                 atomic read-modify-write primitives, and our weakly
                 consistent memory model. We describe these
                 interactions, the overall architecture, and discuss the
                 motivation and rationale for our choices of
                 architectural semantics, beyond what is typically found
                 in reference manuals.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Chung:2013:LBD,
  author =       "Eric S. Chung and John D. Davis and Jaewon Lee",
  title =        "{LINQits}: big data on little clients",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "261--272",
  month =        jun,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2508148.2485945",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "We present LINQits, a flexible hardware template that
                 can be mapped onto programmable logic or ASICs in a
                 heterogeneous system-on-chip for a mobile device or
                 server. Unlike fixed-function accelerators, LINQits
                 accelerates a domain-specific query language called
                 LINQ. LINQits does not provide coverage for all
                 possible applications --- however, existing
                 applications (re-)written with LINQ in mind benefit
                 extensively from hardware acceleration. Furthermore,
                 the LINQits framework offers a graceful and transparent
                 migration path from software to hardware. LINQits is
                 prototyped on a 2W heterogeneous SoC called the ZYNQ
                 processor, which combines dual ARM A9 processors with
                 an FPGA on a single die in 28nm silicon technology. Our
                 physical measurements show that LINQits improves energy
                 efficiency by 8.9 to 30.6 times and performance by 10.7
                 to 38.1 times compared to optimized, multithreaded C
                 programs running on conventional ARM A9 processors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Demange:2013:PBB,
  author =       "Delphine Demange and Vincent Laporte and Lei Zhao and
                 Suresh Jagannathan and David Pichardie and Jan Vitek",
  title =        "{Plan B}: a buffered memory model for {Java}",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "1",
  pages =        "329--342",
  month =        jan,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2480359.2429110",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Jul 1 17:15:03 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Recent advances in verification have made it possible
                 to envision trusted implementations of real-world
                 languages. Java with its type-safety and fully
                 specified semantics would appear to be an ideal
                 candidate; yet, the complexity of the translation steps
                 used in production virtual machines have made it a
                 challenging target for verifying compiler technology.
                 One of Java's key innovations, its memory model, poses
                 significant obstacles to such an endeavor. The Java
                 Memory Model is an ambitious attempt at specifying the
                 behavior of multithreaded programs in a portable,
                 hardware agnostic, way. While experts have an intuitive
                 grasp of the properties that the model should enjoy,
                 the specification is complex and not well-suited for
                 integration within a verifying compiler infrastructure.
                 Moreover, the specification is given in an axiomatic
                 style that is distant from the intuitive
                 reordering-based reasonings traditionally used to
                 justify or rule out behaviors, and ill suited to the
                 kind of operational reasoning one would expect to
                 employ in a compiler. This paper takes a step back, and
                 introduces a Buffered Memory Model (BMM) for Java. We
                 choose a pragmatic point in the design space
                 sacrificing generality in favor of a model that is
                 fully characterized in terms of the reorderings it
                 allows, amenable to formal reasoning, and which can be
                 efficiently applied to a specific hardware family,
                 namely x86 multiprocessors. Although the BMM restricts
                 the reorderings compilers are allowed to perform, it
                 serves as the key enabling device to achieving a
                 verification pathway from bytecode to machine
                 instructions. Despite its restrictions, we show that it
                 is backwards compatible with the Java Memory Model and
                 that it does not cripple performance on TSO
                 architectures.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "POPL '13 conference proceedings.",
}

@Article{DuBois:2013:CSI,
  author =       "Kristof {Du Bois} and Stijn Eyerman and Jennifer B.
                 Sartor and Lieven Eeckhout",
  title =        "Criticality stacks: identifying critical threads in
                 parallel programs using synchronization behavior",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "511--522",
  month =        jun,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2508148.2485966",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Analyzing multi-threaded programs is quite
                 challenging, but is necessary to obtain good multicore
                 performance while saving energy. Due to
                 synchronization, certain threads make others wait,
                 because they hold a lock or have yet to reach a
                 barrier. We call these critical threads, i.e., threads
                 whose performance is determinative of program
                 performance as a whole. Identifying these threads can
                 reveal numerous optimization opportunities, for the
                 software developer and for hardware. In this paper, we
                 propose a new metric for assessing thread criticality,
                 which combines both how much time a thread is
                 performing useful work and how many co-running threads
                 are waiting. We show how thread criticality can be
                 calculated online with modest hardware additions and
                 with low overhead. We use our metric to create
                 criticality stacks that break total execution time into
                 each thread's criticality component, allowing for easy
                 visual analysis of parallel imbalance. To validate our
                 criticality metric, and demonstrate it is better than
                 previous metrics, we scale the frequency of the most
                 critical thread and show it achieves the largest
                 performance improvement. We then demonstrate the broad
                 applicability of criticality stacks by using them to
                 perform three types of optimizations: (1) program
                 analysis to remove parallel bottlenecks, (2)
                 dynamically identifying the most critical thread and
                 accelerating it using frequency scaling to improve
                 performance, and (3) showing that accelerating only the
                 most critical thread allows for targeted energy
                 reduction.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Ediger:2013:GMA,
  author =       "David Ediger and Karl Jiang and E. Jason Riedy and
                 David A. Bader",
  title =        "{GraphCT}: Multithreaded Algorithms for Massive Graph
                 Analysis",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "24",
  number =       "11",
  pages =        "2220--2229",
  month =        nov,
  year =         "2013",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2012.323",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Fri Nov 15 10:31:20 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Ferrara:2013:GSA,
  author =       "P. Ferrara",
  title =        "A generic static analyzer for multithreaded {Java}
                 programs",
  journal =      j-SPE,
  volume =       "43",
  number =       "6",
  pages =        "663--684",
  month =        jun,
  year =         "2013",
  CODEN =        "SPEXBL",
  DOI =          "https://doi.org/10.1002/spe.2126",
  ISSN =         "0038-0644 (print), 1097-024X (electronic)",
  ISSN-L =       "0038-0644",
  bibdate =      "Tue Dec 3 10:30:05 MST 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/spe.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Software --- Practice and Experience",
  journal-URL =  "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
  onlinedate =   "9 May 2012",
}

@Article{Honarmand:2013:CUA,
  author =       "Nima Honarmand and Nathan Dautenhahn and Josep
                 Torrellas and Samuel T. King and Gilles Pokam and
                 Cristiano Pereira",
  title =        "{Cyrus}: unintrusive application-level record-replay
                 for replay parallelism",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "4",
  pages =        "193--206",
  month =        apr,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2499368.2451138",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Jul 1 17:15:23 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Architectures for deterministic record-replay (R\&R)
                 of multithreaded code are attractive for program
                 debugging, intrusion analysis, and fault-tolerance
                 uses. However, very few of the proposed designs have
                 focused on maximizing replay speed --- a key enabling
                 property of these systems. The few efforts that focus
                 on replay speed require intrusive hardware or software
                 modifications, or target whole-system R\&R rather than
                 the more useful application-level R\&R. This paper
                 presents the first hardware-based scheme for
                 unintrusive, application-level R\&R that explicitly
                 targets high replay speed. Our scheme, called Cyrus,
                 requires no modification to commodity snoopy cache
                 coherence. It introduces the concept of an on-the-fly
                 software Backend Pass during recording which, as the
                 log is being generated, transforms it for high replay
                 parallelism. This pass also fixes-up the log, and can
                 flexibly trade-off replay parallelism for log size. We
                 analyze the performance of Cyrus using full system (OS
                 plus hardware) simulation. Our results show that Cyrus
                 has negligible recording overhead. In addition, for
                 8-processor runs of SPLASH-2, Cyrus attains an average
                 replay parallelism of 5, and a replay speed that is, on
                 average, only about 50\% lower than the recording
                 speed.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Huang:2013:CRL,
  author =       "Jeff Huang and Charles Zhang and Julian Dolby",
  title =        "{CLAP}: recording local executions to reproduce
                 concurrency failures",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "6",
  pages =        "141--152",
  month =        jun,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2499370.2462167",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Jul 1 17:15:38 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "We present CLAP, a new technique to reproduce
                 concurrency bugs. CLAP has two key steps. First, it
                 logs thread local execution paths at runtime. Second,
                 offline, it computes memory dependencies that accord
                 with the logged execution and are able to reproduce the
                 observed bug. The second step works by combining
                 constraints from the thread paths and constraints based
                 on a memory model, and computing an execution with a
                 constraint solver. CLAP has four major advantages.
                 First, logging purely local execution of each thread is
                 substantially cheaper than logging memory interactions,
                 which enables CLAP to be efficient compared to previous
                 approaches. Second, our logging does not require any
                 synchronization and hence with no added memory barriers
                 or fences; this minimizes perturbation and missed bugs
                 due to extra synchronizations foreclosing certain racy
                 behaviors. Third, since it uses no synchronization, we
                 extend CLAP to work on a range of relaxed memory
                 models, such as TSO and PSO, in addition to sequential
                 consistency. Fourth, CLAP can compute a much simpler
                 execution than the original one, that reveals the bug
                 with minimal thread context switches. To mitigate the
                 scalability issues, we also present an approach to
                 parallelize constraint solving, which theoretically
                 scales our technique to programs with arbitrary
                 execution length. Experimental results on a variety of
                 multithreaded benchmarks and real world concurrent
                 applications validate these advantages by showing that
                 our technique is effective in reproducing concurrency
                 bugs even under relaxed memory models; furthermore, it
                 is significantly more efficient than a state-of-the-art
                 technique that records shared memory dependencies,
                 reducing execution time overhead by 45\% and log size
                 by 88\% on average.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '13 conference proceedings.",
}

@Article{Hunt:2013:DTN,
  author =       "Nicholas Hunt and Tom Bergan and Luis Ceze and Steven
                 D. Gribble",
  title =        "{DDOS}: taming nondeterminism in distributed systems",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "4",
  pages =        "499--508",
  month =        apr,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2499368.2451170",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Jul 1 17:15:23 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Nondeterminism complicates the development and
                 management of distributed systems, and arises from two
                 main sources: the local behavior of each individual
                 node as well as the behavior of the network connecting
                 them. Taming nondeterminism effectively requires
                 dealing with both sources. This paper proposes DDOS, a
                 system that leverages prior work on deterministic
                 multithreading to offer: (1) space-efficient
                 record/replay of distributed systems; and (2) fully
                 deterministic distributed behavior. Leveraging
                 deterministic behavior at each node makes outgoing
                 messages strictly a function of explicit inputs. This
                 allows us to record the system by logging just
                 message's arrival time, not the contents. Going
                 further, we propose and implement an algorithm that
                 makes all communication between nodes deterministic by
                 scheduling communication onto a global logical
                 timeline. We implement both algorithms in a system
                 called DDOS and evaluate our system with parallel
                 scientific applications, an HTTP/memcached system and a
                 distributed microbenchmark with a high volume of
                 peer-to-peer communication. Our results show up to two
                 orders of magnitude reduction in log size of
                 record/replay, and that distributed systems can be made
                 deterministic with an order of magnitude of overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Joao:2013:UBA,
  author =       "Jos{\'e} A. Joao and M. Aater Suleman and Onur Mutlu
                 and Yale N. Patt",
  title =        "Utility-based acceleration of multithreaded
                 applications on asymmetric {CMPs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "154--165",
  month =        jun,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2508148.2485936",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Asymmetric Chip Multiprocessors (ACMPs) are becoming a
                 reality. ACMPs can speed up parallel applications if
                 they can identify and accelerate code segments that are
                 critical for performance. Proposals already exist for
                 using coarse-grained thread scheduling and fine-grained
                 bottleneck acceleration. Unfortunately, there have been
                 no proposals offered thus far to decide which code
                 segments to accelerate in cases where both
                 coarse-grained thread scheduling and fine-grained
                 bottleneck acceleration could have value. This paper
                 proposes Utility-Based Acceleration of Multithreaded
                 Applications on Asymmetric CMPs (UBA), a cooperative
                 software/hardware mechanism for identifying and
                 accelerating the most likely critical code segments
                 from a set of multithreaded applications running on an
                 ACMP. The key idea is a new Utility of Acceleration
                 metric that quantifies the performance benefit of
                 accelerating a bottleneck or a thread by taking into
                 account both the criticality and the expected speedup.
                 UBA outperforms the best of two state-of-the-art
                 mechanisms by 11\% for single application workloads and
                 by 7\% for two-application workloads on an ACMP with 52
                 small cores and 3 large cores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Kambadur:2013:PSP,
  author =       "Melanie Kambadur and Kui Tang and Joshua Lopez and
                 Martha A. Kim",
  title =        "Parallel scaling properties from a basic block view",
  journal =      j-SIGMETRICS,
  volume =       "41",
  number =       "1",
  pages =        "365--366",
  month =        jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2494232.2465748",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Fri Feb 28 06:09:59 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigmetrics.bib",
  abstract =     "As software scalability lags behind hardware
                 parallelism, understanding scaling behavior is more
                 important than ever. This paper demonstrates how to use
                 Parallel Block Vector (PBV) profiles to measure the
                 scaling properties of multithreaded programs from a new
                 perspective: the basic block's view. Through this lens,
                 we guide users through quick and simple methods to
                 produce high-resolution application scaling analyses.
                 This method requires no manual program modification,
                 new hardware, or lengthy simulations, and captures the
                 impact of architecture, operating systems, threading
                 models, and inputs. We apply these techniques to a set
                 of parallel benchmarks, and, as an example, demonstrate
                 that when it comes to scaling, functions in an
                 application do not behave monolithically.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
}

@Article{Kim:2013:DBC,
  author =       "Hwanju Kim and Sangwook Kim and Jinkyu Jeong and
                 Joonwon Lee and Seungryoul Maeng",
  title =        "Demand-based coordinated scheduling for {SMP VMs}",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "4",
  pages =        "369--380",
  month =        apr,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2499368.2451156",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Jul 1 17:15:23 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "As processor architectures have been enhancing their
                 computing capacity by increasing core counts,
                 independent workloads can be consolidated on a single
                 node for the sake of high resource efficiency in data
                 centers. With the prevalence of virtualization
                 technology, each individual workload can be hosted on a
                 virtual machine for strong isolation between co-located
                 workloads. Along with this trend, hosted applications
                 have increasingly been multithreaded to take advantage
                 of improved hardware parallelism. Although the
                 performance of many multithreaded applications highly
                 depends on communication (or synchronization) latency,
                 existing schemes of virtual machine scheduling do not
                 explicitly coordinate virtual CPUs based on their
                 communication behaviors. This paper presents a
                 demand-based coordinated scheduling scheme for
                 consolidated virtual machines that host multithreaded
                 workloads. To this end, we propose communication-driven
                 scheduling that controls time-sharing in response to
                 inter-processor interrupts (IPIs) between virtual CPUs.
                 On the basis of in-depth analysis on the relationship
                 between IPI communications and coordination demands, we
                 devise IPI-driven coscheduling and delayed preemption
                 schemes, which effectively reduce synchronization
                 latency and unnecessary CPU consumption. In addition,
                 we introduce a load-conscious CPU allocation policy in
                 order to address load imbalance in heterogeneously
                 consolidated environments. The proposed schemes are
                 evaluated with respect to various scenarios of mixed
                 workloads using the PARSEC multithreaded applications.
                 In the evaluation, our scheme improves the overall
                 performance of consolidated workloads, especially
                 communication-intensive applications, by reducing
                 inefficient synchronization latency.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{LaFratta:2013:EEM,
  author =       "Patrick A. {La Fratta} and Peter M. Kogge",
  title =        "Energy-efficient multithreading for a hierarchical
                 heterogeneous multicore through locality-cognizant
                 thread generation",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "73",
  number =       "12",
  pages =        "1551--1562",
  month =        dec,
  year =         "2013",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Nov 29 09:55:28 MST 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0743731513001494",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Lobeiras:2013:PSW,
  author =       "Jacobo Lobeiras and Mois{\'e}s Vi{\~n}as and Margarita
                 Amor and Basilio B. Fraguela and Manuel Arenaz and J.
                 A. Garc{\'\i}a and M. J. Castro",
  title =        "Parallelization of shallow water simulations on
                 current multi-threaded systems",
  journal =      j-IJHPCA,
  volume =       "27",
  number =       "4",
  pages =        "493--512",
  month =        nov,
  year =         "2013",
  CODEN =        "IHPCFL",
  DOI =          "https://doi.org/10.1177/1094342012464800",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Fri Mar 14 15:39:57 MDT 2014",
  bibsource =    "http://hpc.sagepub.com/content/27/4.toc;
                 https://www.math.utah.edu/pub/tex/bib/ijsa.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://hpc.sagepub.com/content/27/4/493.full.pdf+html",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of High Performance Computing
                 Applications",
  journal-URL =  "http://hpc.sagepub.com/content/by/year",
  onlinedate =   "December 5, 2012",
}

@Article{Lu:2013:REM,
  author =       "Kai Lu and Xu Zhou and Xiaoping Wang and Wenzhe Zhang
                 and Gen Li",
  title =        "{RaceFree}: an efficient multi-threading model for
                 determinism",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "8",
  pages =        "297--298",
  month =        aug,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2517327.2442553",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Aug 26 13:48:51 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "PPoPP '13 Conference proceedings.",
  abstract =     "Current deterministic systems generally incur large
                 overhead due to the difficulty of detecting and
                 eliminating data races. This paper presents RaceFree, a
                 novel multi-threading runtime that adopts a relaxed
                 deterministic model to provide a data-race-free
                 environment for parallel programs. This model cuts off
                 unnecessary shared-memory communication by isolating
                 threads in separated memories, which eliminates direct
                 data races. Meanwhile, we leverage the happen-before
                 relation defined by applications themselves as one-way
                 communication pipes to perform necessary thread
                 communication. Shared-memory communication is
                 transparently converted to message-passing style
                 communication by our Memory Modification Propagation
                 (MMP) mechanism, which propagates local memory
                 modifications to other threads through the
                 happen-before relation pipes. The overhead of RaceFree
                 is 67.2\% according to our tests on parallel
                 benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Lucia:2013:CEF,
  author =       "Brandon Lucia and Luis Ceze",
  title =        "Cooperative empirical failure avoidance for
                 multithreaded programs",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "4",
  pages =        "39--50",
  month =        apr,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2499368.2451121",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Jul 1 17:15:23 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Concurrency errors in multithreaded programs are
                 difficult to find and fix. We propose Aviso, a system
                 for avoiding schedule-dependent failures. Aviso
                 monitors events during a program's execution and, when
                 a failure occurs, records a history of events from the
                 failing execution. It uses this history to generate
                 schedule constraints that perturb the order of events
                 in the execution and thereby avoids schedules that lead
                 to failures in future program executions. Aviso
                 leverages scenarios where many instances of the same
                 software run, using a statistical model of program
                 behavior and experimentation to determine which
                 constraints most effectively avoid failures. After
                 implementing Aviso, we showed that it decreased failure
                 rates for a variety of important desktop, server, and
                 cloud applications by orders of magnitude, with an
                 average overhead of less than 20\% and, in some cases,
                 as low as 5\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Mahafzah:2013:PAM,
  author =       "Basel A. Mahafzah",
  title =        "Performance assessment of multithreaded quicksort
                 algorithm on simultaneous multithreaded architecture",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "66",
  number =       "1",
  pages =        "339--363",
  month =        oct,
  year =         "2013",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-013-0910-2",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Feb 8 10:21:52 MST 2014",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=66&issue=1;
                 https://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer.com/article/10.1007/s11227-013-0910-2",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{McCreesh:2013:MTS,
  author =       "Ciaran McCreesh and Patrick Prosser",
  title =        "Multi-Threading a State-of-the-Art Maximum Clique
                 Algorithm",
  journal =      j-ALGORITHMS-BASEL,
  volume =       "6",
  number =       "4",
  pages =        "618--635",
  month =        dec,
  year =         "2013",
  CODEN =        "ALGOCH",
  DOI =          "https://doi.org/10.3390/a6040618",
  ISSN =         "1999-4893 (electronic)",
  ISSN-L =       "1999-4893",
  bibdate =      "Fri May 3 13:50:13 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/algorithms.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://www.mdpi.com/1999-4893/6/4/618",
  acknowledgement = ack-nhfb,
  fjournal =     "Algorithms (Basel)",
  journal-URL =  "https://www.mdpi.com/journal/algorithms",
  pubdates =     "Received: 15 August 2013 / Revised: 13 September 2013
                 / Accepted: 18 September 2013 / Published: 3 October
                 2013",
}

@Article{Norris:2013:CCC,
  author =       "Brian Norris and Brian Demsky",
  title =        "{CDSChecker}: checking concurrent data structures
                 written with {C\slash C++} atomics",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "10",
  pages =        "131--150",
  month =        oct,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2544173.2509514",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Dec 9 09:19:33 MST 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "OOPSLA '13 conference proceedings.",
  abstract =     "Writing low-level concurrent software has
                 traditionally required intimate knowledge of the entire
                 toolchain and often has involved coding in assembly.
                 New language standards have extended C and C++ with
                 support for low-level atomic operations and a weak
                 memory model, enabling developers to write portable and
                 efficient multithreaded code. Developing correct
                 low-level concurrent code is well-known to be
                 especially difficult under a weak memory model, where
                 code behavior can be surprising. Building reliable
                 concurrent software using C/C++ low-level atomic
                 operations will likely require tools that help
                 developers discover unexpected program behaviors. In
                 this paper we present CDSChecker, a tool for
                 exhaustively exploring the behaviors of concurrent code
                 under the C/C++ memory model. We develop several novel
                 techniques for modeling the relaxed behaviors allowed
                 by the memory model and for minimizing the number of
                 execution behaviors that CDSChecker must explore. We
                 have used CDSChecker to exhaustively unit test several
                 concurrent data structure implementations on specific
                 inputs and have discovered errors in both a recently
                 published C11 implementation of a work-stealing queue
                 and a single producer, single consumer queue
                 implementation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Ossner:2013:GMB,
  author =       "Christopher O{\ss}ner and Klemens B{\"o}hm",
  title =        "Graphs for Mining-Based Defect Localization in
                 Multithreaded Programs",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "41",
  number =       "4",
  pages =        "570--593",
  month =        aug,
  year =         "2013",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-012-0237-2",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Sat Jun 22 12:29:22 MDT 2013",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=41&issue=4;
                 https://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer.com/article/10.1007/s10766-012-0237-2",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Parashar:2013:TIC,
  author =       "Angshuman Parashar and Michael Pellauer and Michael
                 Adler and Bushra Ahsan and Neal Crago and Daniel Lustig
                 and Vladimir Pavlov and Antonia Zhai and Mohit Gambhir
                 and Aamer Jaleel and Randy Allmon and Rachid Rayess and
                 Stephen Maresh and Joel Emer",
  title =        "Triggered instructions: a control paradigm for
                 spatially-programmed architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "142--153",
  month =        jun,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2508148.2485935",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "In this paper, we present triggered instructions, a
                 novel control paradigm for arrays of processing
                 elements (PEs) aimed at exploiting spatial parallelism.
                 Triggered instructions completely eliminate the program
                 counter and allow programs to transition concisely
                 between states without explicit branch instructions.
                 They also allow efficient reactivity to inter-PE
                 communication traffic. The approach provides a unified
                 mechanism to avoid over-serialized execution,
                 essentially achieving the effect of techniques such as
                 dynamic instruction reordering and multithreading,
                 which each require distinct hardware mechanisms in a
                 traditional sequential architecture. Our analysis shows
                 that a triggered-instruction based spatial accelerator
                 can achieve 8X greater area-normalized performance than
                 a traditional general-purpose processor. Further
                 analysis shows that triggered control reduces the
                 number of static and dynamic instructions in the
                 critical paths by 62\% and 64\% respectively over a
                 program-counter style spatial baseline, resulting in a
                 speedup of 2.0X.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Pokam:2013:QPI,
  author =       "Gilles Pokam and Klaus Danne and Cristiano Pereira and
                 Rolf Kassa and Tim Kranich and Shiliang Hu and Justin
                 Gottschlich and Nima Honarmand and Nathan Dautenhahn
                 and Samuel T. King and Josep Torrellas",
  title =        "{QuickRec}: prototyping an {Intel} architecture
                 extension for record and replay of multithreaded
                 programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "643--654",
  month =        jun,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2508148.2485977",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "There has been significant interest in
                 hardware-assisted deterministic Record and Replay (RnR)
                 systems for multithreaded programs on multiprocessors.
                 However, no proposal has implemented this technique in
                 a hardware prototype with full operating system
                 support. Such an implementation is needed to assess RnR
                 practicality. This paper presents QuickRec, the first
                 multicore Intel Architecture (IA) prototype of RnR for
                 multithreaded programs. QuickRec is based on QuickIA,
                 an Intel emulation platform for rapid prototyping of
                 new IA extensions. QuickRec is composed of a Xeon
                 server platform with FPGA-emulated second-generation
                 Pentium cores, and Capo3, a full software stack for
                 managing the recording hardware from within a modified
                 Linux kernel. This paper's focus is understanding and
                 evaluating the implementation issues of RnR on a real
                 platform. Our effort leads to some lessons learned, as
                 well as to some pointers for future research. We
                 demonstrate that RnR can be implemented efficiently on
                 a real multicore IA system. In particular, we show that
                 the rate of memory log generation is insignificant, and
                 that the recording hardware has negligible performance
                 overhead. However, the software stack incurs an average
                 recording overhead of nearly 13\%, which must be
                 reduced to enable always-on use of RnR.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Raychev:2013:ERD,
  author =       "Veselin Raychev and Martin Vechev and Manu Sridharan",
  title =        "Effective race detection for event-driven programs",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "10",
  pages =        "151--166",
  month =        oct,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2544173.2509538",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Dec 9 09:19:33 MST 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  note =         "OOPSLA '13 conference proceedings.",
  abstract =     "Like shared-memory multi-threaded programs,
                 event-driven programs such as client-side web
                 applications are susceptible to data races that are
                 hard to reproduce and debug. Race detection for such
                 programs is hampered by their pervasive use of ad hoc
                 synchronization, which can lead to a prohibitive number
                 of false positives. Race detection also faces a
                 scalability challenge, as a large number of
                 short-running event handlers can quickly overwhelm
                 standard vector-clock-based techniques. This paper
                 presents several novel contributions that address both
                 of these challenges. First, we introduce race coverage,
                 a systematic method for exposing ad hoc synchronization
                 and other (potentially harmful) races to the user,
                 significantly reducing false positives. Second, we
                 present an efficient connectivity algorithm for
                 computing race coverage. The algorithm is based on
                 chain decomposition and leverages the structure of
                 event-driven programs to dramatically decrease the
                 overhead of vector clocks. We implemented our
                 techniques in a tool called EventRacer and evaluated it
                 on a number of public web sites. The results indicate
                 substantial performance and precision improvements of
                 our approach over the state-of-the-art. Using
                 EventRacer, we found many harmful races, most of which
                 are beyond the reach of current techniques.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
}

@Article{Saez:2013:DFP,
  author =       "Juan Carlos S{\'a}ez and Fernando Castro and Daniel
                 Chaver and Manuel Prieto",
  title =        "Delivering fairness and priority enforcement on
                 asymmetric multicore systems via {OS} scheduling",
  journal =      j-SIGMETRICS,
  volume =       "41",
  number =       "1",
  pages =        "343--344",
  month =        jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2494232.2465532",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Fri Feb 28 06:09:59 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigmetrics.bib",
  abstract =     "Symmetric-ISA (instruction set architecture)
                 asymmetric-performance multicore processors (AMPs) were
                 shown to deliver higher performance per watt and area
                 than symmetric CMPs for applications with diverse
                 architectural requirements. So, it is likely that
                 future multicore processors will combine big
                 power-hungry fast cores and small low-power slow ones.
                 In this paper, we propose a novel thread scheduling
                 algorithm that aims to improve the throughput-fairness
                 trade-off on AMP systems. Our experimental evaluation
                 on real hardware and using scheduler implementations on
                 a general-purpose operating system, reveals that our
                 proposal delivers a better throughput-fairness
                 trade-off than previous schedulers for a wide variety
                 of multi-application workloads including
                 single-threaded and multithreaded applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
}

@Article{Sinenian:2013:MMS,
  author =       "Nareg Sinenian and Alex B. Zylstra and Mario J.-E.
                 Manuel and Johan A. Frenje and Atma D. Kanojia and
                 Joshua Stillerman and Richard D. Petrasso",
  title =        "A Multithreaded Modular Software Toolkit for Control
                 of Complex Experiments",
  journal =      j-COMPUT-SCI-ENG,
  volume =       "15",
  number =       "1",
  pages =        "66--75",
  month =        jan # "\slash " # feb,
  year =         "2013",
  CODEN =        "CSENFA",
  DOI =          "https://doi.org/10.1109/MCSE.2012.34",
  ISSN =         "1521-9615",
  ISSN-L =       "1521-9615",
  bibdate =      "Fri Jun 21 08:34:49 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/computscieng.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing in Science and Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992",
}

@Article{So:2013:STI,
  author =       "Won So and Alexander G. Dean",
  title =        "Software thread integration for instruction-level
                 parallelism",
  journal =      j-TECS,
  volume =       "13",
  number =       "1",
  pages =        "8:1--8:??",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2512466",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Sep 5 19:03:11 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tecs.bib",
  abstract =     "Multimedia applications require a significantly higher
                 level of performance than previous workloads of
                 embedded systems. They have driven digital signal
                 processor (DSP) makers to adopt high-performance
                 architectures like VLIW (Very-Long Instruction Word).
                 Despite many efforts to exploit instruction-level
                 parallelism (ILP) in the application, the speed is a
                 fraction of what it could be, limited by the difficulty
                 of finding enough independent instructions to keep all
                 of the processor's functional units busy. This article
                 proposes Software Thread Integration (STI) for
                 instruction-level parallelism. STI is a software
                 technique for interleaving multiple threads of control
                 into a single implicitly multithreaded one. We use STI
                 to improve the performance on ILP processors by merging
                 parallel procedures into one, increasing the compiler's
                 scope and hence allowing it to create a more efficient
                 instruction schedule. Assuming the parallel procedures
                 are given, we define a methodology for finding the best
                 performing integrated procedure with a minimum
                 compilation time. We quantitatively estimate the
                 performance impact of integration, allowing various
                 integration scenarios to be compared and ranked via
                 profitability analysis. During integration of threads,
                 different ILP-improving code transformations are
                 selectively applied according to the control structure
                 and the ILP characteristics of the code, driven by
                 interactions with software pipelining. The estimated
                 profitability is verified and corrected by an iterative
                 compilation approach, compensating for possible
                 estimation inaccuracy. Our modeling methods combined
                 with limited compilation quickly find the best
                 integration scenario without requiring exhaustive
                 integration.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?&idx=J840",
}

@Article{Taft:2013:TPS,
  author =       "S. Tucker Taft",
  title =        "Tutorial: proving safety of parallel \slash
                 multi-threaded programs",
  journal =      j-SIGADA-LETTERS,
  volume =       "33",
  number =       "3",
  pages =        "1--2",
  month =        dec,
  year =         "2013",
  CODEN =        "AALEE5",
  DOI =          "https://doi.org/10.1145/2658982.2527285",
  ISSN =         "1094-3641 (print), 1557-9476 (electronic)",
  ISSN-L =       "1094-3641",
  bibdate =      "Wed Sep 3 16:38:30 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigada.bib",
  abstract =     "This tutorial will introduce the attendees to analysis
                 and proof techniques for programs using parallelism and
                 multi-threading. There are no specific prerequisites,
                 but a familiarity with the notions of preconditions and
                 postconditions, aliasing, race conditions, and
                 deadlocks would be of value. The examples will be based
                 on the threading and parallelism models of Java, Ada,
                 and two new parallel languages, one called ParaSail [4]
                  and another, inspired by the verifiable SPARK [1][2]
                  subset of Ada, called Sparkel [3]. We will introduce the
                 distinction between safety and liveness properties, and
                 then focus primarily on techniques for the verification
                 of safety properties, including the absence of race
                 conditions and deadlocks. We will also discuss the
                 issue of determinism vs. non-determinism in parallel
                 and multi-threaded programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGADA Ada Letters",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J32",
  remark =       "HILT '13 conference proceedings.",
}

@Article{Tembey:2013:SSS,
  author =       "Priyanka Tembey and Augusto Vega and Alper
                 Buyuktosunoglu and Dilma Da Silva and Pradip Bose",
  title =        "{SMT} switch: Software Mechanisms for Power Shifting",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "67--70",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.26",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Simultaneous multithreading (SMT) as a processor
                 design to achieve higher levels of system and
                 application throughput is a well-accepted and deployed
                 technique in most desktop and server processors. We
                 study the power implications of varying SMT levels
                 i.e., thread counts per core for various multi-threaded
                 applications on a real SMT multicore platform, and
                 introduce a novel software mechanism of changing SMT
                 level of a core to tune platform power. Power-shifting
                 policies by varying per core SMT levels for performance
                 benefits within a power cap are introduced. Projected
                 power savings (of 15\%) for a streaming parallel
                 benchmark can be attained using SMT-level power
                 shifting mechanisms.",
  acknowledgement = ack-nhfb,
  affiliation =  "Tembey, P (Reprint Author), Georgia Tech, Atlanta, GA
                 30332 USA. Tembey, Priyanka, Georgia Tech, Atlanta, GA
                 30332 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "279CD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Multicore platforms; Operating Systems; Power
                 shifting; SMT",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Tembey:2013:SSS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Wester:2013:PDR,
  author =       "Benjamin Wester and David Devecsery and Peter M. Chen
                 and Jason Flinn and Satish Narayanasamy",
  title =        "Parallelizing data race detection",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "4",
  pages =        "27--38",
  month =        apr,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2499368.2451120",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Jul 1 17:15:23 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Detecting data races in multithreaded programs is a
                 crucial part of debugging such programs, but
                 traditional data race detectors are too slow to use
                 routinely. This paper shows how to speed up race
                 detection by spreading the work across multiple cores.
                 Our strategy relies on uniparallelism, which executes
                  time intervals of a program (called epochs) in
                 parallel to provide scalability, but executes all
                 threads from a single epoch on a single core to
                 eliminate locking overhead. We use several techniques
                 to make parallelization effective: dividing race
                 detection into three phases, predicting a subset of the
                 analysis state, eliminating sequential work via
                 transitive reduction, and reducing the work needed to
                 maintain multiple versions of analysis via
                 factorization. We demonstrate our strategy by
                 parallelizing a happens-before detector and a
                 lockset-based detector. We find that uniparallelism can
                 significantly speed up data race detection. With 4x the
                 number of cores as the original application, our
                 strategy speeds up the median execution time by 4.4x
                 for a happens-before detector and 3.3x for a lockset
                 race detector. Even on the same number of cores as the
                 conventional detectors, the ability for uniparallelism
                 to elide analysis locks allows it to reduce the median
                 overhead by 13\% for a happens-before detector and 8\%
                 for a lockset detector.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Yu:2013:GDS,
  author =       "Hongtao Yu and Hou-Jen Ko and Zhiyuan Li",
  title =        "General data structure expansion for multi-threading",
  journal =      j-SIGPLAN,
  volume =       "48",
  number =       "6",
  pages =        "243--252",
  month =        jun,
  year =         "2013",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2499370.2462182",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Jul 1 17:15:38 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Among techniques for parallelizing sequential codes,
                 privatization is a common and significant
                 transformation performed by both compilers and runtime
                 parallelizing systems. Without privatization,
                 repetitive updates to the same data structures often
                 introduce spurious data dependencies that hide the
                 inherent parallelism. Unfortunately, it remains a
                 significant challenge to compilers to automatically
                 privatize dynamic and recursive data structures which
                 appear frequently in real applications written in
                 languages such as C/C++. This is because such languages
                 lack a naming mechanism to define the address range of
                 a pointer-based data structure, in contrast to arrays
                 with explicitly declared bounds. In this paper we
                 present a novel solution to this difficult problem by
                 expanding general data structures such that memory
                 accesses issued from different threads to contentious
                 data structures are directed to different data fields.
                 Based on compile-time type checking and a data
                 dependence graph, this aggressive extension to the
                 traditional scalar and array expansion isolates the
                 address ranges among different threads, without
                 struggling with privatization based on thread-private
                 stacks, such that the targeted loop can be effectively
                 parallelized. With this method fully implemented in
                 GCC, experiments are conducted on a set of programs
                 from well-known benchmark suites such as Mibench,
                 MediaBench II and SPECint. Results show that the new
                 approach can lead to a high speedup when executing the
                 transformed code on multiple cores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '13 conference proceedings.",
}

@Article{Zarrabi:2013:LSF,
  author =       "Amirreza Zarrabi and Khairulmizam Samsudin and Wan
                 Azizun Wan Adnan",
  title =        "{Linux} Support for Fast Transparent General Purpose
                 Checkpoint\slash Restart of Multithreaded Processes in
                 Loadable Kernel Module",
  journal =      j-J-GRID-COMP,
  volume =       "11",
  number =       "2",
  pages =        "187--210",
  month =        jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1007/s10723-013-9248-5",
  ISSN =         "1570-7873 (print), 1572-9184 (electronic)",
  ISSN-L =       "1570-7873",
  bibdate =      "Sat Jun 22 11:03:44 MDT 2013",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=1570-7873&volume=11&issue=2;
                 https://www.math.utah.edu/pub/tex/bib/jgridcomp.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer.com/article/10.1007/s10723-013-9248-5",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Grid Computing",
  journal-URL =  "http://link.springer.com/journal/10723",
}

@Article{Awile:2014:PWF,
  author =       "Omar Awile and Ivo F. Sbalzarini",
  title =        "A {Pthreads} Wrapper for {Fortran 2003}",
  journal =      j-TOMS,
  volume =       "40",
  number =       "3",
  pages =        "19:1--19:15",
  month =        apr,
  year =         "2014",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/2558889",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Mon Apr 21 17:42:14 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/toms.bib",
  abstract =     "With the advent of multicore processors, numerical and
                 mathematical software relies on parallelism in order to
                 benefit from hardware performance increases. We present
                 the design and use of a Fortran 2003 wrapper for POSIX
                 threads, called forthreads. Forthreads is complete in
                  the sense that it provides native Fortran 2003
                 interfaces to all pthreads routines where possible. We
                 demonstrate the use and efficiency of forthreads for
                 SIMD parallelism and task parallelism. We present
                 forthreads/MPI implementations that enable hybrid
                 shared-/distributed-memory parallelism in Fortran 2003.
                 Our benchmarks show that forthreads offers performance
                 comparable to that of OpenMP, but better thread control
                 and more freedom. We demonstrate the latter by
                 presenting a multithreaded Fortran 2003 library for
                 POSIX Internet sockets, enabling interactive numerical
                 simulations with runtime control.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Bartolini:2014:AFG,
  author =       "Davide B. Bartolini and Filippo Sironi and Donatella
                 Sciuto and Marco D. Santambrogio",
  title =        "Automated Fine-Grained {CPU} Provisioning for Virtual
                 Machines",
  journal =      j-TACO,
  volume =       "11",
  number =       "3",
  pages =        "27:1--27:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2637480",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 27 17:02:20 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Ideally, the pay-as-you-go model of Infrastructure as
                 a Service (IaaS) clouds should enable users to rent
                 just enough resources (e.g., CPU or memory bandwidth)
                 to fulfill their service level objectives (SLOs).
                 Achieving this goal is hard on current IaaS offers,
                 which require users to explicitly specify the amount of
                 resources to reserve; this requirement is nontrivial
                 for users, because estimating the amount of resources
                 needed to attain application-level SLOs is often
                 complex, especially when resources are virtualized and
                 the service provider colocates virtual machines (VMs)
                 on host nodes. For this reason, users who deploy VMs
                 subject to SLOs are usually prone to overprovisioning
                 resources, thus resulting in inflated business costs.
                 This article tackles this issue with AutoPro: a runtime
                 system that enhances IaaS clouds with automated and
                 fine-grained resource provisioning based on performance
                 SLOs. Our main contribution with AutoPro is filling the
                 gap between application-level performance SLOs and
                 allocation of a contended resource, without requiring
                 explicit reservations from users. In this article, we
                 focus on CPU bandwidth allocation to throughput-driven,
                 compute-intensive multithreaded applications colocated
                 on a multicore processor; we show that a theoretically
                 sound, yet simple, control strategy can enable
                 automated fine-grained allocation of this contended
                 resource, without the need for offline profiling.
                 Additionally, AutoPro helps service providers optimize
                 infrastructure utilization by provisioning idle
                 resources to best-effort workloads, so as to maximize
                 node-level utilization. Our extensive experimental
                 evaluation confirms that AutoPro is able to
                 automatically determine and enforce allocations to meet
                 performance SLOs while maximizing node-level
                 utilization by supporting batch workloads on a
                 best-effort basis.",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Bergan:2014:SEM,
  author =       "Tom Bergan and Dan Grossman and Luis Ceze",
  title =        "Symbolic execution of multithreaded programs from
                 arbitrary program contexts",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "10",
  pages =        "491--506",
  month =        oct,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2714064.2660200",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue May 12 17:41:21 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "We describe an algorithm to perform symbolic execution
                 of a multithreaded program starting from an arbitrary
                 program context. We argue that this can enable more
                 efficient symbolic exploration of deep code paths in
                 multithreaded programs by allowing the symbolic engine
                 to jump directly to program contexts of interest. The
                 key challenge is modeling the initial context with
                 reasonable precision --- an overly approximate model
                 leads to exploration of many infeasible paths during
                 symbolic execution, while a very precise model would be
                 so expensive to compute that computing it would defeat
                 the purpose of jumping directly to the initial context
                 in the first place. We propose a context-specific
                 dataflow analysis that approximates the initial context
                 cheaply, but precisely enough to avoid some common
                 causes of infeasible-path explosion. This model is
                 necessarily approximate --- it may leave portions of
                 the memory state unconstrained, leaving our symbolic
                 execution unable to answer simple questions such as
                 ``which thread holds lock A?''. For such cases, we
                 describe a novel algorithm for evaluating symbolic
                 synchronization during symbolic execution. Our symbolic
                 execution semantics are sound and complete up to the
                 limits of the underlying SMT solver. We describe
                 initial experiments on an implementation in Cloud 9.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "OOPSLA '14 conference proceedings.",
}

@Article{Bokhari:2014:MMM,
  author =       "Shahid H. Bokhari and {\"U}mit V. {\c{C}}ataly{\"u}rek
                 and Metin N. Gurcan",
  title =        "Massively multithreaded maxflow for image segmentation
                 on the {Cray XMT-2}",
  journal =      j-CCPE,
  volume =       "26",
  number =       "18",
  pages =        "2836--2855",
  day =          "25",
  month =        dec,
  year =         "2014",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3181",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Wed Feb 11 22:34:11 MST 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "5 Dec 2013",
}

@Book{Butcher:2014:SCM,
  author =       "Paul N. Butcher",
  title =        "Seven concurrency models in seven weeks: when threads
                 unravel",
  publisher =    "The Pragmatic Bookshelf",
  address =      "Dallas, TX, USA",
  pages =        "xiii + 275",
  year =         "2014",
  ISBN =         "1-937785-65-3 (paperback), 1-941222-27-7 (e-book)",
  ISBN-13 =      "978-1-937785-65-9 (paperback), 978-1-941222-27-0
                 (e-book)",
  LCCN =         "QA76.642 .B88 2014",
  bibdate =      "Thu Dec 4 13:32:20 MST 2014",
  bibsource =    "fsz3950.oclc.org:210/WorldCat;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "The Pragmatic Programmers",
  URL =          "http://proquest.safaribooksonline.com/?fpi=9781941222737",
  acknowledgement = ack-nhfb,
  subject =      "Computer multitasking; Parallel programming (Computer
                 science); Nebenl{\"a}ufigkeit; Parallelverarbeitung",
  tableofcontents = "Introduction \\
                 Threads and locks \\
                 Functional programming \\
                 The Clojure way: separating identity from state \\
                 Actors \\
                 Communicating sequential processes \\
                 Data parallelism \\
                 The Lambda Architecture \\
                 Wrapping up",
}

@Article{Cai:2014:MSD,
  author =       "Y. Cai and W. K. Chan",
  title =        "{Magiclock}: Scalable Detection of Potential Deadlocks
                 in Large-Scale Multithreaded Programs",
  journal =      j-IEEE-TRANS-SOFTW-ENG,
  volume =       "40",
  number =       "3",
  pages =        "266--281",
  month =        mar,
  year =         "2014",
  CODEN =        "IESEDJ",
  DOI =          "https://doi.org/10.1109/TSE.2014.2301725",
  ISSN =         "0098-5589 (print), 1939-3520 (electronic)",
  ISSN-L =       "0098-5589",
  bibdate =      "Thu Feb 1 19:49:24 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6718069",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Software Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32",
}

@Article{Catano:2014:CSL,
  author =       "N{\'e}stor Cata{\~n}o and Ijaz Ahmed and Radu I.
                 Siminiceanu and Jonathan Aldrich",
  title =        "A case study on the lightweight verification of a
                 multi-threaded task server",
  journal =      j-SCI-COMPUT-PROGRAM,
  volume =       "80",
  number =       "??",
  pages =        "169--187",
  day =          "1",
  month =        feb,
  year =         "2014",
  CODEN =        "SCPGD4",
  ISSN =         "0167-6423 (print), 1872-7964 (electronic)",
  ISSN-L =       "0167-6423",
  bibdate =      "Sat Nov 30 15:06:16 MST 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/scicomputprogram.bib;
                 http://www.sciencedirect.com/science/journal/01676423",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167642313000178",
  acknowledgement = ack-nhfb,
  fjournal =     "Science of Computer Programming",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01676423",
}

@Article{Che:2014:ALM,
  author =       "Hao Che and Minh Nguyen",
  title =        "{Amdahl's Law} for multithreaded multicore
                 processors",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "74",
  number =       "10",
  pages =        "3056--3069",
  month =        oct,
  year =         "2014",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Aug 21 16:26:06 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0743731514001142",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315/",
}

@Article{David:2014:CMC,
  author =       "Florian David and Ga{\"e}l Thomas and Julia Lawall and
                 Gilles Muller",
  title =        "Continuously measuring critical section pressure with
                 the free-lunch profiler",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "10",
  pages =        "291--307",
  month =        oct,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2714064.2660210",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue May 12 17:41:21 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Today, Java is regularly used to implement large
                 multi-threaded server-class applications that use locks
                 to protect access to shared data. However,
                 understanding the impact of locks on the performance of
                 a system is complex, and thus the use of locks can
                 impede the progress of threads on configurations that
                 were not anticipated by the developer, during specific
                 phases of the execution. In this paper, we propose Free
                 Lunch, a new lock profiler for Java application
                 servers, specifically designed to identify, in-vivo,
                 phases where the progress of the threads is impeded by
                 a lock. Free Lunch is designed around a new metric,
                 critical section pressure (CSP), which directly
                 correlates the progress of the threads to each of the
                 locks. Using Free Lunch, we have identified phases of
                 high CSP, which were hidden with other lock profilers,
                 in the distributed Cassandra NoSQL database and in
                 several applications from the DaCapo 9.12, the
                 SPECjvm2008 and the SPECjbb2005 benchmark suites. Our
                 evaluation of Free Lunch shows that its overhead is
                 never greater than 6\%, making it suitable for in-vivo
                 use.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "OOPSLA '14 conference proceedings.",
}

@Article{Esparza:2014:PBV,
  author =       "Javier Esparza and Pierre Ganty and Tom{\'a}{\v{s}} Poch",
  title =        "Pattern-Based Verification for Multithreaded
                 Programs",
  journal =      j-TOPLAS,
  volume =       "36",
  number =       "3",
  pages =        "9:1--9:??",
  month =        sep,
  year =         "2014",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/2629644",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Tue Oct 28 17:06:29 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toplas.bib",
  abstract =     "Pattern-based verification checks the correctness of
                 program executions that follow a given pattern, a
                 regular expression over the alphabet of program
                 transitions of the form $ w_1^*, \ldots {}, w_n^* $.
                 For multithreaded programs, the alphabet of the pattern
                 is given by the reads and writes to the shared storage.
                 We study the complexity of pattern-based verification
                 for multithreaded programs with shared counters and
                 finite variables. While unrestricted verification is
                 undecidable for abstracted multithreaded programs with
                 recursive procedures and PSPACE-complete for abstracted
                 multithreaded while-programs (even without counters),
                 we show that pattern-based verification is NP-complete
                 for both classes, even in the presence of counters. We
                 then conduct a multiparameter analysis to study the
                 complexity of the problem on its three natural
                 parameters (number of threads+counters+variables,
                 maximal size of a thread, size of the pattern) and on
                 two parameters related to thread structure (maximal
                 number of procedures per thread and longest simple path
                 of procedure calls). We present an algorithm that for a
                 fixed number of threads, counters, variables, and
                 pattern size solves the verification problem in $ {\rm
                 st}^{O ({\rm lsp} + \lceil \log ({\rm pr} + 1) \rceil)}
                 $ time, where $ {\rm st} $ is the maximal size of a
                 thread, $ {\rm pr} $ is the maximal number of
                 procedures per thread, and $ {\rm lsp} $ is the longest
                 simple path of procedure calls.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

@Article{Eyerman:2014:RCW,
  author =       "Stijn Eyerman and Lieven Eeckhout",
  title =        "Restating the Case for Weighted-{IPC} Metrics to
                 Evaluate Multiprogram Workload Performance",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "93--96",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Weighted speedup is nowadays the most commonly used
                 multiprogram workload performance metric. Weighted
                 speedup is a weighted-IPC metric, i.e., the
                 multiprogram IPC of each program is first weighted with
                 its isolated IPC. Recently, Michaud questions the
                 validity of weighted-IPC metrics by arguing that they
                 are inconsistent and that weighted speedup favors
                 unfairness [4]. Instead, he advocates using the
                 arithmetic or harmonic mean of the raw IPC values of
                 the programs in the multiprogram workload. We show that
                 weighted-IPC metrics are not inconsistent, and that
                 weighted speedup is fair in giving equal importance to
                 each program. We argue that, in contrast to raw-IPC
                 metrics, weighted-IPC metrics have a system-level
                 meaning, and that raw-IPC metrics are affected by the
                 inherent behavior of the programs. We also show that
                 the choice of a metric may adversely affect the
                 conclusions from an experiment. We suggest to use two
                 weighted-IPC metrics --- system throughput (STP) and
                 average normalized turnaround time (ANTT) --- for
                 evaluating multiprogram workload performance, and to
                 avoid raw-IPC metrics.",
  acknowledgement = ack-nhfb,
  affiliation =  "Eyerman, S (Reprint Author), Univ Ghent, B-9000 Ghent,
                 Belgium. Eyerman, Stijn; Eeckhout, Lieven, Univ Ghent,
                 B-9000 Ghent, Belgium.",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Research Foundation --- Flanders (FWO);
                 European Research Council under the European Community
                 [259295]",
  funding-text = "Stijn Eyerman is supported through a postdoctoral
                 fellowship by the Research Foundation --- Flanders
                 (FWO). Additional support is provided by the European
                 Research Council under the European Community's Seventh
                 Framework Programme (FP7/2007-2013) / ERC Grant
                 agreement no. 259295.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ANTT; average normalized turnaround time; Benchmark
                 testing; C Computer Systems Organization; C.1 Processor
                 Architectures; C.1.3 Other Architecture Styles; C.1.3.h
                 Multithreaded processors; C.1.4 Parallel Architectures;
                 C.1.4.e Multi-core/single-chip multiprocessors; C.4
                 Performance of Systems; C.4.c Measurement techniques;
                 Degradation; Harmonic analysis; harmonic mean;
                 Multicore processing; multiprocessing systems;
                 multiprogram IPC; multiprogram workload performance
                 metric; multiprogramming; raw-IPC metrics; STP; system
                 throughput; system-level meaning; Throughput; Weight
                 measurement; weighted speedup; weighted-IPC metric",
  number-of-cited-references = "6",
  research-areas = "Computer Science",
  times-cited =  "9",
  unique-id =    "Eyerman:2014:RCW",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Fabregat-Traver:2014:SSG,
  author =       "Diego Fabregat-Traver and Yurii S. Aulchenko and Paolo
                 Bientinesi",
  title =        "Solving sequences of generalized least-squares
                 problems on multi-threaded architectures",
  journal =      j-APPL-MATH-COMP,
  volume =       "234",
  number =       "??",
  pages =        "606--617",
  day =          "15",
  month =        may,
  year =         "2014",
  CODEN =        "AMHCBQ",
  ISSN =         "0096-3003 (print), 1873-5649 (electronic)",
  ISSN-L =       "0096-3003",
  bibdate =      "Mon Apr 21 18:04:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/applmathcomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0096300314002951",
  acknowledgement = ack-nhfb,
  fjournal =     "Applied Mathematics and Computation",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00963003/",
}

@Article{Frincu:2014:ESV,
  author =       "Marc E. Frincu and St{\'e}phane Genaud and Julien
                 Gossa",
  title =        "On the efficiency of several {VM} provisioning
                 strategies for workflows with multi-threaded tasks on
                 clouds",
  journal =      j-COMPUTING,
  volume =       "96",
  number =       "11",
  pages =        "1059--1086",
  month =        nov,
  year =         "2014",
  CODEN =        "CMPTA2",
  DOI =          "https://doi.org/10.1007/s00607-014-0410-0",
  ISSN =         "0010-485X (print), 1436-5057 (electronic)",
  ISSN-L =       "0010-485X",
  bibdate =      "Wed Feb 11 07:42:25 MST 2015",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0010-485X&volume=96&issue=11;
                 https://www.math.utah.edu/pub/tex/bib/computing.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  URL =          "http://link.springer.com/article/10.1007/s00607-014-0410-0",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing",
  journal-URL =  "http://link.springer.com/journal/607",
}

@Article{Gerakios:2014:SSG,
  author =       "Prodromos Gerakios and Nikolaos Papaspyrou and
                 Konstantinos Sagonas",
  title =        "Static safety guarantees for a low-level multithreaded
                 language with regions",
  journal =      j-SCI-COMPUT-PROGRAM,
  volume =       "80",
  number =       "??",
  pages =        "223--263",
  day =          "1",
  month =        feb,
  year =         "2014",
  CODEN =        "SCPGD4",
  ISSN =         "0167-6423 (print), 1872-7964 (electronic)",
  ISSN-L =       "0167-6423",
  bibdate =      "Sat Nov 30 15:06:20 MST 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/scicomputprogram.bib;
                 http://www.sciencedirect.com/science/journal/01676423",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167642313001433",
  acknowledgement = ack-nhfb,
  fjournal =     "Science of Computer Programming",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01676423",
}

@Article{Giceva:2014:DQP,
  author =       "Jana Giceva and Gustavo Alonso and Timothy Roscoe and
                 Tim Harris",
  title =        "Deployment of query plans on multicores",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "3",
  pages =        "233--244",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Feb 9 18:24:34 MST 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Efficient resource scheduling of multithreaded
                 software on multicore hardware is difficult given the
                 many parameters involved and the hardware heterogeneity
                 of existing systems. In this paper we explore the
                 efficient deployment of query plans over a multicore
                 machine. We focus on shared query systems, and
                 implement the proposed ideas using SharedDB. The goal
                 of the paper is to explore how to deliver maximum
                 performance and predictability, while minimizing
                 resource utilization when deploying query plans on
                 multicore machines. We propose to use resource activity
                 vectors to characterize the behavior of individual
                 database operators. We then present a novel deployment
                 algorithm which uses these vectors together with
                 dataflow information from the query plan to optimally
                 assign relational operators to physical cores.
                 Experiments demonstrate that this approach
                 significantly reduces resource requirements while
                 preserving performance and is robust across different
                 server architectures.",
  acknowledgement = ack-nhfb,
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Gonzalez-Mesa:2014:ETM,
  author =       "M. A. Gonzalez-Mesa and Eladio Gutierrez and Emilio L.
                 Zapata and Oscar Plata",
  title =        "Effective Transactional Memory Execution Management
                 for Improved Concurrency",
  journal =      j-TACO,
  volume =       "11",
  number =       "3",
  pages =        "24:1--24:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2633048",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 27 17:02:20 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article describes a transactional memory
                 execution model intended to exploit maximum parallelism
                 from sequential and multithreaded programs. A program
                 code section is partitioned into chunks that will be
                 mapped onto threads and executed transactionally. These
                 transactions run concurrently and out of order, trying
                 to exploit maximum parallelism but managed by a
                 specific fully distributed commit control to meet data
                 dependencies. To accomplish correct parallel execution,
                 a partial precedence order relation is derived from the
                 program code section and/or defined by the programmer.
                 When a conflict between chunks is eagerly detected, the
                 precedence order relation is used to determine the best
                 policy to solve the conflict that preserves the
                 precedence order while maximizing concurrency. The
                 model defines a new transactional state called executed
                 but not committed. This state allows exploiting
                 concurrency on two levels: intrathread and interthread.
                 Intrathread concurrency is improved by having pending
                 uncommitted transactions while executing a new one in
                 the same thread. The new state improves interthread
                 concurrency because it permits out-of-order transaction
                 commits regarding the precedence order. Our model has
                 been implemented in a lightweight software
                 transactional memory system, TinySTM, and has been
                 evaluated on a set of benchmarks obtaining an important
                 performance improvement over the baseline TM system.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Guzzi:2014:CPP,
  author =       "P. H. Guzzi and G. Agapito and M. Cannataro",
  title =        "{coreSNP}: Parallel Processing of Microarray Data",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "63",
  number =       "12",
  pages =        "2961--2974",
  month =        dec,
  year =         "2014",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2013.176",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Thu Dec 4 10:36:57 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
  keywords =     "Affymetrix; bioinformatics; Bioinformatics;
                 Bioinformatics (genome or protein) databases; coreSNP
                 parallel software tool; distributed programming;
                 distributed systems; DMET SNP microarray data; DNA;
                 Drug Metabolism Enzymes and Transporters; drug
                 response; drug therapy improvement; drug toxicity;
                 Drugs; drugs; enzymes; experimental data analysis;
                 experimental data preprocessing; experimental data
                 storage; gene expression; genetic variation; genetics;
                 Genomics; genomics; genomics diffusion; graphical user
                 interface; graphical user interfaces; health care;
                 healthcare; high-throughput technologies; information
                 retrieval; lab-on-a-chip; maximum drug efficacy;
                 medical information systems; microarray data; minimal
                 adverse effects; multi-threading; next generation
                 sequencing; parallel processing; Parallel processing;
                 patient genotype; performance evaluation;
                 pharmacogenomics analysis pipeline; response times;
                 scalable multithreaded implementation;
                 single-nucleotide polymorphisms; SNP annotation;
                 statistical analysis; Statistical analysis; statistical
                 software; Throughput",
}

@Article{Hayden:2014:KEG,
  author =       "Christopher M. Hayden and Karla Saur and Edward K.
                 Smith and Michael Hicks and Jeffrey S. Foster",
  title =        "{Kitsune}: Efficient, General-Purpose Dynamic Software
                 Updating for {C}",
  journal =      j-TOPLAS,
  volume =       "36",
  number =       "4",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2014",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/2629460",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Tue Oct 28 17:05:40 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toplas.bib",
  abstract =     "Dynamic software updating (DSU) systems facilitate
                 software updates to running programs, thereby
                 permitting developers to add features and fix bugs
                 without downtime. This article introduces Kitsune, a
                 DSU system for C. Kitsune's design has three notable
                 features. First, Kitsune updates the whole program,
                 rather than individual functions, using a mechanism
                 that places no restrictions on data representations or
                 allowed compiler optimizations. Second, Kitsune makes
                 the important aspects of updating explicit in the
                 program text, making the program's semantics easy to
                 understand while minimizing programmer effort. Finally,
                 the programmer can write simple specifications to
                 direct Kitsune to generate code that traverses and
                 transforms old-version state for use by new code; such
                 state transformation is often necessary and is
                 significantly more difficult in prior DSU systems. We
                 have used Kitsune to update six popular, open-source,
                 single- and multithreaded programs and find that few
                 program changes are required to use Kitsune, that it
                 incurs essentially no performance overhead, and that
                 update times are fast.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

@Article{Honarmand:2014:RRR,
  author =       "Nima Honarmand and Josep Torrellas",
  title =        "{RelaxReplay}: record and replay for
                 relaxed-consistency multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "223--238",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541979",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 18 17:12:47 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Record and Deterministic Replay (RnR) of multithreaded
                 programs on relaxed-consistency multiprocessors has
                 been a long-standing problem. While there are designs
                 that work for Total Store Ordering (TSO), finding a
                 general solution that is able to record the access
                 reordering allowed by any relaxed-consistency model has
                 proved challenging. This paper presents the first
                 complete solution for hard-ware-assisted memory race
                 recording that works for any relaxed-consistency model
                 of current processors. With the scheme, called
                 RelaxReplay, we can build an RnR system for any
                 relaxed-consistency model and coherence protocol.
                 RelaxReplay's core innovation is a new way of capturing
                 memory access reordering. Each memory instruction goes
                 through a post-completion in-order counting step that
                 detects any reordering, and efficiently records it. We
                 evaluate RelaxReplay with simulations of an 8-core
                 release-consistent multicore running SPLASH-2 programs.
                 We observe that RelaxReplay induces negligible overhead
                 during recording. In addition, the average size of the
                 log produced is comparable to the log sizes reported
                 for existing solutions, and still very small compared
                 to the memory bandwidth of modern machines. Finally,
                 deterministic replay is efficient and needs minimal
                 hardware support.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Kaiser:2014:WAM,
  author =       "Alexander Kaiser and Daniel Kroening and Thomas Wahl",
  title =        "A Widening Approach to Multithreaded Program
                 Verification",
  journal =      j-TOPLAS,
  volume =       "36",
  number =       "4",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2014",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/2629608",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Tue Oct 28 17:05:40 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toplas.bib",
  abstract =     "Pthread-style multithreaded programs feature rich
                 thread communication mechanisms, such as shared
                 variables, signals, and broadcasts. In this article, we
                 consider the automated verification of such programs
                 where an unknown number of threads execute a given
                 finite-data procedure in parallel. Such procedures are
                 typically obtained as predicate abstractions of
                 recursion-free source code written in C or Java. Many
                 safety problems over finite-data replicated
                 multithreaded programs are decidable via a reduction to
                 the coverability problem in certain types of
                 well-ordered infinite-state transition systems. On the
                 other hand, in full generality, this problem is
                 Ackermann-hard, which seems to rule out efficient
                 algorithmic treatment. We present a novel, sound, and
                 complete yet empirically efficient solution. Our
                 approach is to judiciously widen the original set of
                 coverability targets by configurations that involve
                 fewer threads and are thus easier to decide, and whose
                 exploration may well be sufficient: if they turn out
                 uncoverable, so are the original targets. To soften the
                 impact of ``bad guesses'' --- configurations that turn
                 out coverable --- the exploration is accompanied by a parallel
                 engine that generates coverable configurations; none of
                 these is ever selected for widening. Its job being
                 merely to prevent bad widening choices, such an engine
                 need not be complete for coverability analysis, which
                 enables a range of existing partial (e.g.,
                 nonterminating) techniques. We present extensive
                 experiments on multithreaded C programs, including
                 device driver code from FreeBSD, Solaris, and Linux
                 distributions. Our approach outperforms existing
                 coverability methods by orders of magnitude.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

@Article{Kim:2014:SMC,
  author =       "S. Kim",
  title =        "Synthesizing Multithreaded Code from Real-Time
                 Object-Oriented Models via Schedulability-Aware Thread
                 Derivation",
  journal =      j-IEEE-TRANS-SOFTW-ENG,
  volume =       "40",
  number =       "4",
  pages =        "413--426",
  month =        apr,
  year =         "2014",
  CODEN =        "IESEDJ",
  DOI =          "https://doi.org/10.1109/TSE.2013.47",
  ISSN =         "0098-5589 (print), 1939-3520 (electronic)",
  ISSN-L =       "0098-5589",
  bibdate =      "Thu Feb 1 19:49:24 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6617637",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Software Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32",
}

@InProceedings{Knopp:2014:EMT,
  author =       "T. Knopp",
  booktitle =    "{2014 First Workshop for High Performance Technical
                 Computing in Dynamic Languages}",
  title =        "Experimental Multi-threading Support for the {Julia}
                 Programming Language",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "1--5",
  year =         "2014",
  DOI =          "https://doi.org/10.1109/HPTCDL.2014.11",
  bibdate =      "Thu Apr 8 07:17:08 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/julia.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Julia programming language",
}

@Article{Kvatinsky:2014:MBM,
  author =       "Shahar Kvatinsky and Yuval H. Nacson and Yoav Etsion
                 and Eby G. Friedman and Avinoam Kolodny and Uri C.
                 Weiser",
  title =        "Memristor-Based Multithreading",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "41--44",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Switch on Event Multithreading (SoE MT, also known as
                 coarse-grained MT and block MT) processors run multiple
                 threads on a pipeline machine, while the pipeline
                 switches threads on stall events (e.g., cache miss).
                 The thread switch penalty is determined by the number
                 of stages in the pipeline that are flushed of in-flight
                 instructions. In this paper, Continuous Flow
                 Multithreading (CFMT), a new architecture of SoE MT, is
                 introduced. In CFMT, a multistate pipeline register
                 (MPR) holds the microarchitectural state of multiple
                 different threads within the execution pipeline stages,
                 where only one thread is active at a time. The MPRs
                 eliminate the need to flush in-flight instructions and
                 therefore significantly improve performance. In recent
                 years, novel memory technologies such as Resistive RAM
                 (RRAM) and Spin Torque Transfer Magnetoresistive RAM
                 (STT-MRAM), have been developed. All of these
                 technologies are nonvolatile, store data as resistance,
                 and can be described as ``memristors''. Memristors
                 are power efficient, dense, and fast as compared to
                 standard memory technologies such as SRAM, DRAM, and
                 Flash. Memristors therefore provide the opportunity to
                 place the MPRs physically within the pipeline stages. A
                 performance analysis of CFMT is compared to
                 conventional SoE MT processors, demonstrating up to a
                 2X performance improvement, while the operational
                 mechanism, due to the use of memristors, is low power
                 and low complexity as compared to conventional SoE MT
                 processors.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kvatinsky, S (Reprint Author), Technion Israel Inst
                 Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
                 Kvatinsky, Shahar; Etsion, Yoav; Kolodny, Avinoam;
                 Weiser, Uri C., Technion Israel Inst Technol, Dept
                 Elect Engn, IL-32000 Haifa, Israel. Etsion, Yoav,
                 Technion Israel Inst Technol, Dept Comp Sci, IL-32000
                 Haifa, Israel. Friedman, Eby G., Univ Rochester, Dept
                 Elect \& Comp Engn, Rochester, NY 14627 USA.",
  author-email = "skva@tx.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Hasso Plattner Institute",
  funding-text = "This work was supported by the Hasso Plattner
                 Institute. The authors thank Ravi Patel for his
                 comments and area overhead estimation and to Nimrod
                 Wald and Guy Satat for their help in evaluating the
                 architecture.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "memristor; multithreaded processors; phase change
                 memory; RRAM; STT-MRAM",
  keywords-plus = "RESISTIVE SWITCHING MEMORIES",
  number-of-cited-references = "21",
  research-areas = "Computer Science",
  times-cited =  "10",
  unique-id =    "Kvatinsky:2014:MBM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Li:2014:PDC,
  author =       "Yong Li and R. Melhem and A. K. Jones",
  title =        "A Practical Data Classification Framework for Scalable
                 and High Performance Chip-Multiprocessors",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "63",
  number =       "12",
  pages =        "2905--2918",
  month =        dec,
  year =         "2014",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2013.161",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Thu Dec 4 10:36:57 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
  keywords =     "application-specific characteristics; Benchmark
                 testing; cache coherence; cache coherence design; cache
                 storage; chip multiprocessor; Coherence; coherence
                 directory; coherence overhead mitigation; coherence
                 traffic; compiler-assisted mechanism; compilers; data
                 access behavior; data access latency mitigation; data
                 classification; data classification scheme; Dynamic
                 scheduling; Instruction sets; interconnect; many-core
                 architectures; microarchitectural constructs;
                 multi-threaded parallel; NUCA-based caching; OpenMP;
                 Optimization; parallel applications; parallel
                 architectures; pattern classification; performance
                 evaluation; performance improvement; pipelined
                 parallel; Practically private; practically private;
                 program compilers; Resource management; Runtime;
                 scalable high-performance parallel systems; TLB;
                 ubiquitous computing",
}

@Article{Liu:2014:PPF,
  author =       "Tongping Liu and Chen Tian and Ziang Hu and Emery D.
                 Berger",
  title =        "{PREDATOR}: predictive false sharing detection",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "8",
  pages =        "3--14",
  month =        aug,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2692916.2555244",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Nov 26 16:26:30 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "False sharing is a notorious problem for multithreaded
                 applications that can drastically degrade both
                 performance and scalability. Existing approaches can
                 precisely identify the sources of false sharing, but
                 only report false sharing actually observed during
                 execution; they do not generalize across executions.
                 Because false sharing is extremely sensitive to object
                 layout, these detectors can easily miss false sharing
                 problems that can arise due to slight differences in
                 memory allocation order or object placement decisions
                 by the compiler. In addition, they cannot predict the
                 impact of false sharing on hardware with different
                 cache line sizes. This paper presents PREDATOR, a
                 predictive software-based false sharing detector.
                 PREDATOR generalizes from a single execution to
                 precisely predict false sharing that is latent in the
                 current execution. PREDATOR tracks accesses within a
                 range that could lead to false sharing given different
                 object placement. It also tracks accesses within
                 virtual cache lines, contiguous memory ranges that span
                 actual hardware cache lines, to predict sharing on
                 hardware platforms with larger cache line sizes. For
                 each, it reports the exact program location of
                 predicted false sharing problems, ranked by their
                 projected impact on performance. We evaluate PREDATOR
                 across a range of benchmarks and actual applications.
                 PREDATOR identifies problems undetectable with previous
                 tools, including two previously-unknown false sharing
                 problems, with no false positives. PREDATOR is able to
                 immediately locate false sharing problems in MySQL and
                 the Boost library that had eluded detection for
                 years.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '14 conference proceedings.",
}

@Article{Liu:2014:TAP,
  author =       "Xu Liu and John Mellor-Crummey",
  title =        "A tool to analyze the performance of multithreaded
                 programs on {NUMA} architectures",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "8",
  pages =        "259--272",
  month =        aug,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2692916.2555271",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Nov 26 16:26:30 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Almost all of today's microprocessors contain memory
                 controllers and directly attach to memory. Modern
                 multiprocessor systems support non-uniform memory
                 access (NUMA): it is faster for a microprocessor to
                 access memory that is directly attached than it is to
                 access memory attached to another processor. Without
                 careful distribution of computation and data, a
                 multithreaded program running on such a system may have
                 high average memory access latency. To use
                 multiprocessor systems efficiently, programmers need
                 performance tools to guide the design of NUMA-aware
                 codes. To address this need, we enhanced the HPCToolkit
                 performance tools to support measurement and analysis
                 of performance problems on multiprocessor systems with
                 multiple NUMA domains. With these extensions,
                 HPCToolkit helps pinpoint, quantify, and analyze NUMA
                 bottlenecks in executions of multithreaded programs. It
                 computes derived metrics to assess the severity of
                 bottlenecks, analyzes memory accesses, and provides a
                 wealth of information to guide NUMA optimization,
                 including information about how to distribute data to
                 reduce access latency and minimize contention. This
                 paper describes the design and implementation of our
                 extensions to HPCToolkit. We demonstrate their utility
                 by describing case studies in which we use these
                 capabilities to diagnose NUMA bottlenecks in four
                 multithreaded applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '14 conference proceedings.",
}

@Article{Liu:2014:TPA,
  author =       "Bin Liu and Yinliang Zhao and Yuxiang Li and Yanjun
                 Sun and Boqin Feng",
  title =        "A thread partitioning approach for speculative
                 multithreading",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "67",
  number =       "3",
  pages =        "778--805",
  month =        mar,
  year =         "2014",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-013-1000-1",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Mar 8 14:59:14 MST 2014",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=67&issue=3;
                 https://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer.com/article/10.1007/s11227-013-1000-1",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Lu:2014:EDM,
  author =       "Kai Lu and Xu Zhou and Tom Bergan and Xiaoping Wang",
  title =        "Efficient deterministic multithreading without global
                 barriers",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "8",
  pages =        "287--300",
  month =        aug,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2692916.2555252",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Nov 26 16:26:30 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Multithreaded programs execute nondeterministically on
                 conventional architectures and operating systems. This
                 complicates many tasks, including debugging and
                 testing. Deterministic multithreading (DMT) makes the
                 output of a multithreaded program depend on its inputs
                 only, which can totally solve the above problem.
                 However, current DMT implementations suffer from a
                 common inefficiency: they use frequent global barriers
                 to enforce a deterministic ordering on memory accesses.
                 In this paper, we eliminate that inefficiency using an
                 execution model we call deterministic lazy release
                 consistency (DLRC). Our execution model uses the Kendo
                 algorithm to enforce a deterministic ordering on
                 synchronization, and it uses a deterministic version of
                 the lazy release consistency memory model to propagate
                 memory updates across threads. Our approach guarantees
                 that programs execute deterministically even when they
                 contain data races. We implemented a DMT system based
                 on these ideas (RFDet) and evaluated it using 16
                 parallel applications. Our implementation targets C/C++
                 programs that use POSIX threads. Results show that
                 RFDet gains nearly 2x speedup compared with DThreads, a
                 state-of-the-art DMT system.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '14 conference proceedings.",
}

@Article{Maiya:2014:RDA,
  author =       "Pallavi Maiya and Aditya Kanade and Rupak Majumdar",
  title =        "Race detection for {Android} applications",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "6",
  pages =        "316--325",
  month =        jun,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2666356.2594311",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Sep 26 07:38:28 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Programming environments for smartphones expose a
                 concurrency model that combines multi-threading and
                 asynchronous event-based dispatch. While this enables
                 the development of efficient and feature-rich
                 applications, unforeseen thread interleavings coupled
                 with non-deterministic reorderings of asynchronous
                 tasks can lead to subtle concurrency errors in the
                 applications. In this paper, we formalize the
                 concurrency semantics of the Android programming model.
                 We further define the happens-before relation for
                 Android applications, and develop a dynamic race
                 detection technique based on this relation. Our
                 relation generalizes the so far independently studied
                 happens-before relations for multi-threaded programs
                 and single-threaded event-driven programs.
                 Additionally, our race detection technique uses a model
                 of the Android runtime environment to reduce false
                 positives. We have implemented a tool called
                 DroidRacer. It generates execution traces by
                 systematically testing Android applications and detects
                 data races by computing the happens-before relation on
                 the traces. We analyzed 15 Android applications
                 including popular applications such as Facebook,
                 Twitter and K-9 Mail. Our results indicate that data
                 races are prevalent in Android applications, and that
                 DroidRacer is an effective tool to identify data
                 races.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '14 conference proceedings.",
}

@Article{Martinsen:2014:HTL,
  author =       "Jan Kasper Martinsen and Hakan Grahn and Anders
                 Isberg",
  title =        "Heuristics for Thread-Level Speculation in {Web}
                 Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "77--80",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.26",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "JavaScript is a sequential programming language, and
                 Thread-Level Speculation has been proposed to
                 dynamically extract parallelism in order to take
                 advantage of parallel hardware. In previous work, we
                 have showed significant speed-ups with a simple on/off
                 speculation heuristic. In this paper, we propose and
                 evaluate three heuristics for dynamically adapt the
                 speculation: a 2-bit heuristic, an exponential
                 heuristic, and a combination of these two. Our results
                 show that the combined heuristic is able to both
                 increase the number of successful speculations and
                 decrease the execution time for 15 popular web
                 applications.",
  acknowledgement = ack-nhfb,
  affiliation =  "Martinsen, JK (Reprint Author), Blekinge Inst Technol,
                 Sch Comp, SE-37179 Karlskrona, Sweden. Martinsen, Jan
                 Kasper; Grahn, Hakan, Blekinge Inst Technol, Sch Comp,
                 SE-37179 Karlskrona, Sweden. Isberg, Anders, Sony
                 Mobile Commun AB, SE-22188 Lund, Sweden.",
  author-email = "Jan.Kasper.Martinsen@bth.se Hakan.Grahn@bth.se
                 Anders.Isberg@sonymobile.com",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Industrial Excellence Center EASE -
                 Embedded Applications Software Engineering; BESQ+
                 research project --- Knowledge Foundation in Sweden
                 [20100311]",
  funding-text = "This work was partly funded by the Industrial
                 Excellence Center EASE --- Embedded Applications
                 Software Engineering, (http://ease.cs.lth.se), and the
                 BESQ+ research project funded by the Knowledge
                 Foundation (grant number 20100311) in Sweden.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "2-bit heuristic; Automatic Parallelization; Benchmark
                 testing; C.1.4 Parallel Architectures; C.1.4.f
                 Speculative multi-threading; exponential heuristic;
                 Instruction sets; Internet; Java; JavaScript; Multicore
                 processors; Multithreading; Parallel Computing;
                 parallel hardware; Parallel processing; parallel
                 programming; sequential programming language; Social
                 network services; thread-level speculation; Web
                 applications",
  number-of-cited-references = "12",
  oa =           "Green Published",
  ORCID-numbers = "Martinsen, Jan Kasper/0000-0001-8915-3633 Grahn,
                 Hakan/0000-0001-9947-1088",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Martinsen:2014:HTL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Morishima:2014:PEG,
  author =       "Shin Morishima and Hiroki Matsutani",
  title =        "Performance Evaluations of Graph Database using {CUDA}
                 and {OpenMP} Compatible Libraries",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "75--80",
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693728",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Graph databases use graph structures to store data
                 sets as nodes, edges, and properties. They are used to
                 store and search the relationships between a large
                 number of nodes, such as social networking services and
                 recommendation engines that use customer social graphs.
                 Since computation cost for graph search queries
                 increases as the graph becomes large, in this paper we
                 accelerate the graph search functions (Dijkstra and A*
                 algorithms) of a graph database Neo4j using two ways:
                 multithreaded library and CUDA library for graphics
                 processing units (GPUs). We use 100,000-node graphs
                 generated based on a degree distribution of Facebook
                 social graph for evaluations. Our multi-threaded and
                 GPU-based implementations require an auxiliary
                 adjacency matrix for a target graph. The results show
                 that, when we do not take into account additional
                 overhead to generate the auxiliary adjacency matrix,
                 multi-threaded version improves the Dijkstra and A*
                 search performance by 16.2x and 13.8x compared to the
                 original implementation. The GPU-based implementation
                 improves the Dijkstra and A* search performance by
                 26.2x and 32.8x. When we take into account the
                 overhead, although the speed-ups by our implementations
                 are reduced, by reusing the auxiliary adjacency matrix
                 for multiple graph search queries we can significantly
                 improve the graph search performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "HEART '14 conference proceedings.",
}

@Article{Mushtaq:2014:EHP,
  author =       "Hamid Mushtaq and Zaid Al-Ars and Koen Bertels",
  title =        "Efficient and highly portable deterministic
                 multithreading {(DetLock)}",
  journal =      j-COMPUTING,
  volume =       "96",
  number =       "12",
  pages =        "1131--1147",
  month =        dec,
  year =         "2014",
  CODEN =        "CMPTA2",
  DOI =          "https://doi.org/10.1007/s00607-013-0370-9",
  ISSN =         "0010-485X (print), 1436-5057 (electronic)",
  ISSN-L =       "0010-485X",
  bibdate =      "Wed Feb 11 07:42:26 MST 2015",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0010-485X&volume=96&issue=12;
                 https://www.math.utah.edu/pub/tex/bib/computing.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer.com/article/10.1007/s00607-013-0370-9",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing",
  journal-URL =  "http://link.springer.com/journal/607",
}

@Article{Ngo:2014:EVC,
  author =       "Tri Minh Ngo and Mari{\"e}lle Stoelinga and Marieke
                 Huisman",
  title =        "Effective verification of confidentiality for
                 multi-threaded programs",
  journal =      j-J-COMP-SECUR,
  volume =       "22",
  number =       "2",
  pages =        "269--300",
  month =        "????",
  year =         "2014",
  CODEN =        "JCSIET",
  DOI =          "https://doi.org/10.3233/JCS-130492",
  ISSN =         "0926-227X (print), 1875-8924 (electronic)",
  ISSN-L =       "0926-227X",
  bibdate =      "Tue May 24 06:26:12 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jcompsecur.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computer Security",
  journal-URL =  "http://content.iospress.com/journals/journal-of-computer-security",
}

@Article{Niewiadomski:2014:SVG,
  author =       "Artur Niewiadomski and Jaroslaw Skaruz and Wojciech
                 Penczek and Maciej Szreter and Mariusz Jarocki",
  title =        "{SMT} Versus Genetic and {OpenOpt} Algorithms:
                 Concrete Planning in the {PlanICS} Framework",
  journal =      j-FUND-INFO,
  volume =       "135",
  number =       "4",
  pages =        "451--466",
  month =        oct,
  year =         "2014",
  CODEN =        "FUMAAJ",
  DOI =          "https://doi.org/10.3233/FI-2014-1134",
  ISSN =         "0169-2968 (print), 1875-8681 (electronic)",
  ISSN-L =       "0169-2968",
  bibdate =      "Sat Mar 5 17:20:06 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fundinfo2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Fundamenta Informaticae",
  journal-URL =  "http://content.iospress.com/journals/fundamenta-informaticae",
}

@Article{Niu:2014:MCF,
  author =       "Ben Niu and Gang Tan",
  title =        "Modular control-flow integrity",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "6",
  pages =        "577--587",
  month =        jun,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2666356.2594295",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Sep 26 07:38:28 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Control-Flow Integrity (CFI) is a software-hardening
                 technique. It inlines checks into a program so that its
                 execution always follows a predetermined Control-Flow
                 Graph (CFG). As a result, CFI is effective at
                 preventing control-flow hijacking attacks. However,
                 past fine-grained CFI implementations do not support
                 separate compilation, which hinders its adoption. We
                 present Modular Control-Flow Integrity (MCFI), a new
                 CFI technique that supports separate compilation. MCFI
                 allows modules to be independently instrumented and
                 linked statically or dynamically. The combined module
                 enforces a CFG that is a combination of the individual
                 modules' CFGs. One challenge in supporting dynamic
                 linking in multithreaded code is how to ensure a safe
                 transition from the old CFG to the new CFG when
                 libraries are dynamically linked. The key technique we
                 use is to have the CFG represented in a runtime data
                 structure and have reads and updates of the data
                 structure wrapped in transactions to ensure thread
                 safety. Our evaluation on SPECCPU2006 benchmarks shows
                 that MCFI supports separate compilation, incurs low
                 overhead of around 5\%, and enhances security.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '14 conference proceedings.",
}

@Article{Odaira:2014:EGI,
  author =       "Rei Odaira and Jose G. Castanos and Hisanobu Tomari",
  title =        "Eliminating global interpreter locks in {Ruby} through
                 hardware transactional memory",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "8",
  pages =        "131--142",
  month =        aug,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2692916.2555247",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Nov 26 16:26:30 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Many scripting languages use a Global Interpreter Lock
                 (GIL) to simplify the internal designs of their
                 interpreters, but this kind of lock severely lowers the
                 multi-thread performance on multi-core machines. This
                 paper presents our first results eliminating the GIL in
                 Ruby using Hardware Transactional Memory (HTM) in the
                 IBM zEnterprise EC12 and Intel 4th Generation Core
                 processors. Though prior prototypes replaced a GIL with
                 HTM, we tested realistic programs, the Ruby NAS
                 Parallel Benchmarks (NPB), the WEBrick HTTP server, and
                 Ruby on Rails. We devised a new technique to
                 dynamically adjust the transaction lengths on a
                 per-bytecode basis, so that we can optimize the
                 likelihood of transaction aborts against the relative
                 overhead of the instructions to begin and end the
                 transactions. Our results show that HTM achieved 1.9-
                 to 4.4-fold speedups in the NPB programs over the GIL
                 with 12 threads, and 1.6- and 1.2-fold speedups in
                 WEBrick and Ruby on Rails, respectively. The dynamic
                 transaction-length adjustment chose the best
                 transaction lengths for any number of threads and
                 applications with sufficiently long running times.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '14 conference proceedings.",
}

@Article{Peternier:2014:IEU,
  author =       "Achille Peternier and Danilo Ansaloni and Daniele
                 Bonetta and Cesare Pautasso and Walter Binder",
  title =        "Improving execution unit occupancy on {SMT}-based
                 processors through hardware-aware thread scheduling",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "30",
  number =       "??",
  pages =        "229--241",
  month =        jan,
  year =         "2014",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Mon Dec 2 16:57:46 MST 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167739X13001295",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
}

@Article{Petrovic:2014:LHM,
  author =       "Darko Petrovi{\'c} and Thomas Ropars and Andr{\'e}
                 Schiper",
  title =        "Leveraging hardware message passing for efficient
                 thread synchronization",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "8",
  pages =        "143--154",
  month =        aug,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2692916.2555251",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Nov 26 16:26:30 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "As the level of parallelism in manycore processors
                 keeps increasing, providing efficient mechanisms for
                 thread synchronization in concurrent programs is
                 becoming a major concern. On cache-coherent
                 shared-memory processors, synchronization efficiency is
                 ultimately limited by the performance of the underlying
                 cache coherence protocol. This paper studies how
                 hardware support for message passing can improve
                 synchronization performance. Considering the ubiquitous
                 problem of mutual exclusion, we adapt two
                 state-of-the-art solutions used on shared-memory
                 processors, namely the server approach and the
                 combining approach, to leverage the potential of
                 hardware message passing. We propose HybComb, a novel
                 combining algorithm that uses both message passing and
                 shared memory features of emerging hybrid processors.
                 We also introduce MP-Server, a straightforward
                 adaptation of the server approach to hardware message
                 passing. Evaluation on Tilera's TILE-Gx processor shows
                 that MP-Server can execute contended critical sections
                 with unprecedented throughput, as stalls related to
                 cache coherence are removed from the critical path.
                 HybComb can achieve comparable performance, while
                 avoiding the need to dedicate server cores.
                 Consequently, our queue and stack implementations,
                 based on MP-Server and HybComb, largely outperform
                 their most efficient pure-shared-memory counterparts.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '14 conference proceedings.",
}

@Article{Pricopi:2014:TSA,
  author =       "M. Pricopi and T. Mitra",
  title =        "Task Scheduling on Adaptive Multi-Core",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "63",
  number =       "10",
  pages =        "2590--2603",
  month =        oct,
  year =         "2014",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2013.115",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Thu Nov 06 07:29:34 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
  keywords =     "adaptive architectures; adaptive multi-cores; adaptive
                 multicore architectures; core allocation; dynamic
                 heterogeneous multi-core; embedded domain;
                 general-purpose computing; ILP; instruction-level
                 parallelism; malleable and moldable tasks;
                 multi-threading; offline scheduler; on-chip cores;
                 online scheduler; parallel applications; parallel
                 architectures; power constraints; resource allocation;
                 resource allocation problems; Scheduling; scheduling;
                 sequential application; sequential code; sequential
                 fragments; task scheduling; thermal constraints;
                 thread-level parallelism; TLP",
}

@Article{Pusukuri:2014:LCA,
  author =       "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi
                 Narayan Bhuyan",
  title =        "Lock contention aware thread migrations",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "8",
  pages =        "369--370",
  month =        aug,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2692916.2555273",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Nov 26 16:26:30 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "On a cache-coherent multicore multiprocessor system,
                 the performance of a multithreaded application with
                 high lock contention is very sensitive to the
                 distribution of application threads across multiple
                 processors. This is because the distribution of threads
                 impacts the frequency of lock transfers between
                 processors, which in turn impacts the frequency of
                 last-level cache (LLC) misses that lie on the critical
                 path of execution. Inappropriate distribution of
                 threads across processors increases LLC misses in the
                 critical path and significantly degrades performance of
                 multithreaded programs. To alleviate the above problem,
                 this paper overviews a thread migration technique,
                 which migrates threads of a multithreaded program
                 across multicore processors so that threads seeking
                 locks are more likely to find the locks on the same
                 processor.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '14 conference proceedings.",
}

@Article{Qian:2014:PRR,
  author =       "Xuehai Qian and Benjamin Sahelices and Depei Qian",
  title =        "{Pacifier}: record and replay for relaxed-consistency
                 multiprocessors with distributed directory protocol",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "433--444",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665736",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Record and Deterministic Replay (R\&R) of
                 multithreaded programs on relaxed-consistency
                 multiprocessors with distributed directory protocol has
                 been a long-standing open problem. The independently
                 developed RelaxReplay [8] solves the problem by
                 assuming write atomicity. This paper proposes Pacifier,
                 the first R\&R scheme to provide a solution without
                 assuming write atomicity. R\&R for relaxed-consistency
                 multiprocessors needs to detect, record and replay
                 Sequential Consistency Violations (SCV). Pacifier has
                 two key components: (i) Relog, a general memory
                 reordering logging and replay mechanism that can
                 reproduce SCVs in relaxed memory models, and (ii)
                 Granule, an SCV detection scheme in the record phase
                 with good precision, that indicates whether to record
                 with Relog. We show that Pacifier is a sweet spot in
                 the design space with a reasonable trade-off between
                 hardware and log overhead. An evaluation with
                 simulations of 16, 32 and 64 processors with Release
                 Consistency (RC) running SPLASH-2 applications
                  indicates that Pacifier incurs 3.9\% $\sim$ 16\% larger
                  logs. The slowdown of Pacifier during replay is 10.1\%
                  $\sim$ 30.5\% compared to native execution.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Rahman:2014:CCO,
  author =       "Musfiq Rahman and Bruce R. Childers and Sangyeun Cho",
  title =        "{COMeT+}: Continuous Online Memory Testing with
                 Multi-Threading Extension",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "63",
  number =       "7",
  pages =        "1668--1681",
  month =        jul,
  year =         "2014",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2013.65",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Mon Aug 25 08:24:32 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Ribic:2014:EEW,
  author =       "Haris Ribic and Yu David Liu",
  title =        "Energy-efficient work-stealing language runtimes",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "513--528",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541971",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 18 17:12:47 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Work stealing is a promising approach to constructing
                 multithreaded program runtimes of parallel programming
                 languages. This paper presents HERMES, an
                 energy-efficient work-stealing language runtime. The
                 key insight is that threads in a work-stealing
                 environment --- thieves and victims --- have varying
                 impacts on the overall program running time, and a
                 coordination of their execution ``tempo'' can lead to
                 energy efficiency with minimal performance loss. The
                 centerpiece of HERMES is two complementary algorithms
                 to coordinate thread tempo: the workpath-sensitive
                 algorithm determines tempo for each thread based on
                 thief-victim relationships on the execution path,
                 whereas the workload-sensitive algorithm selects
                 appropriate tempo based on the size of work-stealing
                 deques. We construct HERMES on top of Intel Cilk Plus's
                 runtime, and implement tempo adjustment through
                 standard Dynamic Voltage and Frequency Scaling (DVFS).
                 Benchmarks running on HERMES demonstrate an average of
                 11-12\% energy savings with an average of 3-4\%
                 performance loss through meter-based measurements over
                 commercial CPUs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Rogers:2014:LYL,
  author =       "Timothy G. Rogers and Mike O'Connor and Tor M.
                 Aamodt",
  title =        "Learning your limit: managing massively multithreaded
                 caches through scheduling",
  journal =      j-CACM,
  volume =       "57",
  number =       "12",
  pages =        "91--98",
  month =        dec,
  year =         "2014",
  CODEN =        "CACMA2",
  DOI =          "https://doi.org/10.1145/2682583",
  ISSN =         "0001-0782 (print), 1557-7317 (electronic)",
  ISSN-L =       "0001-0782",
  bibdate =      "Thu Jan 22 08:42:40 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/cacm/;
                 https://www.math.utah.edu/pub/tex/bib/cacm2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://cacm.acm.org/magazines/2014/12/180789/fulltext",
  abstract =     "The gap between processor and memory performance has
                 become a focal point for microprocessor research and
                 development over the past three decades. Modern
                 architectures use two orthogonal approaches to help
                 alleviate this issue: (1) Almost every microprocessor
                 includes some form of on-chip storage, usually in the
                 form of caches, to decrease memory latency and make
                 more effective use of limited memory bandwidth. (2)
                 Massively multithreaded architectures, such as graphics
                 processing units (GPUs), attempt to hide the high
                 latency to memory by rapidly switching between many
                 threads directly in hardware. This paper explores the
                 intersection of these two techniques. We study the
                 effect of accelerating highly parallel workloads with
                 significant locality on a massively multithreaded GPU.
                 We observe that the memory access stream seen by
                 on-chip caches is the direct result of decisions made
                 by the hardware thread scheduler. Our work proposes a
                 hardware scheduling technique that reacts to feedback
                 from the memory system to create a more cache-friendly
                 access stream. We evaluate our technique using
                 simulations and show a significant performance
                 improvement over previously proposed scheduling
                 mechanisms. We demonstrate the effectiveness of
                 scheduling as a cache management technique by comparing
                 cache hit rate using our scheduler and an LRU
                 replacement policy against other scheduling techniques
                 using an optimal cache replacement policy.",
  acknowledgement = ack-nhfb,
  fjournal =     "Communications of the ACM",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J79",
}

@Article{Samak:2014:MTS,
  author =       "Malavika Samak and Murali Krishna Ramanathan",
  title =        "Multithreaded test synthesis for deadlock detection",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "10",
  pages =        "473--489",
  month =        oct,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2714064.2660238",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue May 12 17:41:21 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Designing and implementing thread-safe multithreaded
                 libraries can be a daunting task as developers of these
                 libraries need to ensure that their implementations are
                 free from concurrency bugs, including deadlocks. The
                 usual practice involves employing software testing
                 and/or dynamic analysis to detect deadlocks. Their
                 effectiveness is dependent on well-designed
                 multithreaded test cases. Unsurprisingly, developing
                 multithreaded tests is significantly harder than
                 developing sequential tests for obvious reasons. In
                 this paper, we address the problem of automatically
                 synthesizing multithreaded tests that can induce
                 deadlocks. The key insight to our approach is that a
                 subset of the properties observed when a deadlock
                 manifests in a concurrent execution can also be
                 observed in a single threaded execution. We design a
                 novel, automatic, scalable and directed approach that
                 identifies these properties and synthesizes a deadlock
                 revealing multithreaded test. The input to our approach
                 is the library implementation under consideration and
                 the output is a set of deadlock revealing multithreaded
                 tests. We have implemented our approach as part of a
                  tool, named OMEN. OMEN is able to synthesize
                 multithreaded tests on many multithreaded Java
                 libraries. Applying a dynamic deadlock detector on the
                 execution of the synthesized tests results in the
                 detection of a number of deadlocks, including 35 real
                 deadlocks in classes documented as thread-safe.
                 Moreover, our experimental results show that dynamic
                 analysis on multithreaded tests that are either
                 synthesized randomly or developed by third-party
                 programmers are ineffective in detecting the
                 deadlocks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "OOPSLA '14 conference proceedings.",
}

@Book{Schildt:2014:JCR,
  author =       "Herbert Schildt",
  title =        "{Java}: The Complete Reference",
  publisher =    pub-MCGRAW-HILL,
  address =      pub-MCGRAW-HILL:adr,
  edition =      "Ninth",
  pages =        "xxxiv + 1274",
  year =         "2014",
  ISBN =         "0-07-180855-8 (paperback), 0-07-180925-2,
                 0-07-180856-6",
  ISBN-13 =      "978-0-07-180855-2, 978-0-07-180925-2,
                 978-0-07-180856-9",
  LCCN =         "QA76.73.J38 S332 2014eb",
  bibdate =      "Thu Dec 4 13:05:57 MST 2014",
  bibsource =    "fsz3950.oclc.org:210/WorldCat;
                 https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Fully updated for Java SE 8, this edition explains how
                 to develop, compile, debug, and run Java programs. The
                 book covers the entire Java language, including its
                 syntax, keywords, and fundamental programming
                 principles, as well as significant portions of the Java
                 API library. JavaBeans, servlets, applets, and Swing
                 are examined and real-world examples demonstrate Java
                 in action. New Java SE 8 features such as lambda
                 expressions, the stream library, and the default
                 interface method are discussed in detail. This Oracle
                 Press resource also offers a solid introduction to
                 JavaFX. Topics covered include: data types, variables,
                 arrays, and operators; control statements; classes,
                 objects, and methods; method overloading and
                 overriding; inheritance; interfaces and packages;
                 exception handling; multithreaded programming;
                 enumerations, autoboxing, and annotations; I/O classes;
                  generics; lambda expressions; string handling;
                 collections framework; networking; event handling; AWT
                 and Swing; concurrent and stream API; regular
                 expressions; JavaFX; JavaBeans; and applets and
                 servlets.",
  acknowledgement = ack-nhfb,
  shorttableofcontents = "The history and evolution of Java \\
                 An overview of Java \\
                 Data types, variables, and arrays \\
                 Operators \\
                 Control statements \\
                 Introducing classes \\
                 A closer look at methods and classes \\
                 Inheritance \\
                 Packages and interfaces \\
                 Exception handling \\
                 Multithreaded programming \\
                 Enumerations, autoboxing, and annotations (metadata)
                 \\
                 I/O, applets, and other topics \\
                 Generics \\
                 Lambda expressions \\
                 String handling \\
                 Exploring java.lang \\
                 Java.util part 1: the collections framework \\
                 Java.util part 2: more utility classes \\
                 Input/output: exploring java.io \\
                 Exploring NIO \\
                 Networking \\
                 The applet class \\
                 Event handling \\
                 Introducing the AWT: working with windows, graphics,
                 and text \\
                 Using AWT controls, layout managers, and menus \\
                 Images \\
                 The concurrency utilities \\
                 The stream API \\
                 Regular expressions and other packages \\
                 Introducing swing \\
                 Exploring swing \\
                 Introducing swing menus \\
                 Introducing JavaFX GUI programming \\
                 Exploring JavaFX controls \\
                 Introducing JavaFX menus \\
                 Java beans \\
                 Introducing servlets \\
                 Using Java's documentation comments",
  subject =      "Java (Langage de programmation); Programmation
                 Internet; Java (Computer program language); Internet
                 programming; Internet programming.; Java (Computer
                 program language)",
  tableofcontents = "Part I. The Java language \\
                 1. The history and evolution of Java: Java's lineage;
                 The creation of Java; How Java changed the Internet;
                 Java's magic: the bytecode; Servlets: Java on the
                 server side; The Java buzzwords; The evolution of Java;
                 Java SE 8; A culture of innovation \\
                 2. An overview of Java: Object-oriented programming; A
                 first simple program; A second short program; Two
                 control statements; Using blocks of code; Lexical
                 issues; The Java class libraries \\
                 3. Data types, variables, and arrays: Java is a
                 strongly typed language; The primitive types; Integers;
                 Floating-point types; Characters; Booleans; A closer
                 look at literals; Variables; Type conversion and
                 casting; Automatic type promotion in expressions;
                 Arrays; A few words about strings; A note to C/C++
                 programmers about pointers \\
                 4. Operators: Arithmetic operators; The bitwise
                 operators; Relational operators; Boolean logical
                 operators; The assignment operator; The ? operator;
                 Operator precedence; Using parentheses \\
                 5. Control statements: Java's selection statements;
                 Iteration statements; Jump statements \\
                 6. Introducing classes: Class fundamentals; Declaring
                 objects; Assigning object reference variables;
                 Introducing methods; Constructors; The this keyword;
                 Garbage collection; The finalize() method; A stack
                 class \\
                 7. A closer look at methods and classes: Overloading
                 methods; Using objects as parameters; A closer look at
                 argument passing; Returning objects; Recursion;
                 Introducing access control; Understanding static;
                 Introducing final; Arrays revisited; Introducing nested
                 and inner classes; Exploring the string class; Using
                 command-line arguments; Varargs: variable-length
                 arguments \\
                 8. Inheritance: Inheritance basics; Using super;
                 Creating a multilevel hierarchy; When constructors are
                 executed; Method overriding; Dynamic method dispatch;
                 Using abstract classes; Using final with inheritance;
                 The object class \\
                 9. Packages and interfaces: Packages; Access
                 protection; Importing packages; Interfaces; Default
                 interface methods; Use static methods in an interface;
                 Final thoughts on packages and interfaces \\
                 10. Exception handling: Exception-handling
                 fundamentals; Exception types; Uncaught exceptions;
                 Using try and catch; Multiple catch clauses; Nested try
                  statements; Throw; Throws; Finally; Java's built-in
                 exceptions; Creating your own exception subclasses;
                 Chained exceptions; Three recently added exception
                 features; Using exceptions \\
                 11. Multithreaded programming: The Java thread model;
                 The main thread; Creating a thread; Creating multiple
                 threads; Using isAlive() and join(); Thread priorities;
                 Synchronization; Interthread communication; Suspending,
                 resuming, and stopping threads; Obtaining a thread's
                 state; Using multithreading \\
                 12. Enumerations, autoboxing, and annotations
                 (metadata): Enumerations; Type wrappers; Autoboxing;
                 Annotations (metadata); Type annotations; Repeating
                 annotations \\
                 13. I/O, applets, and other topics: I/O basics; Reading
                 console input; Writing console output; The PrintWriter
                 class; Reading and writing files; Automatically closing
                 a file; Applet fundamentals; The transient and volatile
                 modifiers; Using instanceof; Strictfp; Native methods;
                 Problems with native methods; Using assert; Static
                 import; Invoking overloaded constructors through
                 this(); Compact API profiles \\
                 14. Generics: What are generics?; A simple generics
                 example; A generic class with two type parameters; The
                 general form of a generic class; Bounded types; Using
                 wildcard arguments; Creating a generic method; Generic
                 interfaces; Raw types and legacy code; Generic class
                 hierarchies; Type inference with generics; Erasure;
                 Ambiguity errors; Some generic restrictions \\
                 15. Lambda expressions: Introducing lambda expressions;
                 Block lambda expressions; Generic functional
                 interfaces; Passing lambda expressions as arguments;
                 Lambda expressions and exceptions; Lambda expressions
                 and variable capture; Method references; Constructor
                 references; Predefined functional interfaces \\
                 Part II. The Java library. \\
                 16. String handling: The string constructors; String
                 length; Special string operations; Character
                 extraction; String comparison; Searching strings;
                 Modifying a string; Data conversion using valueOf();
                 Changing the case of characters within a string;
                 Joining strings; Additional string methods;
                 StringBuffer; StringBuilder \\
                 17. Exploring java.lang: Primitive type wrappers; Void;
                 Process; Runtime; ProcessBuilder; System; Object; Using
                 clone() and the cloneable interface; Class;
                 ClassLoader; Math; StrictMath; Compiler; Thread,
                 ThreadGroup and runnable; ThreadLocal and
                 InheritableThreadLocal; Package; RuntimePermission;
                 Throwable; SecurityManager; StackTraceElement; Enum;
                 ClassValue; The CharSequence interface; The comparable
                 interface; The appendable interface; The iterable
                 interface; The readable interface; The AutoCloseable
                 interface; The Thread.UncaughtExceptionHandler
                 interface; The java.lang subpackages \\
                 18. java.util Part 1: The collections framework:
                 Collections overview; JDK 5 changed the collections
                 framework; The collection interfaces; The collection
                 classes; Accessing a collection via an iterator;
                 Spliterators; Storing user-defined classes in
                 collections; The RandomAccess interface; Working with
                 maps; Comparators; The collection algorithms; Arrays;
                 The legacy classes and interfaces; Parting thoughts on
                 collections \\
                 19. java.util Part 2: More utility classes:
                 StringTokenizer; BitSet; Optional, OptionalDouble,
                 OptionalInt, and OptionalLong; Date; Calendar;
                 GregorianCalendar; TimeZone; SimpleTimeZone; Locale;
                 Random; Observable; Timer and TimerTask; Currency;
                 Formatter; Scanner; The ResourceBundle,
                 ListResourceBundle, and PropertyResourceBundle classes;
                 Miscellaneous utility classes and interfaces; The
                 java.util subpackages \\
                 20. Input/output: exploring java.io: The I/O classes
                 and interfaces; File; The AutoCloseable, Closeable, and
                 flushable interfaces; I/O exceptions; Two ways to close
                 a stream; The stream classes; The byte streams; The
                 character streams; The console class; Serialization;
                 Stream benefits \\
                 21. Exploring NIO: The NIO classes; NIO fundamentals;
                 Enhancements added to NIO by JDK 7; Using the NIO
                 system; Pre-JDK 7 channel-based examples \\
                 22. Networking: Networking basics; The networking
                 classes and interfaces; Inet/Address; Inet4Address and
                 Inet6Address; TCP/IP client sockets; URL;
                 URLConnection; HttpURLConnection; The URI class;
                 Cookies; TCP/IP server sockets; Datagrams \\
                 23. The applet class: Two types of applets; Applet
                 basics; Applet architecture; An applet skeleton; Simple
                 applet display methods; Requesting repainting; Using
                 the status window; The HTML APPLET tag; Passing
                 parameters to applets; getDocumentBase() and
                 getCodeBase(); AppletContext and showDocument(); The
                 AudioClip interface; The AppletStub interface;
                 Outputting to the console \\
                 24. Event handling: Two event handling mechanisms; The
                 delegation event model; Event classes; The KeyEvent
                 class; Sources of events; Event listener interfaces;
                 Using the delegation event model; Adapter classes;
                 Inner classes \\
                 25. Introducing the AWT: working with windows,
                 graphics, and text: AWT classes; Window fundamentals;
                 Working with frame windows; Creating a frame window in
                 an AWT-based applet; Creating a windowed program;
                 Displaying information within a window; Introducing
                 graphics; Working with color; Setting the paint mode;
                 Working with fonts; Managing text output using
                 FontMetrics \\
                 26. Using AWT controls, layout managers, and menus: AWT
                 control fundamentals; Labels; Using buttons; Applying
                 check boxes; CheckboxGroup; Choice controls; Using
                 lists; Managing scroll bars; Using a TextField; Using a
                 TextArea; Understanding layout managers; Menu bars and
                 menus; Dialog boxes; FileDialog; A word about
                 overriding paint() \\
                 27. Images: File formats; Image fundamentals: creating,
                 loading, and displaying; ImageObserver; Double
                 buffering; MediaTracker; ImageProducer; ImageConsumer;
                 ImageFilter; Additional imaging classes \\
                 28. The concurrency utilities: The concurrent API
                 packages; Using synchronization objects; Phaser; Using
                 an executor; The TimeUnit enumeration; the concurrent
                 collections; Locks; Atomic operations; Parallel
                 programming via the fork/join framework; The
                 concurrency utilities versus Java's traditional
                 approach \\
                 29. The stream API: Stream basics; Reduction
                 operations; Using parallel streams; Mapping;
                 Collecting; Iterators and streams; More to explore in
                 the stream API \\
                 30. Regular expressions and other packages: The core
                 Java API packages; Regular expression processing;
                 Reflection; Remote method invocation (RMI); Formatting
                 date and time with java.text; The time and date API
                 added by JDK 8 \\
                 Part III. Introducing GUI programming with swing \\
                 31. Introducing swing: The origins of swing; Swing is
                 built on the AWT; Two key swing features; The MVC
                 connection; Components and containers; The swing
                 packages; A simple swing application; Event handling;
                 Create a swing applet; Painting in swing \\
                 32. Exploring swing: JLabel and ImageIcon; JTextField;
                 The swing buttons; JTabbedPane; JScrollPane; JList;
                 JComboBox; Trees; JTable \\
                 33. Introducing swing menus: Menu basics; An overview
                 of JMenuBar, JMenu, and JMenuItem; Create a main menu;
                 Add Mnemonics and accelerators to menu items; Add
                 images and tooltips to menu items; Use
                 JRadioButtonMenuItem and JCheckBoxMenuItem; Create a
                 popup menu; Create a toolbar; Use actions; Put the
                 entire MenuDemo program together; Continuing your
                 exploration of swing \\
                 Part IV. Introducing GUI programming with JavaFX \\
                 34. Introducing JavaFX GUI programming: JavaFX basic
                 concepts; A JavaFX application skeleton; Compiling and
                 running a JavaFX program; The application thread; A
                 simple JavaFX control: label; Using buttons and events;
                 Drawing directly on a canvas \\
                 35. Exploring JavaFX controls: Using image and
                 ImageView; ToggleButton; RadioButton; CheckBox;
                 ListView; ComboBox; TextField; ScrollPane; TreeView;
                 Introducing effects and transforms; Adding tooltips;
                 Disabling a control \\
                 36. Introducing JavaFX menus: Menu basics; An overview
                 of MenuBar, Menu, and MenuItem; Create a main menu; Add
                 mnemonics and accelerators to menu items; Add images to
                 menu items; Use RadioMenuItem and CheckMenuItem; Create
                 a context menu; Create a toolbar; Put the entire
                 MenuDemo program together; Continuing your exploration
                 of JavaFX \\
                 Part V. Applying Java \\
                 37. Java beans: What is a Java bean?; Advantages of
                 Java beans; Introspection; Bound and constrained
                 properties; Persistence; Customizers; The Java beans
                 API; A bean example \\
                 38. Introducing servlets: Background; The life cycle of
                 a servlet; Servlet development options; Using Tomcat; A
                 simple servlet; The servlet API; The javax.servlet
                 package; Reading servlet parameters; The
                 javax.servlet.http package; Handling HTTP requests and
                 responses; Using cookies; Session tracking \\
                 Appendix. Using Java's documentation comments: The
                 javadoc tags; The general form of a documentation
                 comment; What javadoc outputs; An example that uses
                 documentation comments",
}

@Article{Shih:2014:COR,
  author =       "Wen-Li Shih and Yi-Ping You and Chung-Wen Huang and
                 Jenq Kuen Lee",
  title =        "Compiler Optimization for Reducing Leakage Power in
                 Multithread {BSP} Programs",
  journal =      j-TODAES,
  volume =       "20",
  number =       "1",
  pages =        "9:1--9:??",
  month =        nov,
  year =         "2014",
  CODEN =        "ATASFO",
  DOI =          "https://doi.org/10.1145/2668119",
  ISSN =         "1084-4309 (print), 1557-7309 (electronic)",
  ISSN-L =       "1084-4309",
  bibdate =      "Wed Nov 19 11:18:40 MST 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/todaes/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/todaes.bib",
  abstract =     "Multithread programming is widely adopted in novel
                 embedded system applications due to its high
                 performance and flexibility. This article addresses
                 compiler optimization for reducing the power
                 consumption of multithread programs. A traditional
                 compiler employs energy management techniques that
                 analyze component usage in control-flow graphs with a
                 focus on single-thread programs. In this environment
                 the leakage power can be controlled by inserting on and
                 off instructions based on component usage information
                 generated by flow equations. However, these methods
                 cannot be directly extended to a multithread
                 environment due to concurrent execution issues. This
                 article presents a multithread power-gating framework
                 composed of multithread power-gating analysis (MTPGA)
                 and predicated power-gating (PPG) energy management
                 mechanisms for reducing the leakage power when
                 executing multithread programs on simultaneous
                 multithreading (SMT) machines. Our multithread
                 programming model is based on hierarchical
                 bulk-synchronous parallel (BSP) models. Based on a
                 multithread component analysis with dataflow equations,
                 our MTPGA framework estimates the energy usage of
                 multithread programs and inserts PPG operations as
                 power controls for energy management. We performed
                 experiments by incorporating our power optimization
                 framework into SUIF compiler tools and by simulating
                 the energy consumption with a post-estimated SMT
                 simulator based on Wattch toolkits. The experimental
                 results show that the total energy consumption of a
                 system with PPG support and our power optimization
                 method is reduced by an average of 10.09\% for BSP
                 programs relative to a system without a power-gating
                 mechanism on leakage contribution set to 30\%; and the
                 total energy consumption is reduced by an average of
                 4.27\% on leakage contribution set to 10\%. The results
                 demonstrate our mechanisms are effective in reducing
                 the leakage energy of BSP multithread programs.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Design Automation of Electronic
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J776",
}

@Article{Sridharan:2014:AEP,
  author =       "Srinath Sridharan and Gagan Gupta and Gurindar S.
                 Sohi",
  title =        "Adaptive, efficient, parallel execution of parallel
                 programs",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "6",
  pages =        "169--180",
  month =        jun,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2666356.2594292",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Sep 26 07:38:28 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Future multicore processors will be heterogeneous, be
                 increasingly less reliable, and operate in dynamically
                 changing operating conditions. Such environments will
                 result in a constantly varying pool of hardware
                 resources which can greatly complicate the task of
                 efficiently exposing a program's parallelism onto these
                 resources. Coupled with this uncertainty is the diverse
                 set of efficiency metrics that users may desire. This
                 paper proposes Varuna, a system that dynamically,
                 continuously, rapidly and transparently adapts a
                 program's parallelism to best match the instantaneous
                 capabilities of the hardware resources while satisfying
                 different efficiency metrics. Varuna is applicable to
                 both multithreaded and task-based programs and can be
                 seamlessly inserted between the program and the
                 operating system without needing to change the source
                 code of either. We demonstrate Varuna's effectiveness
                 in diverse execution environments using unaltered C/C++
                 parallel programs from various benchmark suites.
                 Regardless of the execution environment, Varuna always
                 outperformed the state-of-the-art approaches for the
                 efficiency metrics considered.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '14 conference proceedings.",
}

@Article{Steele:2014:FSP,
  author =       "Guy L. {Steele, Jr.} and Doug Lea and Christine H.
                 Flood",
  title =        "Fast splittable pseudorandom number generators",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "10",
  pages =        "453--472",
  month =        oct,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2714064.2660195",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue May 12 17:41:21 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/bibnet/authors/m/marsaglia-george.bib;
                 https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/jstatsoft.bib;
                 https://www.math.utah.edu/pub/tex/bib/mathcw.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/prng.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomacs.bib",
  abstract =     "We describe a new algorithm SplitMix for an
                 object-oriented and splittable pseudorandom number
                 generator (PRNG) that is quite fast: 9 64-bit
                 arithmetic/logical operations per 64 bits generated. A
                 conventional linear PRNG object provides a generate
                 method that returns one pseudorandom value and updates
                 the state of the PRNG, but a splittable PRNG object
                 also has a second operation, split, that replaces the
                 original PRNG object with two (seemingly) independent
                 PRNG objects, by creating and returning a new such
                 object and updating the state of the original object.
                 Splittable PRNG objects make it easy to organize the
                 use of pseudorandom numbers in multithreaded programs
                 structured using fork-join parallelism. No locking or
                 synchronization is required (other than the usual
                 memory fence immediately after object creation).
                 Because the generate method has no loops or
                 conditionals, it is suitable for SIMD or GPU
                 implementation. We derive SplitMix from the DotMix
                 algorithm of Leiserson, Schardl, and Sukha by making a
                 series of program transformations and engineering
                 improvements. The end result is an object-oriented
                 version of the purely functional API used in the
                 Haskell library for over a decade, but SplitMix is
                 faster and produces pseudorandom sequences of higher
                 quality; it is also far superior in quality and speed
                 to java.util.Random, and has been included in Java JDK8
                 as the class java.util.SplittableRandom. We have tested
                 the pseudorandom sequences produced by SplitMix using
                 two standard statistical test suites (DieHarder and
                 TestU01) and they appear to be adequate for
                 ``everyday'' use, such as in Monte Carlo algorithms and
                 randomized data structures where speed is important.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark-1 =     "OOPSLA '14 conference proceedings.",
  remark-2 =     "On page 466, the authors describe an interesting
                 technique for improving a user-supplied seed that might
                 produce insufficient randomness in the next several
                 members of the random-number sequence: ``Long runs of
                 0-bits or of 1-bits in the $\gamma$ [candidate seed]
                 value do not cause bits of the seed to flip; an
                 approximate proxy for how many bits of the seed will
                 flip might be the number of bit pairs of the form 01 or
                 10 in the candidate $\gamma$ value {\tt z}. Therefore
                 we require that the number of such pairs, as computed
                  by {\tt Long.bitCount(z \^{} (z >>> 1))}, exceed 24; if it
                 does not, then the candidate z is replaced by the XOR
                 of {\tt z} and {\tt 0xaaaaaaaaaaaaaaaaL}, a constant
                 chosen so that (a) the low bit of {\tt z} remains 1,
                 and (b) every bit pair of the form 00 or 11 becomes
                 either 01 or 10, and likewise every bit pair of the
                 form 01 or 10 becomes either 00 or 11, so the new value
                 necessarily has more than 24 bit pairs whose bits
                 differ. Testing shows that this trick appears to be
                 effective.''",
  remark-3 =     "From page 468: ``we did three runs of TestU01 BigCrush
                 on {\tt java.util.Random}; 19 tests produced clear
                 failure on all three runs. These included 9 Birthday
                 Spacings tests, 8 ClosePairs tests, a WeightDistrib
                 test, and a CouponCollector test. This confirms
                 L'Ecuyer's observation that {\tt java.util.Random}
                 tends to fail Birthday Spacings tests [17].'' The
                 reference is to \cite{LEcuyer:2001:SUR}.",
  remark-4 =     "From page 470: ``[L'Ecuyer] comments, `In the Java
                 class {\tt java.util.Random}, RNG streams can be
                 declared and constructed dynamically, without limit on
                 their number. However, no precaution seems to have been
                 taken regarding the independence of these streams.'''",
  remark-5 =     "From page 471: ``They [the generators in this paper]
                 should not be used for cryptographic or security
                 applications, because they are too predictable (the
                 mixing functions are easily inverted, and two
                 successive outputs suffice to reconstruct the internal
                 state), \ldots{} One version seems especially suitable
                 for use as a replacement for {\tt java.util.Random},
                 because it produces sequences of higher quality, is
                 faster in sequential use, is easily parallelized for
                 use in JDK8 stream expressions, and is amenable to
                 efficient implementation on SIMD and GPU
                 architectures.''",
}

@Article{Sung:2014:PTR,
  author =       "I-Jui Sung and Juan G{\'o}mez-Luna and Jos{\'e}
                 Mar{\'\i}a Gonz{\'a}lez-Linares and Nicol{\'a}s Guil
                 and Wen-Mei W. Hwu",
  title =        "In-place transposition of rectangular matrices on
                 accelerators",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "8",
  pages =        "207--218",
  month =        aug,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2692916.2555266",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Nov 26 16:26:30 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Matrix transposition is an important algorithmic
                 building block for many numeric algorithms such as FFT.
                 It has also been used to convert the storage layout of
                 arrays. With more and more algebra libraries offloaded
                 to GPUs, a high performance in-place transposition
                 becomes necessary. Intuitively, in-place transposition
                 should be a good fit for GPU architectures due to
                 limited available on-board memory capacity and high
                 throughput. However, direct application of CPU in-place
                 transposition algorithms lacks the amount of
                 parallelism and locality required by GPUs to achieve
                 good performance. In this paper we present the first
                 known in-place matrix transposition approach for the
                 GPUs. Our implementation is based on a novel 3-stage
                 transposition algorithm where each stage is performed
                 using an elementary tiled-wise transposition.
                 Additionally, when transposition is done as part of the
                 memory transfer between GPU and host, our staged
                 approach allows hiding transposition overhead by
                 overlap with PCIe transfer. We show that the 3-stage
                 algorithm allows larger tiles and achieves 3X speedup
                 over a traditional 4-stage algorithm, with both
                 algorithms based on our high-performance elementary
                 transpositions on the GPU. We also show our proposed
                 low-level optimizations improve the sustained
                 throughput to more than 20 GB/s. Finally, we propose an
                 asynchronous execution scheme that allows CPU threads
                 to delegate in-place matrix transposition to GPU,
                 achieving a throughput of more than 3.4 GB/s (including
                 data transfers costs), and improving current
                 multithreaded implementations of in-place transposition
                 on CPU.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '14 conference proceedings.",
}

@Article{Tarvo:2014:AAM,
  author =       "Alexander Tarvo and Steven P. Reiss",
  title =        "Automated analysis of multithreaded programs for
                 performance modeling",
  journal =      j-SIGMETRICS,
  volume =       "42",
  number =       "1",
  pages =        "557--558",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2637364.2592016",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Fri Jun 27 06:38:48 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigmetrics.bib",
  abstract =     "We present an approach for building performance models
                 of multithreaded programs automatically. We use a
                  combination of static and dynamic analyses of a
                 single representative run of the program to build its
                 model. The model can predict performance of the program
                 under a variety of configurations. This paper outlines
                 how we construct the model and demonstrates how the
                 resultant models accurately predict the performance and
                 resource utilization of complex multithreaded
                 programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
}

@Article{Turon:2014:GNW,
  author =       "Aaron Turon and Viktor Vafeiadis and Derek Dreyer",
  title =        "{GPS}: navigating weak memory with ghosts, protocols,
                 and separation",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "10",
  pages =        "691--707",
  month =        oct,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2714064.2660243",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue May 12 17:41:21 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Weak memory models formalize the inconsistent
                 behaviors that one can expect to observe in
                 multithreaded programs running on modern hardware. In
                 so doing, however, they complicate the
                 already-difficult task of reasoning about correctness
                 of concurrent code. Worse, they render impotent the
                 sophisticated formal methods that have been developed
                 to tame concurrency, which almost universally assume a
                  strong (i.e., sequentially consistent) memory model.
                 This paper introduces GPS, the first program logic to
                 provide a full-fledged suite of modern verification
                 techniques --- including ghost state, protocols, and
                 separation logic --- for high-level, structured
                 reasoning about weak memory. We demonstrate the
                 effectiveness of GPS by applying it to challenging
                 examples drawn from the Linux kernel as well as
                 lock-free data structures. We also define the semantics
                 of GPS and prove in Coq that it is sound with respect
                 to the axiomatic C11 weak memory model.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "OOPSLA '14 conference proceedings.",
}

@Article{Wadden:2014:RWD,
  author =       "Jack Wadden and Alexander Lyashevsky and Sudhanva
                 Gurumurthi and Vilas Sridharan and Kevin Skadron",
  title =        "Real-world design and evaluation of compiler-managed
                 {GPU} redundant multithreading",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "73--84",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665686",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Reliability for general purpose processing on the GPU
                 (GPGPU) is becoming a weak link in the construction of
                 reliable supercomputer systems. Because hardware
                 protection is expensive to develop, requires dedicated
                 on-chip resources, and is not portable across different
                 architectures, the efficiency of software solutions
                 such as redundant multithreading (RMT) must be
                 explored. This paper presents a real-world design and
                 evaluation of automatic software RMT on GPU hardware.
                 We first describe a compiler pass that automatically
                 converts GPGPU kernels into redundantly threaded
                 versions. We then perform detailed power and
                 performance evaluations of three RMT algorithms, each
                 of which provides fault coverage to a set of structures
                 in the GPU. Using real hardware, we show that
                 compiler-managed software RMT has highly variable
                 costs. We further analyze the individual costs of
                 redundant work scheduling, redundant computation, and
                 inter-thread communication, showing that no single
                 component in general is responsible for high overheads
                 across all applications; instead, certain workload
                 properties tend to cause RMT to perform well or poorly.
                 Finally, we demonstrate the benefit of architectural
                 support for RMT with a specific example of fast,
                  register-level thread communication.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Xu:2014:STM,
  author =       "Yunlong Xu and Rui Wang and Nilanjan Goswami and Tao
                 Li and Depei Qian",
  title =        "Software Transactional Memory for {GPU}
                 Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "49--52",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "To make applications with dynamic data sharing among
                 threads benefit from GPU acceleration, we propose a
                 novel software transactional memory system for GPU
                 architectures (GPU-STM). The major challenges include
                 ensuring good scalability with respect to the massively
                 multithreading of GPUs, and preventing livelocks caused
                 by the SIMT execution paradigm of GPUs. To this end, we
                 propose (1) a hierarchical validation technique and (2)
                 an encounter-time lock-sorting mechanism to deal with
                 the two challenges, respectively. Evaluation shows that
                 GPU-STM outperforms coarse-grain locks on GPUs by up to
                 20x.",
  acknowledgement = ack-nhfb,
  affiliation =  "Xu, YL (Reprint Author), Xi An Jiao Tong Univ, Sch
                 Elect \& Informat Engn, Xian 710049, Peoples R China.
                 Xu, Yunlong; Qian, Depei, Xi An Jiao Tong Univ, Sch
                 Elect \& Informat Engn, Xian 710049, Peoples R China.
                 Wang, Rui; Qian, Depei, Beihang Univ, Sch Engn \& Comp
                 Sci, Beijing, Peoples R China. Goswami, Nilanjan; Li,
                 Tao, Univ Florida, ECE Dept, Gainesville, FL USA.",
  author-email = "xjtu.ylxu@stu.xjtu.edu.cn rui.wang@jsi.buaa.edu.cn
                 nil@ufl.edu taoli@ece.ufl.edu depeiq@xjtu.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF of China [61133004, 61128004,
                 61073011]; 863 Program of China [2012AA010902]",
  funding-text = "This work is supported by NSF of China under grant
                 61133004, 61128004 and 61073011, and 863 Program of
                 China under grant 2012AA010902.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Multicore Processors; Parallel Programming; Run-time
                 Environments; SIMD Processors",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Xu:2014:STM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Yang:2014:CNR,
  author =       "Yi Yang and Huiyang Zhou",
  title =        "{CUDA-NP}: realizing nested thread-level parallelism
                 in {GPGPU} applications",
  journal =      j-SIGPLAN,
  volume =       "49",
  number =       "8",
  pages =        "93--106",
  month =        aug,
  year =         "2014",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2692916.2555254",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Nov 26 16:26:30 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Parallel programs consist of series of code sections
                 with different thread-level parallelism (TLP). As a
                 result, it is rather common that a thread in a parallel
                 program, such as a GPU kernel in CUDA programs, still
                 contains both sequential code and parallel loops. In
                 order to leverage such parallel loops, the latest
                 Nvidia Kepler architecture introduces dynamic
                 parallelism, which allows a GPU thread to start another
                 GPU kernel, thereby reducing the overhead of launching
                 kernels from a CPU. However, with dynamic parallelism,
                 a parent thread can only communicate with its child
                 threads through global memory and the overhead of
                 launching GPU kernels is non-trivial even within GPUs.
                 In this paper, we first study a set of GPGPU benchmarks
                 that contain parallel loops, and highlight that these
                 benchmarks do not have a very high loop count or high
                 degrees of TLP. Consequently, the benefits of
                 leveraging such parallel loops using dynamic
                 parallelism are too limited to offset its overhead. We
                 then present our proposed solution to exploit nested
                 parallelism in CUDA, referred to as CUDA-NP. With
                 CUDA-NP, we initially enable a high number of threads
                 when a GPU program starts, and use control flow to
                 activate different numbers of threads for different
                 code sections. We implemented our proposed CUDA-NP
                 framework using a directive-based compiler approach.
                 For a GPU kernel, an application developer only needs
                 to add OpenMP-like pragmas for parallelizable code
                 sections. Then, our CUDA-NP compiler automatically
                 generates the optimized GPU kernels. It supports both
                 the reduction and the scan primitives, explores
                 different ways to distribute parallel loop iterations
                 into threads, and efficiently manages on-chip resource.
                 Our experiments show that for a set of GPGPU
                 benchmarks, which have already been optimized and
                 contain nested parallelism, our proposed CUDA-NP
                 framework further improves the performance by up to
                 6.69 times and 2.18 times on average.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '14 conference proceedings.",
}

@article{Yang:2014:MPP,
  author =       {Junfeng Yang and Heming Cui and Jingyue Wu and Yang
                 Tang and Gang Hu},
  title =        {Making parallel programs reliable with stable
                 multithreading},
  journal =      j-CACM,
  volume =       {57},
  number =       {3},
  pages =        {58--69},
  month =        mar,
  year =         {2014},
  CODEN =        {CACMA2},
  DOI =          {https://doi.org/10.1145/2500875},
  ISSN =         {0001-0782 (print), 1557-7317 (electronic)},
  ISSN-L =       {0001-0782},
  bibdate =      {Thu Feb 27 17:17:45 MST 2014},
  bibsource =    {http://www.acm.org/pubs/contents/journals/cacm/;
                 https://www.math.utah.edu/pub/tex/bib/cacm2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib},
  abstract =     {Stable multithreading dramatically simplifies the
                 interleaving behaviors of parallel programs, offering
                 new hope for making parallel programming easier.},
  acknowledgement = ack-nhfb,
  fjournal =     {Communications of the ACM},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J79},
}

@article{Aliaga:2015:CMS,
  author =       {Jos{\'e} I. Aliaga and Jos{\'e} M. Bad{\'\i}a and
                 Maribel Castillo and Davor Davidovi{\'c} and Rafael
                 Mayo and Enrique S. Quintana-Ort{\'\i}},
  title =        {Out-of-core macromolecular simulations on
                 multithreaded architectures},
  journal =      j-CCPE,
  volume =       {27},
  number =       {6},
  pages =        {1540--1550},
  day =          {25},
  month =        apr,
  year =         {2015},
  CODEN =        {CCPEBO},
  DOI =          {https://doi.org/10.1002/cpe.3357},
  ISSN =         {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L =       {1532-0626},
  bibdate =      {Sat Jul 25 19:54:07 MDT 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Concurrency and Computation: Practice and Experience},
  journal-URL =  {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate =   {31 Aug 2014},
}

@article{Aliaga:2015:UPE,
  author =       {Jos{\'e} I. Aliaga and Hartwig Anzt and Maribel
                 Castillo and Juan C. Fern{\'a}ndez and Germ{\'a}n
                 Le{\'o}n and Joaqu{\'\i}n P{\'e}rez and Enrique S.
                 Quintana-Ort{\'\i}},
  title =        {Unveiling the performance-energy trade-off in
                 iterative linear system solvers for multithreaded
                 processors},
  journal =      j-CCPE,
  volume =       {27},
  number =       {4},
  pages =        {885--904},
  day =          {25},
  month =        mar,
  year =         {2015},
  CODEN =        {CCPEBO},
  DOI =          {https://doi.org/10.1002/cpe.3341},
  ISSN =         {1532-0626 (print), 1532-0634 (electronic)},
  ISSN-L =       {1532-0626},
  bibdate =      {Sat Jul 25 19:54:06 MDT 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {Concurrency and Computation: Practice and Experience},
  journal-URL =  {http://www.interscience.wiley.com/jpages/1532-0626},
  onlinedate =   {9 Sep 2014},
}

@article{Amer:2015:MRC,
  author =       {Abdelhalim Amer and Huiwei Lu and Yanjie Wei and Pavan
                 Balaji and Satoshi Matsuoka},
  title =        {{MPI+Threads}: runtime contention and remedies},
  journal =      j-SIGPLAN,
  volume =       {50},
  number =       {8},
  pages =        {239--248},
  month =        aug,
  year =         {2015},
  CODEN =        {SINODQ},
  DOI =          {https://doi.org/10.1145/2858788.2688522},
  ISSN =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  ISSN-L =       {0362-1340},
  bibdate =      {Tue Feb 16 12:01:42 MST 2016},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract =     {Hybrid MPI+Threads programming has emerged as an
                 alternative model to the ``MPI everywhere'' model to
                 better handle the increasing core density in cluster
                 nodes. While the MPI standard allows multithreaded
                 concurrent communication, such flexibility comes with
                 the cost of maintaining thread safety within the MPI
                 implementation, typically implemented using critical
                 sections. In contrast to previous works that studied
                 the importance of critical-section granularity in MPI
                 implementations, in this paper we investigate the
                 implication of critical-section arbitration on
                 communication performance. We first analyze the MPI
                 runtime when multithreaded concurrent communication
                 takes place on hierarchical memory systems. Our results
                 indicate that the mutex-based approach that most MPI
                 implementations use today can incur performance
                 penalties due to unfair arbitration. We then present
                 methods to mitigate these penalties with a first-come,
                 first-served arbitration and a priority locking scheme
                 that favors threads doing useful work. Through
                 evaluations using several benchmarks and applications,
                 we demonstrate up to 5-fold improvement in
                 performance.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGPLAN Notices},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark =       {PPoPP '15 conference proceedings.},
}

@Article{Axnix:2015:IZF,
  author =       "C. Axnix and G. Bayer and H. Bohm and J. von Buttlar
                 and M. S. Farrell and L. C. Heller and J. P. Kubala and
                 S. E. Lederer and R. Mansell and A. Nunez Mencias and
                 S. Usenbinz",
  title =        "{IBM z13} firmware innovations for simultaneous
                 multithreading and {I/O} virtualization",
  journal =      j-IBM-JRD,
  volume =       "59",
  number =       "4--5",
  pages =        "11:1--11:11",
  month =        jul # "\slash " # sep,
  year =         "2015",
  CODEN =        "IBMJAE",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Wed Oct 21 11:38:12 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ibmjrd.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Journal of Research and Development",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
}

@article{Bai:2015:SPA,
  author =       {Xiuxiu Bai and Endong Wang and Xiaoshe Dong and
                 Xingjun Zhang},
  title =        {A scalability prediction approach for multi-threaded
                 applications on manycore processors},
  journal =      j-J-SUPERCOMPUTING,
  volume =       {71},
  number =       {11},
  pages =        {4072--4094},
  month =        nov,
  year =         {2015},
  CODEN =        {JOSUED},
  DOI =          {https://doi.org/10.1007/s11227-015-1505-x},
  ISSN =         {0920-8542 (print), 1573-0484 (electronic)},
  ISSN-L =       {0920-8542},
  bibdate =      {Mon Jan 25 08:18:10 MST 2016},
  bibsource =    {http://link.springer.com/journal/11227/71/11;
                 https://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib},
  URL =          {http://link.springer.com/article/10.1007/s11227-015-1505-x},
  acknowledgement = ack-nhfb,
  fjournal =     {The Journal of Supercomputing},
  journal-URL =  {http://link.springer.com/journal/11227},
}

@article{Bhatotia:2015:ITL,
  author =       {Pramod Bhatotia and Pedro Fonseca and Umut A. Acar and
                 Bj{\"o}rn B. Brandenburg and Rodrigo Rodrigues},
  title =        {{iThreads}: a Threading Library for Parallel
                 Incremental Computation},
  journal =      j-SIGPLAN,
  volume =       {50},
  number =       {4},
  pages =        {645--659},
  month =        apr,
  year =         {2015},
  CODEN =        {SINODQ},
  DOI =          {https://doi.org/10.1145/2775054.2694371},
  ISSN =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  ISSN-L =       {0362-1340},
  bibdate =      {Tue May 12 17:41:19 MDT 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract =     {Incremental computation strives for efficient
                 successive runs of applications by re-executing only
                 those parts of the computation that are affected by a
                 given input change instead of recomputing everything
                 from scratch. To realize these benefits automatically,
                 we describe iThreads, a threading library for parallel
                 incremental computation. iThreads supports unmodified
                 shared-memory multithreaded programs: it can be used as
                 a replacement for pthreads by a simple exchange of
                 dynamically linked libraries, without even recompiling
                 the application code. To enable such an interface, we
                 designed algorithms and an implementation to operate at
                 the compiled binary code level by leveraging
                 MMU-assisted memory access tracking and process-based
                 thread isolation. Our evaluation on a multicore
                 platform using applications from the PARSEC and Phoenix
                 benchmarks and two case-studies shows significant
                 performance gains.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGPLAN Notices},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark =       {ASPLOS '15 conference proceedings.},
}

@Article{Bogdanas:2015:KJC,
  author =       "Denis Bogdanas and Grigore Ro{\c{s}}u",
  title =        "{K-Java}: a Complete Semantics of {Java}",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "1",
  pages =        "445--456",
  month =        jan,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2775051.2676982",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue May 12 17:41:19 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "This paper presents K-Java, a complete executable
                 formal semantics of Java 1.4. K-Java was extensively
                 tested with a test suite developed alongside the
                 project, following the Test Driven Development
                 methodology. In order to maintain clarity while
                 handling the great size of Java, the semantics was
                 split into two separate definitions --- a static
                 semantics and a dynamic semantics. The output of the
                 static semantics is a preprocessed Java program, which
                 is passed as input to the dynamic semantics for
                 execution. The preprocessed program is a valid Java
                 program, which uses a subset of the features of Java.
                 The semantics is applied to model-check multi-threaded
                 programs. Both the test suite and the static semantics
                 are generic and ready to be used in other Java-related
                 projects.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "POPL '15 conference proceedings.",
}

@Article{Cai:2015:ADB,
  author =       "Yan Cai and Changjiang Jia and Shangru Wu and Ke Zhai
                 and Wing Kwong Chan",
  title =        "{ASN}: A Dynamic Barrier-Based Approach to
                 Confirmation of Deadlocks from Warnings for Large-Scale
                 Multithreaded Programs",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "26",
  number =       "1",
  pages =        "13--23",
  month =        jan,
  year =         "2015",
  CODEN =        "ITDSEO",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Feb 12 13:58:35 MST 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.computer.org/csdl/trans/td/2015/01/06747310-abs.html",
  abstract-URL = "http://www.computer.org/csdl/trans/td/2015/01/06747310-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@article{Chlipala:2015:NIM,
  author =       {Adam Chlipala},
  title =        {From Network Interface to Multithreaded {Web}
                 Applications: a Case Study in Modular Program
                 Verification},
  journal =      j-SIGPLAN,
  volume =       {50},
  number =       {1},
  pages =        {609--622},
  month =        jan,
  year =         {2015},
  CODEN =        {SINODQ},
  DOI =          {https://doi.org/10.1145/2775051.2677003},
  ISSN =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  ISSN-L =       {0362-1340},
  bibdate =      {Tue May 12 17:41:19 MDT 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract =     {Many verifications of realistic software systems are
                 monolithic, in the sense that they define single global
                 invariants over complete system state. More modular
                 proof techniques promise to support reuse of component
                 proofs and even reduce the effort required to verify
                 one concrete system, just as modularity simplifies
                 standard software development. This paper reports on
                 one case study applying modular proof techniques in the
                 Coq proof assistant. To our knowledge, it is the first
                 modular verification certifying a system that combines
                 infrastructure with an application of interest to end
                 users. We assume a nonblocking API for managing TCP
                 networking streams, and on top of that we work our way
                 up to certifying multithreaded, database-backed Web
                 applications. Key verified components include a
                 cooperative threading library and an implementation of
                 a domain-specific language for XML processing. We have
                 deployed our case-study system on mobile robots, where
                 it interfaces with off-the-shelf components for
                 sensing, actuation, and control.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGPLAN Notices},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark =       {POPL '15 conference proceedings.},
}

@article{Chlipala:2015:UWS,
  author =       {Adam Chlipala},
  title =        {{Ur\slash Web}: a Simple Model for Programming the
                 {Web}},
  journal =      j-SIGPLAN,
  volume =       {50},
  number =       {1},
  pages =        {153--165},
  month =        jan,
  year =         {2015},
  CODEN =        {SINODQ},
  DOI =          {https://doi.org/10.1145/2775051.2677004},
  ISSN =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  ISSN-L =       {0362-1340},
  bibdate =      {Tue May 12 17:41:19 MDT 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract =     {The World Wide Web has evolved gradually from a
                 document delivery platform to an architecture for
                 distributed programming. This largely unplanned
                 evolution is apparent in the set of interconnected
                 languages and protocols that any Web application must
                 manage. This paper presents Ur/Web, a domain-specific,
                 statically typed functional programming language with a
                 much simpler model for programming modern Web
                 applications. Ur/Web's model is unified, where programs
                 in a single programming language are compiled to other
                 ``Web standards'' languages as needed; supports novel
                 kinds of encapsulation of Web-specific state; and
                 exposes simple concurrency, where programmers can
                 reason about distributed, multithreaded applications
                 via a mix of transactions and cooperative preemption.
                 We give a tutorial introduction to the main features of
                 Ur/Web and discuss the language implementation and the
                 production Web applications that use it.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGPLAN Notices},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark =       {POPL '15 conference proceedings.},
}

@article{Curran:2015:IZM,
  author =       {B. W. Curran and C. Jacobi and J. J. Bonanno and D. A.
                 Schroter and K. J. Alexander and A. Puranik and M. M.
                 Helms},
  title =        {The {IBM z13} multithreaded microprocessor},
  journal =      j-IBM-JRD,
  volume =       {59},
  number =       {4--5},
  pages =        {1:1--1:13},
  month =        jul # {\slash } # sep,
  year =         {2015},
  CODEN =        {IBMJAE},
  ISSN =         {0018-8646 (print), 2151-8556 (electronic)},
  ISSN-L =       {0018-8646},
  bibdate =      {Wed Oct 21 11:38:12 MDT 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/ibmjrd.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IBM Journal of Research and Development},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520},
}

@article{Das:2015:SBP,
  author =       {Madan Das and Gabriel Southern and Jose Renau},
  title =        {Section-Based Program Analysis to Reduce Overhead of
                 Detecting Unsynchronized Thread Communication},
  journal =      j-TACO,
  volume =       {12},
  number =       {2},
  pages =        {23:1--23:??},
  month =        jul,
  year =         {2015},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2766451},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Fri Aug 7 09:46:00 MDT 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Most systems that test and verify parallel programs,
                 such as deterministic execution engines, data race
                 detectors, and software transactional memory systems,
                 require instrumenting loads and stores in an
                 application. This can cause a very significant runtime
                 and memory overhead compared to executing
                 uninstrumented code. Multithreaded programming
                 typically allows any thread to perform loads and stores
                 to any location in the process's address space
                 independently, and such tools monitor all these memory
                 accesses. However, many of the addresses in these
                 unsynchronized memory accesses are only used by a
                 single thread and do not affect other executing
                 threads. We propose Section-Based Program Analysis
                 (SBPA), a novel way to decompose the program into
                 disjoint code sections to identify and eliminate
                 instrumenting such loads and stores during program
                 compilation so that the program runtime overhead is
                 significantly reduced. Our analysis includes
                 improvements to pointer analysis and uses a few user
                 directives to increase the effectiveness of SBPA
                 further. We implemented SBPA for a deterministic
                 execution runtime environment and were able to
                 eliminate 51\% of dynamic memory access
                 instrumentations. When combined with directives, such
                 reduction increased to 63\%. We also integrated SBPA
                 with ThreadSanitizer, a state-of-the-art dynamic race
                 detector, and achieved a speedup of 2.43 (2.74 with
                 directives) on a geometric mean basis.},
  acknowledgement = ack-nhfb,
  articleno =    {23},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Ding:2015:OCA,
  author =       {Wei Ding and Xulong Tang and Mahmut Kandemir and
                 Yuanrui Zhang and Emre Kultursay},
  title =        {Optimizing off-chip accesses in multicores},
  journal =      j-SIGPLAN,
  volume =       {50},
  number =       {6},
  pages =        {131--142},
  month =        jun,
  year =         {2015},
  CODEN =        {SINODQ},
  DOI =          {https://doi.org/10.1145/2813885.2737989},
  ISSN =         {0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)},
  ISSN-L =       {0362-1340},
  bibdate =      {Tue Feb 16 12:01:41 MST 2016},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract =     {In a network-on-chip (NoC) based manycore
                 architecture, an off-chip data access (main memory
                 access) needs to travel through the on-chip network,
                 spending considerable amount of time within the chip
                 (in addition to the memory access latency). In
                 addition, it contends with on-chip (cache) accesses as
                 both use the same NoC resources. In this paper,
                 focusing on data-parallel, multithreaded applications,
                 we propose a compiler-based off-chip data access
                 localization strategy, which places data elements in
                 the memory space such that an off-chip access traverses
                 a minimum number of links (hops) to reach the memory
                 controller that handles this access. This brings three
                 main benefits. First, the network latency of off-chip
                 accesses gets reduced; second, the network latency of
                 on-chip accesses gets reduced; and finally, the memory
                 latency of off-chip accesses improves, due to reduced
                 queue latencies. We present an experimental evaluation
                 of our optimization strategy using a set of 13
                 multithreaded application programs under both private
                 and shared last-level caches. The results collected
                 emphasize the importance of optimizing the off-chip
                 data accesses.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGPLAN Notices},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark =       {PLDI '15 conference proceedings.},
}

@article{Fang:2015:MMD,
  author =       {Zhenman Fang and Sanyam Mehta and Pen-Chung Yew and
                 Antonia Zhai and James Greensky and Gautham Beeraka and
                 Binyu Zang},
  title =        {Measuring Microarchitectural Details of Multi- and
                 Many-Core Memory Systems through Microbenchmarking},
  journal =      j-TACO,
  volume =       {11},
  number =       {4},
  pages =        {55:1--55:??},
  month =        jan,
  year =         {2015},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2687356},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Mon Jan 12 11:38:56 MST 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {As multicore and many-core architectures evolve, their
                 memory systems are becoming increasingly more complex.
                 To bridge the latency and bandwidth gap between the
                 processor and memory, they often use a mix of
                 multilevel private/shared caches that are either
                 blocking or nonblocking and are connected by high-speed
                 network-on-chip. Moreover, they also incorporate
                 hardware and software prefetching and simultaneous
                 multithreading (SMT) to hide memory latency. On such
                 multi- and many-core systems, to incorporate various
                 memory optimization schemes using compiler
                 optimizations and performance tuning techniques, it is
                 crucial to have microarchitectural details of the
                 target memory system. Unfortunately, such details are
                 often unavailable from vendors, especially for newly
                 released processors. In this article, we propose a
                 novel microbenchmarking methodology based on short
                 elapsed-time events (SETEs) to obtain comprehensive
                 memory microarchitectural details in multi- and
                 many-core processors. This approach requires detailed
                 analysis of potential interfering factors that could
                 affect the intended behavior of such memory systems. We
                 lay out effective guidelines to control and mitigate
                 those interfering factors. Taking the impact of SMT
                 into consideration, our proposed methodology not only
                 can measure traditional cache/memory latency and
                 off-chip bandwidth but also can uncover the details of
                 software and hardware prefetching units not attempted
                 in previous studies. Using the newly released Intel
                 Xeon Phi many-core processor (with in-order cores) as
                 an example, we show how we can use a set of
                 microbenchmarks to determine various microarchitectural
                 features of its memory system (many are undocumented
                 from vendors). To demonstrate the portability and
                 validate the correctness of such a methodology, we use
                 the well-documented Intel Sandy Bridge multicore
                 processor (with out-of-order cores) as another example,
                 where most data are available and can be validated.
                 Moreover, to illustrate the usefulness of the measured
                 data, we do a multistage coordinated data prefetching
                 case study on both Xeon Phi and Sandy Bridge and show
                 that by using the measured data, we can achieve 1.3X
                 and 1.08X performance speedup, respectively, compared
                 to the state-of-the-art Intel ICC compiler. We believe
                 that these measurements also provide useful insights
                 into memory optimization, analysis, and modeling of
                 such multicore and many-core architectures.},
  acknowledgement = ack-nhfb,
  articleno =    {55},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Farzan:2015:PSU,
  author =       "Azadeh Farzan and Zachary Kincaid and Andreas
                 Podelski",
  title =        "Proof Spaces for Unbounded Parallelism",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "1",
  pages =        "407--420",
  month =        jan,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2775051.2677012",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue May 12 17:41:19 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "In this paper, we present a new approach to
                 automatically verify multi-threaded programs which are
                 executed by an unbounded number of threads running in
                 parallel. The starting point for our work is the
                 problem of how we can leverage existing automated
                 verification technology for sequential programs
                 (abstract interpretation, Craig interpolation,
                 constraint solving, etc.) for multi-threaded programs.
                 Suppose that we are given a correctness proof for a
                 trace of a program (or for some other program
                 fragment). We observe that the proof can always be
                 decomposed into a finite set of Hoare triples, and we
                 ask what can be proved from the finite set of Hoare
                 triples using only simple combinatorial inference rules
                 (without access to a theorem prover and without the
                 possibility to infer genuinely new Hoare triples)? We
                 introduce a proof system where one proves the
                 correctness of a multi-threaded program by showing that
                 for each trace of the program, there exists a
                 correctness proof in the space of proofs that are
                 derivable from a finite set of axioms using simple
                 combinatorial inference rules. This proof system is
                 complete with respect to the classical proof method of
                 establishing an inductive invariant (which uses thread
                 quantification and control predicates). Moreover, it is
                 possible to algorithmically check whether a given set
                 of axioms is sufficient to prove the correctness of a
                 multi-threaded program, using ideas from
                 well-structured transition systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "POPL '15 conference proceedings.",
}

@Article{Ghosh:2015:NCC,
  author =       "Ammlan Ghosh and Rituparna Chaki and Nabendu Chaki",
  title =        "A new concurrency control mechanism for multi-threaded
                 environment using transactional memory",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "71",
  number =       "11",
  pages =        "4095--4115",
  month =        nov,
  year =         "2015",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-015-1507-8",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Mon Jan 25 08:18:10 MST 2016",
  bibsource =    "http://link.springer.com/journal/11227/71/11;
                 https://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer.com/article/10.1007/s11227-015-1507-8;
                 http://link.springer.com/content/pdf/10.1007/s11227-015-1507-8.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Halappanavar:2015:CLL,
  author =       "Mahantesh Halappanavar and Alex Pothen and Ariful Azad
                 and Fredrik Manne and Johannes Langguth and Arif Khan",
  title =        "Codesign Lessons Learned from Implementing Graph
                 Matching on Multithreaded Architectures",
  journal =      j-COMPUTER,
  volume =       "48",
  number =       "8",
  pages =        "46--55",
  month =        aug,
  year =         "2015",
  CODEN =        "CPTRB4",
  DOI =          "https://doi.org/10.1109/MC.2015.215",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Tue Nov 3 07:04:37 MST 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/computer2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/csdl/mags/co/2015/08/mco2015080046-abs.html",
  abstract-URL = "http://csdl.computer.org/csdl/mags/co/2015/08/mco2015080046-abs.html",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://www.computer.org/portal/web/csdl/magazines/computer",
}

@Article{Hottelier:2015:SLE,
  author =       "Thibaud Hottelier and Rastislav Bodik",
  title =        "Synthesis of layout engines from relational
                 constraints",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "10",
  pages =        "74--88",
  month =        oct,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2858965.2814291",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:43 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "We present an algorithm for synthesizing efficient
                 document layout engines from compact relational
                 specifications. These specifications are compact in
                 that a single specification can produce multiple
                 engines, each for a distinct layout situation, i.e., a
                 different combination of known vs. unknown attributes.
                 Technically, our specifications are relational
                 attribute grammars, while our engines are functional
                 attribute grammars. By synthesizing functions from
                 relational constraints, we obviate the need for
                 constraint solving at runtime, because functional
                 attribute grammars can be easily evaluated according to
                 a fixed schedule, sidestepping the backtracking search
                 performed by constraint solvers. Our experiments show
                 that we can generate layout engines for non-trivial
                 data visualizations, and that our synthesized engines
                 are between 39- and 200-times faster than
                 general-purpose constraint solvers. Relational
                 specifications of layout give rise to synthesis
                 problems that have previously proved intractable. Our
                 algorithm exploits the hierarchical, grammar-based
                 structure of the specification, decomposing the
                 specification into smaller subproblems, which can be
                 tackled with off-the-shelf synthesis procedures. The
                 new synthesis problem then becomes the composition of
                 the functions thus generated into a correct attribute
                 grammar, which might be recursive. We show how to solve
                 this problem by efficient reduction to an SMT
                 problem.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "OOPSLA '15 conference proceedings.",
}

@Article{Huang:2015:COM,
  author =       "Kai Huang and Min Yu and Rongjie Yan and Xiaomeng
                 Zhang and Xiaolang Yan and Lisane Brisolara and Ahmed
                 Amine Jerraya and Jiong Feng",
  title =        "Communication Optimizations for Multithreaded Code
                 Generation from {Simulink} Models",
  journal =      j-TECS,
  volume =       "14",
  number =       "3",
  pages =        "59:1--59:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2644811",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Sat Dec 9 08:08:56 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tecs.bib",
  abstract =     "Communication frequency is increasing with the growing
                 complexity of emerging embedded applications and the
                 number of processors in the implemented multiprocessor
                 SoC architectures. In this article, we consider the
                 issue of communication cost reduction during
                 multithreaded code generation from partitioned Simulink
                 models to help designers in code optimization to
                 improve system performance. We first propose a
                 technique combining message aggregation and
                 communication pipeline methods, which groups
                 communications with the same destinations and sources
                 and parallelizes communication and computation tasks.
                 We also present a method to apply static analysis and
                 dynamic emulation for efficient communication buffer
                 allocation to further reduce synchronization cost and
                 increase processor utilization. The existing cyclic
                 dependency in the mapped model may hinder the
                 effectiveness of the two techniques. We further propose
                 a set of optimizations involving repartition with
                 strongly connected threads to maximize the degree of
                 communication reduction and preprocessing strategies
                 with available delays in the model to reduce the number
                 of communication channels that cannot be optimized.
                 Experimental results demonstrate the advantages of the
                 proposed optimizations with 11--143\% throughput
                 improvement.",
  acknowledgement = ack-nhfb,
  articleno =    "59",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
}

@Article{Hussein:2015:DRM,
  author =       "Ahmed Hussein and Antony L. Hosking and Mathias Payer
                 and Christopher A. Vick",
  title =        "Don't race the memory bus: taming the {GC} leadfoot",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "11",
  pages =        "15--27",
  month =        nov,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2887746.2754182",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:44 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Dynamic voltage and frequency scaling (DVFS) is
                 ubiquitous on mobile devices as a mechanism for saving
                 energy. Reducing the clock frequency of a processor
                 allows a corresponding reduction in power consumption,
                 as does turning off idle cores. Garbage collection is a
                 canonical example of the sort of memory-bound workload
                 that best responds to such scaling. Here, we explore
                 the impact of frequency scaling for garbage collection
                 in a real mobile device running Android's Dalvik
                 virtual machine, which uses a concurrent collector. By
                 controlling the frequency of the core on which the
                 concurrent collector thread runs we can reduce power
                 significantly. Running established multi-threaded
                 benchmarks shows that total processor energy can be
                 reduced up to 30\%, with end-to-end performance loss of
                 at most 10\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ISMM '15 conference proceedings.",
}

@Article{Jeon:2015:MTH,
  author =       "Yongkweon Jeon and Sungroh Yoon",
  title =        "Multi-Threaded Hierarchical Clustering by Parallel
                 Nearest-Neighbor Chaining",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "26",
  number =       "9",
  pages =        "2534--2548",
  month =        sep,
  year =         "2015",
  CODEN =        "ITDSEO",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Mon Sep 28 12:20:25 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.computer.org/csdl/trans/td/2015/09/06893001.pdf",
  abstract-URL = "http://www.computer.org/csdl/trans/td/2015/09/06893001-abs.html",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Kandemir:2015:MRR,
  author =       "Mahmut Kandemir and Hui Zhao and Xulong Tang and
                 Mustafa Karakoy",
  title =        "Memory Row Reuse Distance and its Role in Optimizing
                 Application Performance",
  journal =      j-SIGMETRICS,
  volume =       "43",
  number =       "1",
  pages =        "137--149",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2796314.2745867",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Fri Sep 18 06:59:51 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigmetrics.bib",
  abstract =     "Continuously increasing dataset sizes of large-scale
                 applications overwhelm on-chip cache capacities and
                 make the performance of last-level caches (LLC)
                 increasingly important. That is, in addition to
                 maximizing LLC hit rates, it is becoming equally
                 important to reduce LLC miss latencies. One of the
                 critical factors that influence LLC miss latencies is
                 row-buffer locality (i.e., the fraction of LLC misses
                 that hit in the large buffer attached to a memory
                 bank). While there has been a plethora of recent works
                 on optimizing row-buffer performance, to our knowledge,
                 there is no study that quantifies the full potential of
                 row-buffer locality and impact of maximizing it on
                 application performance. Focusing on multithreaded
                 applications, the first contribution of this paper is
                 the definition of a new metric called (memory) row
                 reuse distance (RRD). We show that, while intra-core
                 RRDs are relatively small (increasing the chances for
                 row-buffer hits), inter-core RRDs are quite large
                 (increasing the chances for row-buffer misses).
                 Motivated by this, we propose two schemes that measure
                 the maximum potential benefits that could be obtained
                 from minimizing RRDs, to the extent allowed by program
                 dependencies. Specifically, one of our schemes
                 (Scheme-I) targets only intra-core RRDs, whereas the
                 other one (Scheme-II) aims at reducing both intra-core
                 RRDs and inter-core RRDs. Our experimental evaluations
                 demonstrate that (i) Scheme-I reduces intra-core RRDs
                 but increases inter-core RRDs; (ii) Scheme-II reduces
                 inter-core RRDs significantly while achieving a similar
                 behavior to Scheme-I as far as intra-core RRDs are
                 concerned; (iii) Scheme-I and Scheme-II improve
                 execution times of our applications by 17\% and 21\%,
                 respectively, on average; and (iv) both our schemes
                 deliver consistently good results under different
                 memory request scheduling policies.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J618",
}

@Article{Kasikci:2015:ACD,
  author =       "Baris Kasikci and Cristian Zamfir and George Candea",
  title =        "Automated Classification of Data Races Under Both
                 Strong and Weak Memory Models",
  journal =      j-TOPLAS,
  volume =       "37",
  number =       "3",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2015",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/2734118",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Fri Jun 19 05:36:55 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toplas.bib",
  abstract =     "Data races are one of the main causes of concurrency
                 problems in multithreaded programs. Whether all data
                 races are bad, or some are harmful and others are
                 harmless, is still the subject of vigorous scientific
                 debate [Narayanasamy et al. 2007; Boehm 2012]. What is
                 clear, however, is that today's code has many data
                 races [Kasikci et al. 2012; Jin et al. 2012; Erickson
                 et al. 2010], and fixing data races without introducing
                 bugs is time consuming [Godefroid and Nagappan 2008].
                 Therefore, it is important to efficiently identify data
                 races in code and understand their consequences to
                 prioritize their resolution. We present Portend$^+$, a
                 tool that not only detects races but also automatically
                 classifies them based on their potential consequences:
                 Could they lead to crashes or hangs? Could their
                 effects be visible outside the program? Do they appear
                 to be harmless? How do their effects change under weak
                 memory models? Our proposed technique achieves high
                 accuracy by efficiently analyzing multiple paths and
                 multiple thread schedules in combination, and by
                 performing symbolic comparison between program outputs.
                 We ran Portend$^+$ on seven real-world applications: it
                 detected 93 true data races and correctly classified 92
                 of them, with no human effort. Six of them were harmful
                 races. Portend$^+$ 's classification accuracy is up to
                 89\% higher than that of existing tools, and it
                 produces easy-to-understand evidence of the
                 consequences of ``harmful'' races, thus both proving
                 their harmfulness and making debugging easier. We
                 envision Portend$^+$ being used for testing and
                 debugging, as well as for automatically triaging bug
                 reports.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

@Article{Kerrison:2015:EMS,
  author =       "Steve Kerrison and Kerstin Eder",
  title =        "Energy Modeling of Software for a Hardware
                 Multithreaded Embedded Microprocessor",
  journal =      j-TECS,
  volume =       "14",
  number =       "3",
  pages =        "56:1--56:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700104",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Sat Dec 9 08:08:56 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tecs.bib",
  abstract =     "This article examines a hardware multithreaded
                 microprocessor and discusses the impact such an
                 architecture has on existing software energy modeling
                 techniques. A framework is constructed for analyzing
                 the energy behavior of the XMOS XS1-L multithreaded
                 processor and a variation on existing software energy
                 models is proposed, based on analysis of collected
                 energy data. It is shown that by combining execution
                 statistics with sufficient data on the processor's
                 thread activity and instruction execution costs, a
                 multithreaded software energy model used with
                 Instruction Set Simulation can yield an average error
                 margin of less than 7\%.",
  acknowledgement = ack-nhfb,
  articleno =    "56",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
}

@Article{Kestor:2015:TPD,
  author =       "Gokcen Kestor and Osman S. Unsal and Adrian Cristal
                 and Serdar Tasiran",
  title =        "{TRADE}: Precise Dynamic Race Detection for Scalable
                 Transactional Memory Systems",
  journal =      j-TOPC,
  volume =       "2",
  number =       "2",
  pages =        "11:1--11:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2786021",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Aug 7 10:22:35 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "As other multithreaded programs, transactional memory
                 (TM) programs are prone to race conditions. Previous
                 work focuses on extending existing definitions of data
                 race for lock-based applications to TM applications,
                 which requires all transactions to be totally ordered
                 ``as if'' serialized by a global lock. This approach
                 poses implementation constraints on the STM that
                 severely limits TM applications' performance. This
                 article shows that forcing total ordering among all
                 running transactions, while sufficient, is not
                 necessary. We introduce an alternative data race
                 definition, relaxed transactional data race, that
                 requires ordering of only conflicting transactions. The
                 advantages of our relaxed definition are twofold:
                 First, unlike the previous definition, this definition
                 can be applied to a wide range of TMs, including those
                 that do not enforce transaction total ordering. Second,
                 within a single execution, it exposes a higher number
                 of data races, which considerably reduces debugging
                 time. Based on this definition, we propose a novel and
                 precise race detection tool for C/C++ TM applications
                 (TRADE), which detects data races by tracking
                 happens-before edges among conflicting transactions.
                 Our experiments reveal that TRADE precisely detects
                 data races for STAMP applications running on modern
                 STMs with overhead comparable to state-of-the-art race
                 detectors for lock-based applications. Our experiments
                 also show that in a single run, TRADE identifies
                 several races not discovered by 10 separate runs of a
                 race detection tool based on the previous data race
                 definition.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Kocberber:2015:AMA,
  author =       "Onur Kocberber and Babak Falsafi and Boris Grot",
  title =        "Asynchronous memory access chaining",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "9",
  number =       "4",
  pages =        "252--263",
  month =        dec,
  year =         "2015",
  CODEN =        "????",
  ISSN =         "2150-8097",
  bibdate =      "Sat Dec 19 17:42:25 MST 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "In-memory databases rely on pointer-intensive data
                 structures to quickly locate data in memory. A single
                 lookup operation in such data structures often exhibits
                 long-latency memory stalls due to dependent pointer
                 dereferences. Hiding the memory latency by launching
                 additional memory accesses for other lookups is an
                 effective way of improving performance of
                 pointer-chasing codes (e.g., hash table probes, tree
                 traversals). The ability to exploit such inter-lookup
                 parallelism is beyond the reach of modern out-of-order
                 cores due to the limited size of their instruction
                 window. Instead, recent work has proposed software
                 prefetching techniques that exploit inter-lookup
                 parallelism by arranging a set of independent lookups
                 into a group or a pipeline, and navigate their
                 respective pointer chains in a synchronized fashion.
                 While these techniques work well for highly regular
                 access patterns, they break down in the face of
                 irregularity across lookups. Such irregularity includes
                 variable-length pointer chains, early exit, and
                 read/write dependencies. This work introduces
                 Asynchronous Memory Access Chaining (AMAC), a new
                 approach for exploiting inter-lookup parallelism to
                 hide the memory access latency. AMAC achieves high
                 dynamism in dealing with irregularity across lookups by
                 maintaining the state of each lookup separately from
                 that of other lookups. This feature enables AMAC to
                 initiate a new lookup as soon as any of the in-flight
                 lookups complete. In contrast, the static arrangement
                 of lookups into a group or pipeline in existing
                 techniques precludes such adaptivity. Our results show
                 that AMAC matches or outperforms state-of-the-art
                 prefetching techniques on regular access patterns,
                 while delivering up to 2.3x higher performance under
                 irregular data structure lookups. AMAC fully utilizes
                 the available microarchitectural resources, generating
                 the maximum number of memory accesses allowed by
                 hardware in both single- and multi-threaded execution
                 modes.",
  acknowledgement = ack-nhfb,
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Kubica:2015:PHT,
  author =       "Bartlomiej Jacek Kubica",
  title =        "Presentation of a highly tuned multithreaded interval
                 solver for underdetermined and well-determined
                 nonlinear systems",
  journal =      j-NUMER-ALGORITHMS,
  volume =       "70",
  number =       "4",
  pages =        "929--963",
  month =        dec,
  year =         "2015",
  CODEN =        "NUALEG",
  DOI =          "https://doi.org/10.1007/s11075-015-9980-y",
  ISSN =         "1017-1398 (print), 1572-9265 (electronic)",
  ISSN-L =       "1017-1398",
  bibdate =      "Mon Jan 25 08:55:03 MST 2016",
  bibsource =    "http://link.springer.com/journal/11075/70/4;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/numeralgorithms.bib",
  URL =          "http://link.springer.com/article/10.1007/s11075-015-9980-y;
                 http://link.springer.com/content/pdf/10.1007/s11075-015-9980-y.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Numerical Algorithms",
  journal-URL =  "http://link.springer.com/journal/11075",
}

@Article{Kuszmaul:2015:SSF,
  author =       "Bradley C. Kuszmaul",
  title =        "{SuperMalloc}: a super fast multithreaded {\tt malloc}
                 for 64-bit machines",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "11",
  pages =        "41--55",
  month =        nov,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2887746.2754178",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:44 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "SuperMalloc is an implementation of malloc(3)
                 originally designed for X86 Hardware Transactional
                 Memory (HTM)@. It turns out that the same design
                 decisions also make it fast even without HTM@. For the
                 malloc-test benchmark, which is one of the most
                 difficult workloads for an allocator, with one thread
                 SuperMalloc is about 2.1 times faster than the best of
                 DLmalloc, JEmalloc, Hoard, and TBBmalloc; with 8
                 threads and HTM, SuperMalloc is 2.75 times faster; and
                 on 32 threads without HTM SuperMalloc is 3.4 times
                 faster. SuperMalloc generally compares favorably with
                 the other allocators on speed, scalability, speed
                 variance, memory footprint, and code size. SuperMalloc
                 achieves these performance advantages using less than
                 half as much code as the alternatives. SuperMalloc
                 exploits the fact that although physical memory is
                 always precious, virtual address space on a 64-bit
                 machine is relatively cheap. It allocates 2 MiB chunks
                 which contain objects all the same size. To translate
                 chunk numbers to chunk metadata, SuperMalloc uses a
                 simple array (most of which is uncommitted to physical
                 memory). SuperMalloc takes care to avoid associativity
                 conflicts in the cache: most of the size classes are a
                 prime number of cache lines, and nonaligned huge
                 accesses are randomly aligned within a page. Objects
                 are allocated from the fullest non-full page in the
                 appropriate size class. For each size class,
                 SuperMalloc employs a 10-object per-thread cache, a
                 per-CPU cache that holds about a level-2-cache worth of
                 objects per size class, and a global cache that is
                 organized to allow the movement of many objects between
                 a per-CPU cache and the global cache using $ O(1) $
                 instructions. SuperMalloc prefetches everything it can
                 before starting a critical section, which makes the
                 critical sections run fast, and for HTM improves the
                 odds that the transaction will commit.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ISMM '15 conference proceedings.",
}

%%% NOTE(review): issue number is recorded as unknown ("??") and no DOI is
%%% on file for this Elsevier article; only the ScienceDirect URL below
%%% identifies it -- add a doi field when one is known.
@Article{Lai:2015:SAM,
  author =       "Bo-Cheng Charles Lai and Kun-Chun Li and Guan-Ru Li
                 and Chin-Hsuan Chiang",
  title =        "Self adaptable multithreaded object detection on
                 embedded multicore systems",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "78",
  number =       "??",
  pages =        "25--38",
  month =        apr,
  year =         "2015",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Sat Mar 21 09:26:08 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0743731515000192",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315/",
}

%%% PLDI '15 conference paper republished in SIGPLAN Notices 50(6) (see the
%%% remark field); record is complete with DOI, ISSN, and quoted abstract.
@Article{Lal:2015:DID,
  author =       "Akash Lal and Shaz Qadeer",
  title =        "{DAG} inlining: a decision procedure for
                 reachability-modulo-theories in hierarchical programs",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "6",
  pages =        "280--290",
  month =        jun,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2813885.2737987",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:41 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "A hierarchical program is one with multiple procedures
                 but no loops or recursion. This paper studies the
                 problem of deciding reachability queries in
                 hierarchical programs where individual statements can
                 be encoded in a decidable logic (say in SMT). This
                 problem is fundamental to verification and most
                 directly applicable to doing bounded reachability in
                 programs, i.e., reachability under a bound on the
                 number of loop iterations and recursive calls. The
                 usual method of deciding reachability in hierarchical
                 programs is to first inline all procedures and then do
                 reachability on the resulting single-procedure program.
                 Such inlining unfolds the call graph of the program to
                 a tree and may lead to an exponential increase in the
                 size of the program. We design and evaluate a method
                 called DAG inlining that unfolds the call graph to a
                 directed acyclic graph (DAG) instead of a tree by
                 sharing the bodies of procedures at certain points
                 during inlining. DAG inlining can produce much more
                 compact representations than tree inlining.
                 Empirically, we show that it leads to significant
                 improvements in the running time of a state-of-the-art
                 verifier.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '15 conference proceedings.",
}

%%% NOTE(review): issue number is recorded as unknown ("??") and no DOI is
%%% on file for this Elsevier article; only the ScienceDirect URL below
%%% identifies it -- add a doi field when one is known.
@Article{LaSalle:2015:MTM,
  author =       "Dominique LaSalle and George Karypis",
  title =        "Multi-threaded modularity based graph clustering using
                 the multilevel paradigm",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "76",
  number =       "??",
  pages =        "66--80",
  month =        feb,
  year =         "2015",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Mon Mar 9 10:30:03 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0743731514001750",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315/",
}

%%% HEART '15 paper republished in SIGARCH Computer Architecture News 43(4)
%%% (see the remark field); record is complete with DOI, ISSN, and abstract.
@Article{Lashgar:2015:CSR,
  author =       "Ahmad Lashgar and Ebad Salehi and Amirali Baniasadi",
  title =        "A Case Study in Reverse Engineering {GPGPUs}:
                 Outstanding Memory Handling Resources",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "4",
  pages =        "15--21",
  month =        sep,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2927964.2927968",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Apr 22 17:03:53 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "During recent years, GPU micro-architectures have
                 changed dramatically, evolving into powerful many-core
                 deep-multithreaded platforms for parallel workloads.
                 While important micro-architectural modifications
                 continue to appear in every new generation of these
                 processors, unfortunately, little is known about the
                 details of these innovative designs. One of the key
                 questions in understanding GPUs is how they deal with
                 outstanding memory misses. Our goal in this study is to
                 find answers to this question. To this end, we develop
                 a set of micro-benchmarks in CUDA to understand the
                 outstanding memory requests handling resources.
                 Particularly, we study two NVIDIA GPGPUs (Fermi and
                 Kepler) and estimate their capability in handling
                 outstanding memory requests. We show that Kepler can
                 issue nearly 32X higher number of outstanding memory
                 requests, compared to Fermi. We explain this
                 enhancement by Kepler's architectural modifications in
                 outstanding memory request handling resources.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "HEART '15 conference proceedings.",
}

%%% PLDI '15 conference paper republished in SIGPLAN Notices 50(6) (see the
%%% remark field); record is complete with DOI, ISSN, and quoted abstract.
@Article{Liu:2015:LRT,
  author =       "Peng Liu and Xiangyu Zhang and Omer Tripp and Yunhui
                 Zheng",
  title =        "{Light}: replay via tightly bounded recording",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "6",
  pages =        "55--64",
  month =        jun,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2813885.2738001",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:41 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Reproducing concurrency bugs is a prominent challenge.
                 Existing techniques either rely on recording very fine
                 grained execution information and hence have high
                 runtime overhead, or strive to log as little
                 information as possible but provide no guarantee in
                 reproducing a bug. We present Light, a technique that
                 features much lower overhead compared to techniques
                 based on fine grained recording, and that guarantees to
                 reproduce concurrent bugs. We leverage and formally
                 prove that recording flow dependences is the necessary
                 and sufficient condition to reproduce a concurrent bug.
                 The flow dependences, together with the thread local
                 orders that can be automatically inferred (and hence
                 not logged), are encoded as scheduling constraints. An
                 SMT solver is used to derive a replay schedule, which
                 is guaranteed to exist even though it may be different
                 from the original schedule. Our experiments show that
                 Light has only 44\% logging overhead, almost one order
                 of magnitude lower than the state of the art techniques
                 relying on logging memory accesses. Its space overhead
                 is only 10\% of those techniques. Light can also
                 reproduce all the bugs we have collected whereas
                 existing techniques miss some of them.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '15 conference proceedings.",
}

%%% PLDI '15 conference paper republished in SIGPLAN Notices 50(6) (see the
%%% remark field); record is complete with DOI, ISSN, and quoted abstract.
@Article{Machado:2015:CDD,
  author =       "Nuno Machado and Brandon Lucia and Lu{\'\i}s
                 Rodrigues",
  title =        "Concurrency debugging with differential schedule
                 projections",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "6",
  pages =        "586--595",
  month =        jun,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2813885.2737973",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:41 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "We present Symbiosis: a concurrency debugging
                 technique based on novel differential schedule
                 projections (DSPs). A DSP shows the small set of memory
                 operations and data-flows responsible for a failure, as
                 well as a reordering of those elements that avoids the
                 failure. To build a DSP, Symbiosis first generates a
                 full, failing, multithreaded schedule via thread path
                 profiling and symbolic constraint solving. Symbiosis
                 selectively reorders events in the failing schedule to
                 produce a non-failing, alternate schedule. A DSP
                 reports the ordering and data-flow differences between
                 the failing and non-failing schedules. Our evaluation
                 on buggy real-world software and benchmarks shows that,
                 in practical time, Symbiosis generates DSPs that both
                 isolate the small fraction of event orders and
                 data-flows responsible for the failure, and show which
                 event reorderings prevent failing. In our experiments,
                 DSPs contain 81\% fewer events and 96\% less data-flows
                 than the full failure-inducing schedules. Moreover, by
                 allowing developers to focus on only a few events, DSPs
                 reduce the amount of time required to find a valid
                 fix.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '15 conference proceedings.",
}

%%% NOTE(review): CODEN is recorded as unknown ("????", this file's
%%% convention for missing data); record otherwise complete with DOI.
@Article{Makreshanski:2015:LSE,
  author =       "Darko Makreshanski and Justin Levandoski and Ryan
                 Stutsman",
  title =        "To lock, swap, or elide: on the interplay of hardware
                 transactional memory and lock-free indexing",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "11",
  pages =        "1298--1309",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/2809974.2809990",
  ISSN =         "2150-8097",
  bibdate =      "Thu Jul 30 16:13:08 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The release of hardware transactional memory (HTM) in
                 commodity CPUs has major implications on the design and
                 implementation of main-memory databases, especially on
                 the architecture of high-performance lock-free indexing
                 methods at the core of several of these systems. This
                 paper studies the interplay of HTM and lock-free
                 indexing methods. First, we evaluate whether HTM will
                 obviate the need for crafty lock-free index designs by
                 integrating it in a traditional B-tree architecture.
                 HTM performs well for simple data sets with small
                 fixed-length keys and payloads, but its benefits
                 disappear for more complex scenarios (e.g., larger
                 variable-length keys and payloads), making it
                 unattractive as a general solution for achieving high
                 performance. Second, we explore fundamental differences
                 between HTM-based and lock-free B-tree designs. While
                 lock-freedom entails design complexity and extra
                 mechanism, it has performance advantages in several
                 scenarios, especially high-contention cases where
                 readers proceed uncontested (whereas HTM aborts
                 readers). Finally, we explore the use of HTM as a
                 method to simplify lock-free design. We find that using
                 HTM to implement a multi-word compare-and-swap greatly
                 reduces lock-free programming complexity at the cost of
                 only a 10-15\% performance degradation. Our study uses
                 two state-of-the-art index implementations: a
                 memory-optimized B-tree extended with HTM to provide
                 multi-threaded concurrency and the Bw-tree lock-free
                 B-tree used in several Microsoft production
                 environments.",
  acknowledgement = ack-nhfb,
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Record carries Web of Science-derived metadata (affiliation, da,
%%% doc-delivery-number, ORCID-numbers, researcherid-numbers, times-cited,
%%% unique-id, web-of-science-categories); standard styles ignore these
%%% extra fields, so they are harmless annotations.
@Article{Markovic:2015:TLS,
  author =       "Nikola Markovic and Daniel Nemirovsky and Osman Unsal
                 and Mateo Valero and Adrian Cristal",
  title =        "Thread Lock Section-Aware Scheduling on Asymmetric
                 Single-{ISA} Multi-Core",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "160--163",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2357805",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "As thread level parallelism in applications has
                 continued to expand, so has research in chip multi-core
                 processors. As more and more applications become
                 multi-threaded we expect to find a growing number of
                 threads executing on a machine. As a consequence, the
                 operating system will require increasingly larger
                 amounts of CPU time to schedule these threads
                 efficiently. Instead of perpetuating the trend of
                 performing more complex thread scheduling in the
                 operating system, we propose a scheduling mechanism
                 that can be efficiently implemented in hardware as
                 well. Our approach of identifying multi-threaded
                 application bottlenecks such as thread synchronization
                 sections complements the Fairness-aware Scheduler
                 method. It achieves an average speed up of 11.5 percent
                 (geometric mean) compared to the state-of-the-art
                 Fairness-aware Scheduler.",
  acknowledgement = ack-nhfb,
  affiliation =  "Markovic, N (Reprint Author), Barcelona Supercomputing
                 Ctr, Barcelona, Spain. Markovic, Nikola; Nemirovsky,
                 Daniel; Unsal, Osman; Valero, Mateo, Barcelona
                 Supercomputing Ctr, Barcelona, Spain. Markovic, Nikola;
                 Nemirovsky, Daniel; Valero, Mateo, Univ Politecn
                 Cataluna, Barcelona, Spain. Cristal, Adrian, Univ
                 Politecn Cataluna, Barcelona Supercomputing Ctr,
                 E-08028 Barcelona, Spain. Cristal, Adrian, Artificial
                 Intelligence Res Inst Spanish Natl Res, Barcelona,
                 Spain.",
  author-email = "nikola.markovic@bsc.es daniel.nemirovsky@bsc.es
                 osman.unsal@bsc.es mateo.valero@bsc.es
                 adrian.cristal@bsc.es",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Asymmetric chip multiprocessor (ACMP); HW/SW thread
                 scheduling; multi-threaded applications",
  number-of-cited-references = "17",
  ORCID-numbers = "UNSAL, OSMAN/0000-0002-0544-9697 Valero,
                 Mateo/0000-0003-2917-2482",
  research-areas = "Computer Science",
  researcherid-numbers = "UNSAL, OSMAN/B-9161-2016 Valero,
                 Mateo/L-5709-2014",
  times-cited =  "7",
  unique-id =    "Markovic:2015:TLS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% NOTE(review): final page of the range is unknown ("52:1--52:??"); the
%%% articleno field (52) matches the page prefix. Complete when final
%%% pagination is available.
@Article{Matheou:2015:ASD,
  author =       "George Matheou and Paraskevas Evripidou",
  title =        "Architectural Support for Data-Driven Execution",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "52:1--52:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2686874",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The exponential growth of sequential processors has
                 come to an end, and thus, parallel processing is
                 probably the only way to achieve performance growth. We
                 propose the development of parallel architectures based
                 on data-driven scheduling. Data-driven scheduling
                 enforces only a partial ordering as dictated by the
                 true data dependencies, which is the minimum
                 synchronization possible. This is very beneficial for
                 parallel processing because it enables it to exploit
                 the maximum possible parallelism. We provide
                 architectural support for data-driven execution for the
                 Data-Driven Multithreading (DDM) model. In the past,
                 DDM has been evaluated mostly in the form of virtual
                 machines. The main contribution of this work is the
                 development of a highly efficient hardware support for
                 data-driven execution and its integration into a
                 multicore system with eight cores on a Virtex-6 FPGA.
                 The DDM semantics make barriers and cache coherence
                 unnecessary, which reduces the synchronization
                 latencies significantly and makes the cache simpler.
                 The performance evaluation has shown that the support
                 for data-driven execution is very efficient with
                 negligible overheads. Our prototype can support very
                 small problem sizes (matrix $ 16 \times 16$) and
                 ultra-lightweight threads (block of $ 4 \times 4$) that
                 achieve speedups close to linear. Such results cannot
                 be achieved by software-based systems.",
  acknowledgement = ack-nhfb,
  articleno =    "52",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% NOTE(review): month was recorded as unknown ("????"); inferred as oct
%%% from volume 64, number 10 of the monthly IEEE Transactions on
%%% Computers (bibdate is also October 2015) -- confirm against the issue.
@Article{McCartney:2015:SMT,
  author =       "W. P. McCartney and N. Sridhar",
  title =        "Stackless Multi-Threading for Embedded Systems",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "64",
  number =       "10",
  pages =        "2940--2952",
  month =        oct,
  year =         "2015",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2014.2378256",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Tue Oct 13 06:51:51 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

%%% Springer International Journal of Parallel Programming article; record
%%% is complete with DOI, ISSN, and publisher URL.
@Article{Mehta:2015:MTP,
  author =       "Kshitij Mehta and Edgar Gabriel",
  title =        "Multi-Threaded Parallel {I/O} for {OpenMP}
                 Applications",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "43",
  number =       "2",
  pages =        "286--309",
  month =        apr,
  year =         "2015",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-014-0306-9",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Sat Aug 8 12:34:16 MDT 2015",
  bibsource =    "http://link.springer.com/journal/10766/43/2;
                 https://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer.com/article/10.1007/s10766-014-0306-9",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

%%% NOTE(review): final page of the range is unknown ("43:1--43:??"); the
%%% articleno field (43) matches the page prefix. Complete when final
%%% pagination is available.
@Article{Mitchell:2015:GIA,
  author =       "Nathan Mitchell and Court Cutting and Eftychios
                 Sifakis",
  title =        "{GRIDiron}: an interactive authoring and cognitive
                 training foundation for reconstructive plastic surgery
                 procedures",
  journal =      j-TOG,
  volume =       "34",
  number =       "4",
  pages =        "43:1--43:??",
  month =        aug,
  year =         "2015",
  CODEN =        "ATGRDF",
  DOI =          "https://doi.org/10.1145/2766918",
  ISSN =         "0730-0301 (print), 1557-7368 (electronic)",
  ISSN-L =       "0730-0301",
  bibdate =      "Tue Jul 28 17:22:44 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tog/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tog.bib",
  abstract =     "We present an interactive simulation framework for
                 authoring surgical procedures of soft tissue
                 manipulation using physics-based simulation to animate
                 the flesh. This interactive authoring tool can be used
                 by clinical educators to craft three-dimensional
                 illustrations of the intricate maneuvers involved in
                 craniofacial repairs, in contrast to two-dimensional
                 sketches and still photographs which are the medium
                 used to describe these procedures in the traditional
                 surgical curriculum. Our virtual environment also
                 allows surgeons-in-training to develop cognitive skills
                 for craniofacial surgery by experimenting with
                 different approaches to reconstructive challenges,
                 adapting stock techniques to flesh regions with
                 nonstandard shape, and reach preliminary predictions
                 about the feasibility of a given repair plan. We use a
                 Cartesian grid-based embedded discretization of
                 nonlinear elasticity to maximize regularity, and expose
                 opportunities for aggressive multithreading and SIMD
                 accelerations. Using a grid-based approach facilitates
                 performance and scalability, but constrains our ability
                 to capture the topology of thin surgical incisions. We
                 circumvent this restriction by hybridizing the
                 grid-based discretization with an explicit hexahedral
                 mesh representation in regions where the embedding mesh
                 necessitates overlap or nonmanifold connectivity.
                 Finally, we detail how the front-end of our system can
                 run on lightweight clients, while the core simulation
                 capability can be hosted on a dedicated server and
                 delivered as a network service.",
  acknowledgement = ack-nhfb,
  articleno =    "43",
  fjournal =     "ACM Transactions on Graphics",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J778",
}

%%% ACM Transactions on Mathematical Software 41(3) article; record is
%%% complete with DOI, ISSN, article number, and quoted abstract.
@Article{Nelson:2015:RGH,
  author =       "Thomas Nelson and Geoffrey Belter and Jeremy G. Siek
                 and Elizabeth Jessup and Boyana Norris",
  title =        "Reliable Generation of High-Performance Matrix
                 Algebra",
  journal =      j-TOMS,
  volume =       "41",
  number =       "3",
  pages =        "18:1--18:27",
  month =        jun,
  year =         "2015",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/2629698",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Wed Jun 3 17:59:32 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toms.bib",
  abstract =     "Scientific programmers often turn to vendor-tuned
                 Basic Linear Algebra Subprograms (BLAS) to obtain
                 portable high performance. However, many numerical
                 algorithms require several BLAS calls in sequence, and
                 those successive calls do not achieve optimal
                 performance. The entire sequence needs to be optimized
                 in concert. Instead of vendor-tuned BLAS, a programmer
                 could start with source code in Fortran or C (e.g.,
                 based on the Netlib BLAS) and use a state-of-the-art
                 optimizing compiler. However, our experiments show that
                 optimizing compilers often attain only one-quarter of
                 the performance of hand-optimized code. In this
                 article, we present a domain-specific compiler for
                 matrix kernels, the Build to Order BLAS (BTO), that
                 reliably achieves high performance using a scalable
                 search algorithm for choosing the best combination of
                 loop fusion, array contraction, and multithreading for
                 data parallelism. The BTO compiler generates code that
                 is between 16\% slower and 39\% faster than
                 hand-optimized code.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

%%% NOTE(review): first author's Vietnamese surname Nguyen takes an
%%% e-circumflex-tilde, encoded here as the BibTeX special character
%%% {\~{\^e}}; the earlier caron {\v{e}} is not a Vietnamese diacritic.
@Article{Nguyen:2015:RCC,
  author =       "Ph{\'u}c C. Nguy{\~{\^e}}n and David {Van Horn}",
  title =        "Relatively complete counterexamples for higher-order
                 programs",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "6",
  pages =        "446--456",
  month =        jun,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2813885.2737971",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:41 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "In this paper, we study the problem of generating
                 inputs to a higher-order program causing it to error.
                 We first approach the problem in the setting of PCF, a
                 typed, core functional language and contribute the
                 first relatively complete method for constructing
                 counterexamples for PCF programs. The method is
                 relatively complete with respect to a first-order
                 solver over the base types of PCF. In practice, this
                 means an SMT solver can be used for the effective,
                 automated generation of higher-order counterexamples
                 for a large class of programs. We achieve this result
                 by employing a novel form of symbolic execution for
                 higher-order programs. The remarkable aspect of this
                 symbolic execution is that even though symbolic
                 higher-order inputs and values are considered, the path
                 condition remains a first-order formula. Our handling
                 of symbolic function application enables the
                 reconstruction of higher-order counterexamples from
                 this first-order formula. After establishing our main
                 theoretical results, we sketch how to apply the
                 approach to untyped, higher-order, stateful languages
                 with first-class contracts and show how counterexample
                 generation can be used to detect contract violations in
                 this setting. To validate our approach, we implement a
                 tool generating counterexamples for erroneous modules
                 written in Racket.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '15 conference proceedings.",
}

@Article{Pager:2015:SSM,
  author =       "Jared Pager and Reiley Jeyapaul and Aviral
                 Shrivastava",
  title =        "A Software Scheme for Multithreading on {CGRAs}",
  journal =      j-TECS,
  volume =       "14",
  number =       "1",
  pages =        "19:1--19:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2638558",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Jan 22 06:25:23 MST 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tecs.bib",
  abstract =     "Recent industry trends show a drastic rise in the use
                 of hand-held embedded devices, from everyday
                 applications to medical (e.g., monitoring devices) and
                 critical defense applications (e.g., sensor nodes). The
                 two key requirements in the design of such devices are
                 their processing capabilities and battery life. There
                 is therefore an urgency to build high-performance and
                 power-efficient embedded devices, inspiring researchers
                 to develop novel system designs for the same. The use
                 of a coprocessor (application-specific hardware) to
                 offload power-hungry computations is gaining favor
                 among system designers to suit their power budgets. We
                 propose the use of CGRAs (Coarse-Grained Reconfigurable
                 Arrays) as a power-efficient coprocessor. Though CGRAs
                 have been widely used for streaming applications, the
                 extensive compiler support required limits its
                 applicability and use as a general purpose coprocessor.
                 In addition, a CGRA structure can efficiently execute
                 only one statically scheduled kernel at a time, which
                 is a serious limitation when used as an accelerator to
                 a multithreaded or multitasking processor. In this
                 work, we envision a multithreaded CGRA where multiple
                 schedules (or kernels) can be executed simultaneously
                 on the CGRA (as a coprocessor). We propose a
                 comprehensive software scheme that transforms the
                 traditionally single-threaded CGRA into a multithreaded
                 coprocessor to be used as a power-efficient accelerator
                 for multithreaded embedded processors. Our software
                 scheme includes (1) a compiler framework that
                 integrates with existing CGRA mapping techniques to
                 prepare kernels for execution on the multithreaded CGRA
                 and (2) a runtime mechanism that dynamically schedules
                 multiple kernels (offloaded from the processor) to
                 execute simultaneously on the CGRA coprocessor. Our
                 multithreaded CGRA coprocessor implementation thus
                 makes it possible to achieve improved power-efficient
                 computing in modern multithreaded embedded systems.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
}

@Article{Perez:2015:ECR,
  author =       "J. F. P{\'e}rez and G. Casale and S. Pacheco-Sanchez",
  title =        "Estimating Computational Requirements in
                 Multi-Threaded Applications",
  journal =      j-IEEE-TRANS-SOFTW-ENG,
  volume =       "41",
  number =       "3",
  pages =        "264--278",
  month =        mar,
  year =         "2015",
  CODEN =        "IESEDJ",
  DOI =          "https://doi.org/10.1109/TSE.2014.2363472",
  ISSN =         "0098-5589 (print), 1939-3520 (electronic)",
  ISSN-L =       "0098-5589",
  bibdate =      "Thu Feb 1 19:49:24 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6926798",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Software Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32",
}

@Article{Porter:2015:MMS,
  author =       "Leo Porter and Michael A. Laurenzano and Ananta Tiwari
                 and Adam Jundt and William A. {Ward, Jr.} and Roy
                 Campbell and Laura Carrington",
  title =        "Making the Most of {SMT} in {HPC}: System- and
                 Application-Level Perspectives",
  journal =      j-TACO,
  volume =       "11",
  number =       "4",
  pages =        "59:1--59:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2687651",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jan 12 11:38:56 MST 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This work presents an end-to-end methodology for
                 quantifying the performance and power benefits of
                 simultaneous multithreading (SMT) for HPC centers and
                 applies this methodology to a production system and
                 workload. Ultimately, SMT's value system-wide depends
                 on whether users effectively employ SMT at the
                 application level. However, predicting SMT's benefit
                 for HPC applications is challenging; by doubling the
                 number of threads, the application's characteristics
                 may change. This work proposes statistical modeling
                 techniques to predict the speedup SMT confers to HPC
                 applications. This approach, accurate to within 8\%,
                 uses only lightweight, transparent performance monitors
                 collected during a single run of the application.",
  acknowledgement = ack-nhfb,
  articleno =    "59",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Porter:2015:PFG,
  author =       "Donald E. Porter and Michael D. Bond and Indrajit Roy
                 and Kathryn S. McKinley and Emmett Witchel",
  title =        "Practical Fine-Grained Information Flow Control Using
                 {Laminar}",
  journal =      j-TOPLAS,
  volume =       "37",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2015",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/2638548",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Wed Jan 21 07:13:17 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toplas.bib",
  abstract =     "Decentralized Information Flow Control (DIFC) is a
                 promising model for writing programs with powerful,
                 end-to-end security guarantees. Current DIFC systems
                 that run on commodity hardware can be broadly
                 categorized into two types: language-level and
                 operating system-level DIFC. Language solutions provide
                 no guarantees against security violations on system
                 resources such as files and sockets. Operating system
                 solutions mediate accesses to system resources but are
                 either inefficient or imprecise at monitoring the flow
                 of information through fine-grained program data
                 structures. This article describes Laminar, the first
                 system to implement DIFC using a unified set of
                 abstractions for OS resources and heap-allocated
                 objects. Programmers express security policies by
                 labeling data with secrecy and integrity labels and
                 access the labeled data in security methods. Laminar
                 enforces the security policies specified by the labels
                 at runtime. Laminar is implemented using a modified
                 Java virtual machine and a new Linux security module.
                 This article shows that security methods ease
                 incremental deployment and limit dynamic security
                 checks by retrofitting DIFC policies on four
                 application case studies. Replacing the applications'
                 ad hoc security policies changes less than 10\% of the
                 code and incurs performance overheads from 5\% to 56\%.
                 Compared to prior DIFC systems, Laminar supports a more
                 general class of multithreaded DIFC programs
                 efficiently and integrates language and OS
                 abstractions.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

@Article{Rodrigues:2015:DSE,
  author =       "Rance Rodrigues and Israel Koren and Sandip Kundu",
  title =        "Does the Sharing of Execution Units Improve
                 Performance\slash Power of Multicores?",
  journal =      j-TECS,
  volume =       "14",
  number =       "1",
  pages =        "17:1--17:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2680543",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Jan 22 06:25:23 MST 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tecs.bib",
  abstract =     "Several studies and recent real-world designs have
                 promoted sharing of underutilized resources between
                 cores in a multicore processor to achieve better
                 performance/power. It has been argued that when
                 utilization of such resources is low, sharing has a
                 negligible impact on performance while offering
                 considerable area and power benefits. In this article,
                 we investigate the performance and performance/watt
                 implications of sharing large and underutilized
                 resources between pairs of cores in a multicore. We
                 first study sharing of the entire floating-point
                 datapath (including reservation stations and execution
                 units) by two cores, similar to AMD's Bulldozer. We
                 find that while this architecture results in power
                 savings for certain workload combinations, it also
                 results in significant performance loss of up to 28\%.
                 Next, we study an alternative sharing architecture
                 where only the floating-point execution units are
                 shared, while the individual cores retain their
                 reservation stations. This reduces the highest
                 performance loss to 14\%. We then extend the study to
                 include sharing of other large execution units that are
                 used infrequently, namely, the integer multiply and
                 divide units. Subsequently, we analyze the impact of
                 sharing hardware resources in Simultaneously
                 Multithreaded (SMT) processors where multiple threads
                 run concurrently on the same core. We observe that
                 sharing improves performance/watt at a negligible
                 performance cost only if the shared units have high
                 throughput. Sharing low-throughput units reduces both
                 performance and performance/watt. To increase the
                 throughput of the shared units, we propose the use of
                 Dynamic Voltage and Frequency Boosting (DVFB) of only
                 the shared units that can be placed on a separate
                 voltage island. Our results indicate that the use of
                 DVFB improves both performance and performance/watt by
                 as much as 22\% and 10\%, respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
}

@Article{Saillard:2015:SDV,
  author =       "Emmanuelle Saillard and Patrick Carribault and Denis
                 Barthou",
  title =        "Static\slash dynamic validation of {MPI} collective
                 communications in multi-threaded context",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "8",
  pages =        "279--280",
  month =        aug,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2858788.2688548",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:42 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Scientific applications mainly rely on the MPI
                 parallel programming model to reach high performance on
                 supercomputers. The advent of manycore architectures
                 (larger number of cores and lower amount of memory per
                 core) leads to mix MPI with a thread-based model like
                 OpenMP. But integrating two different programming
                 models inside the same application can be tricky and
                 generate complex bugs. Thus, the correctness of hybrid
                 programs requires a special care regarding MPI calls
                 location. For example, identical MPI collective
                 operations cannot be performed by multiple
                 non-synchronized threads. To tackle this issue, this
                 paper proposes a static analysis and a reduced dynamic
                 instrumentation to detect bugs related to misuse of MPI
                 collective operations inside or outside threaded
                 regions. This work extends PARCOACH designed for
                 MPI-only applications and keeps the compatibility with
                 these algorithms. We validated our method on multiple
                 hybrid benchmarks and applications with a low
                 overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '15 conference proceedings.",
}

@Article{Samak:2015:SRT,
  author =       "Malavika Samak and Murali Krishna Ramanathan and
                 Suresh Jagannathan",
  title =        "Synthesizing racy tests",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "6",
  pages =        "175--185",
  month =        jun,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2813885.2737998",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:41 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Subtle concurrency errors in multithreaded libraries
                 that arise because of incorrect or inadequate
                 synchronization are often difficult to pinpoint
                 precisely using only static techniques. On the other
                 hand, the effectiveness of dynamic race detectors is
                 critically dependent on multithreaded test suites whose
                 execution can be used to identify and trigger races.
                 Usually, such multithreaded tests need to invoke a
                 specific combination of methods with objects involved
                 in the invocations being shared appropriately to expose
                 a race. Without a priori knowledge of the race,
                 construction of such tests can be challenging. In this
                 paper, we present a lightweight and scalable technique
                 for synthesizing precisely these kinds of tests. Given
                 a multithreaded library and a sequential test suite, we
                 describe a fully automated analysis that examines
                 sequential execution traces, and produces as its output
                 a concurrent client program that drives shared objects
                 via library method calls to states conducive for
                 triggering a race. Experimental results on a variety of
                 well-tested Java libraries yield 101 synthesized
                 multithreaded tests in less than four minutes.
                 Analyzing the execution of these tests using an
                 off-the-shelf race detector reveals 187 harmful races,
                 including several previously unreported ones. Our
                 implementation, named NARADA, and the results of our
                 experiments are available at
                 http://www.csa.iisc.ernet.in/~sss/tools/narada.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '15 conference proceedings.",
}

@Article{Schweitzer:2015:PEM,
  author =       "P. Schweitzer and S. Cipi{\`e}re and A. Dufaure and H.
                 Payno and Y. Perrot and D. R. C. Hill and L. Maigne",
  title =        "Performance Evaluation of Multithreaded {Geant4}
                 Simulations Using an {Intel Xeon Phi} Cluster",
  journal =      j-SCI-PROG,
  volume =       "2015",
  number =       "??",
  pages =        "980752:1--980752:10",
  month =        "????",
  year =         "2015",
  CODEN =        "SCIPEV",
  DOI =          "https://doi.org/10.1155/2015/980752",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Tue Sep 20 07:53:44 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sciprogram.bib",
  URL =          "https://www.hindawi.com/journals/sp/2015/980752/",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
  journal-URL =  "https://www.hindawi.com/journals/sp/",
  journalabr =   "Sci. Prog.",
}

@Article{Shi:2015:CLM,
  author =       "Qingchuan Shi and Henry Hoffmann and Omer Khan",
  title =        "A Cross-Layer Multicore Architecture to Tradeoff
                 Program Accuracy and Resilience Overheads",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "85--89",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2365204",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "To protect multicores from soft-error perturbations,
                 resiliency schemes have been developed with high
                 coverage but high power/performance overheads (similar
                 to 2x). We observe that not all soft-errors affect
                 program correctness, some soft-errors only affect
                 program accuracy, i.e., the program completes with
                 certain acceptable deviations from soft-error free
                 outcome. Thus, it is practical to improve processor
                 efficiency by trading off resilience overheads with
                 program accuracy. We propose the idea of declarative
                 resilience that selectively applies resilience schemes
                 to both crucial and non-crucial code, while ensuring
                 program correctness. At the application level, crucial
                 and non-crucial code is identified based on its impact
                 on the program outcome. The hardware collaborates with
                 software support to enable efficient resilience with
                 100 percent soft-error coverage. Only program accuracy
                 is compromised in the worst-case scenario of a
                 soft-error strike during non-crucial code execution.
                 For a set of multithreaded benchmarks, declarative
                 resilience improves completion time by an average of 21
                 percent over state-of-the-art hardware resilience
                 scheme that protects all executed code. Its performance
                 overhead is similar to 1.38x over a multicore that does
                 not support resilience.",
  acknowledgement = ack-nhfb,
  affiliation =  "Shi, QC (Reprint Author), Univ Connecticut, Dept Elect
                 \& Comp Engn, Storrs, CT 06269 USA. Shi, Qingchuan;
                 Khan, Omer, Univ Connecticut, Dept Elect \& Comp Engn,
                 Storrs, CT 06269 USA. Hoffmann, Henry, Univ Chicago,
                 Dept Comp Sci, Chicago, IL 60637 USA.",
  author-email = "qingchuan.shi@uconn.edu hankhoffmann@cs.uchicago.edu
                 khan@uconn.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "multicores; program accuracy; Resilience;
                 soft-errors",
  number-of-cited-references = "23",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Shi:2015:CLM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Won:2015:MMC,
  author =       "Youjip Won and Kyeongyeol Lim and Jaehong Min",
  title =        "{MUCH}: Multithreaded Content-Based File Chunking",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "64",
  number =       "5",
  pages =        "1375--1388",
  month =        "????",
  year =         "2015",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2014.2322600",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Thu Jun 4 19:46:44 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Zhang:2015:DMB,
  author =       "Junchao Zhang and Babak Behzad and Marc Snir",
  title =        "Design of a Multithreaded {Barnes--Hut} Algorithm for
                 Multicore Clusters",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "26",
  number =       "7",
  pages =        "1861--1873",
  month =        jul,
  year =         "2015",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2014.2331243",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Mon Aug 3 11:58:51 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/bibnet/subjects/fastmultipole.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.computer.org/csdl/trans/td/2015/07/06837521-abs.html",
  abstract-URL = "http://www.computer.org/csdl/trans/td/2015/07/06837521-abs.html",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://www.computer.org/tpds/archives.htm",
}

@Article{Zhang:2015:DPO,
  author =       "Naling Zhang and Markus Kusano and Chao Wang",
  title =        "Dynamic partial order reduction for relaxed memory
                 models",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "6",
  pages =        "250--259",
  month =        jun,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2813885.2737956",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:41 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Under a relaxed memory model such as TSO or PSO, a
                 concurrent program running on a shared-memory
                 multiprocessor may observe two types of nondeterminism:
                 the nondeterminism in thread scheduling and the
                 nondeterminism in store buffering. Although there is a
                 large body of work on mitigating the scheduling
                 nondeterminism during runtime verification, methods for
                 soundly mitigating the store buffering nondeterminism
                 are lacking. We propose a new dynamic partial order
                 reduction (POR) algorithm for verifying concurrent
                 programs under TSO and PSO. Our method relies on
                 modeling both types of nondeterminism in a unified
                 framework, which allows us to extend existing POR
                 techniques to TSO and PSO without overhauling the
                 verification algorithm. In addition to sound POR, we
                 also propose a buffer-bounding method for more
                 aggressively reducing the state space. We have
                 implemented our new methods in a stateless model
                 checking tool and demonstrated their effectiveness on a
                 set of multithreaded C benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '15 conference proceedings.",
}

@Article{Zhang:2015:LOS,
  author =       "Minjia Zhang and Jipeng Huang and Man Cao and Michael
                 D. Bond",
  title =        "Low-overhead software transactional memory with
                 progress guarantees and strong semantics",
  journal =      j-SIGPLAN,
  volume =       "50",
  number =       "8",
  pages =        "97--108",
  month =        aug,
  year =         "2015",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/2858788.2688510",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Feb 16 12:01:42 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Software transactional memory offers an appealing
                 alternative to locks by improving programmability,
                 reliability, and scalability. However, existing STMs
                 are impractical because they add high instrumentation
                 costs and often provide weak progress guarantees and/or
                 semantics. This paper introduces a novel STM called
                 LarkTM that provides three significant features. (1)
                 Its instrumentation adds low overhead except when
                 accesses actually conflict, enabling low single-thread
                 overhead and scaling well on low-contention workloads.
                 (2) It uses eager concurrency control mechanisms, yet
                 naturally supports flexible conflict resolution,
                 enabling strong progress guarantees. (3) It naturally
                 provides strong atomicity semantics at low cost.
                 LarkTM's design works well for low-contention
                 workloads, but adds significant overhead under higher
                 contention, so we design an adaptive version of LarkTM
                 that uses alternative concurrency control for
                 high-contention objects. An implementation and
                 evaluation in a Java virtual machine show that the
                 basic and adaptive versions of LarkTM not only provide
                 low single-thread overhead, but their multithreaded
                 performance compares favorably with existing
                 high-performance STMs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '15 conference proceedings.",
}

@Article{Zheng:2015:ACC,
  author =       "Zhong Zheng and Zhiying Wang and Mikko Lipasti",
  title =        "Adaptive Cache and Concurrency Allocation on
                 {GPGPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "90--93",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2359882",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Memory bandwidth is critical to GPGPU performance.
                 Exploiting locality in caches can better utilize memory
                 bandwidth. However, memory requests issued by excessive
                 threads cause cache thrashing and saturate memory
                 bandwidth, degrading performance. In this paper, we
                 propose adaptive cache and concurrency allocation (CCA)
                 to prevent cache thrashing and improve the utilization
                 of bandwidth and computational resources, hence
                 improving performance. According to locality and reuse
                 distance of access patterns in GPGPU program, warps on
                 a stream multiprocessor are dynamically divided into
                 three groups: cached, bypassed, and waiting. The data
                 cache accommodates the footprint of cached warps.
                 Bypassed warps cannot allocate cache lines in the data
                 cache to prevent cache thrashing, but are able to take
                 advantage of available memory bandwidth and
                 computational resource. Waiting warps are de-scheduled.
                 Experimental results show that adaptive CCA can
                 significantly improve benchmark performance, with 80
                 percent harmonic mean IPC improvement over the
                 baseline.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zheng, Z (Reprint Author), Natl Univ Def Technol,
                 State Key Lab High Performance Comp, Changsha, Hunan,
                 Peoples R China. Zheng, Zhong; Wang, Zhiying, Natl Univ
                 Def Technol, State Key Lab High Performance Comp,
                 Changsha, Hunan, Peoples R China. Zheng, Zhong; Wang,
                 Zhiying, Natl Univ Def Technol, Sch Comp, Changsha,
                 Hunan, Peoples R China. Lipasti, Mikko, Univ Wisconsin,
                 Dept Elect \& Comp Engn, Madison, WI 54706 USA.",
  author-email = "zheng\_zhong@nudt.edu.cn zywang@nudt.edu.cn
                 mikko@engr.wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "CSC; 863 Program [2012AA010905]; NSFC
                 [61070037, 61272143, 61272144, 61103016, 61202121];
                 NUDT [B120607]; RFDP [20114307120013]; NSF
                 [CCF-1318298]",
  funding-text = "This work was partially supported by CSC, 863 Program
                 (2012AA010905), NSFC (61070037, 61272143, 61272144,
                 61103016, 61202121), NUDT(B120607), RFDP
                 (20114307120013), and NSF (CCF-1318298).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "access patterns; adaptive cache-and-concurrency
                 allocation; Bandwidth; bandwidth utilization
                 improvement; benchmark performance improvement;
                 Benchmark testing; bypassed warps; cache; cache lines;
                 cache locality; Cache memory; cache storage; cache
                 thrashing prevention; cached warps; CCA; computational
                 resource utilization improvement; concurrency;
                 concurrency control; Concurrent computing; GPGPU; GPGPU
                 performance improvement; graphics processing units;
                 harmonic mean IPC improvement; Instruction sets; memory
                 bandwidth saturation; multi-threading; multiprocessing
                 systems; performance evaluation; Resource management;
                 reuse distance; stream multiprocessor; waiting warp
                 descheduling",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Zheng:2015:ACC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Altiparmak:2016:MMF,
  author =       "N. Altiparmak and A. S. Tosun",
  title =        "Multithreaded Maximum Flow Based Optimal Replica
                 Selection Algorithm for Heterogeneous Storage
                 Architectures",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "65",
  number =       "5",
  pages =        "1543--1557",
  month =        may,
  year =         "2016",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2015.2451620",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Fri Apr 15 13:39:43 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Arjomand:2016:BAP,
  author =       "Mohammad Arjomand and Mahmut T. Kandemir and Anand
                 Sivasubramaniam and Chita R. Das",
  title =        "Boosting access parallelism to {PCM}-based main
                 memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "695--706",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001211",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Despite its promise as a DRAM main memory replacement,
                 Phase Change Memory (PCM) has high write latencies
                 which can be a serious detriment to its widespread
                 adoption. Apart from slowing down a write request, the
                 consequent high latency can also keep other chips of
                 the same rank, that are not involved in this write,
                 idle for long times. There are several practical
                 considerations that make it difficult to allow
                 subsequent reads and/or writes to be served
                 concurrently from the same chips during the long
                 latency write. This paper proposes and evaluates
                 several novel mechanisms --- re-constructing data from
                 error correction bits instead of waiting for chips
                 currently busy to serve a read, rotating word mappings
                 across chips of a PCM rank, and rotating the mapping of
                 error detection/correction bits across these chips ---
                 to overlap several reads with an ongoing write (RoW)
                 and even a write with an ongoing write (WoW). The paper
                 also presents the necessary micro-architectural
                 enhancements needed to implement these mechanisms,
                 without significantly changing the current interfaces.
                 The resulting PCM access parallelism (PCMap) system
                 incorporating these enhancements, boosts the
                 intra-rank-level parallelism during such writes from a
                 very low baseline value of 2.4 to an average and
                 maximum values of 4.5 and 7.4, respectively (out of a
                 maximum of 8.0), across a wide spectrum of both
                 multiprogrammed and multithreaded workloads. This boost
                 in parallelism results in an average IPC improvement of
                 15.6\% and 16.7\% for the multiprogrammed and
                 multithreaded workloads, respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Badamo:2016:IPE,
  author =       "Michael Badamo and Jeff Casarona and Minshu Zhao and
                 Donald Yeung",
  title =        "Identifying Power-Efficient Multicore Cache
                 Hierarchies via Reuse Distance Analysis",
  journal =      j-TOCS,
  volume =       "34",
  number =       "1",
  pages =        "3:1--3:??",
  month =        apr,
  year =         "2016",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2851503",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat May 21 08:09:53 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "To enable performance improvements in a
                 power-efficient manner, computer architects have been
                 building CPUs that exploit greater amounts of
                 thread-level parallelism. A key consideration in such
                 CPUs is properly designing the on-chip cache hierarchy.
                 Unfortunately, this can be hard to do, especially for
                 CPUs with high core counts and large amounts of cache.
                 The enormous design space formed by the combinatorial
                 number of ways in which to organize the cache hierarchy
                 makes it difficult to identify power-efficient
                 configurations. Moreover, the problem is exacerbated by
                 the slow speed of architectural simulation, which is
                 the primary means for conducting such design space
                 studies. A powerful tool that can help architects
                 optimize CPU cache hierarchies is reuse distance (RD)
                 analysis. Recent work has extended uniprocessor RD
                 techniques --- i.e., by introducing concurrent RD and
                 private-stack RD profiling --- to enable analysis of
                 different types of caches in multicore CPUs. Once
                 acquired, parallel locality profiles can predict the
                 performance of numerous cache configurations,
                 permitting highly efficient design space exploration.
                 To date, existing work on multicore RD analysis has
                 focused on developing the profiling techniques and
                 assessing their accuracy. Unfortunately, there has been
                 no work on using RD analysis to optimize CPU
                 performance or power consumption. This article
                 investigates applying multicore RD analysis to identify
                 the most power efficient cache configurations for a
                 multicore CPU. First, we develop analytical models that
                 use the cache-miss counts from parallel locality
                 profiles to estimate CPU performance and power
                 consumption. Although future scalable CPUs will likely
                 employ multithreaded (and even out-of-order) cores, our
                 current study assumes single-threaded in-order cores to
                 simplify the models, allowing us to focus on the cache
                 hierarchy and our RD-based techniques. Second, to
                 demonstrate the utility of our techniques, we apply our
                 models to optimize a large-scale tiled CPU architecture
                 with a two-level cache hierarchy. We show that the most
                 power efficient configuration varies considerably
                 across different benchmarks, and that our locality
                 profiles provide deep insights into why certain
                 configurations are power efficient. We also show that
                 picking the best configuration can provide significant
                 gains, as there is a 2.01x power efficiency spread
                 across our tiled CPU design space. Finally, we validate
                 the accuracy of our techniques using detailed
                 simulation. Among several simulated configurations, our
                 techniques can usually pick the most power efficient
                 configuration, or one that is very close to the best.
                 In addition, across all simulated configurations, we
                 can predict power efficiency with 15.2\% error.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Balkind:2016:OOS,
  author =       "Jonathan Balkind and Michael McKeown and Yaosheng Fu
                 and Tri Nguyen and Yanqi Zhou and Alexey Lavrov and
                 Mohammad Shahrad and Adi Fuchs and Samuel Payne and
                 Xiaohua Liang and Matthew Matl and David Wentzlaff",
  title =        "{OpenPiton}: an Open Source Manycore Research
                 Framework",
  journal =      j-OPER-SYS-REV,
  volume =       "50",
  number =       "2",
  pages =        "217--232",
  month =        jun,
  year =         "2016",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/2954680.2872414",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Thu Jun 9 17:03:34 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/gnu.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/opersysrev.bib",
  abstract =     "Industry is building larger, more complex, manycore
                 processors on the back of strong institutional
                 knowledge, but academic projects face difficulties in
                 replicating that scale. To alleviate these difficulties
                 and to develop and share knowledge, the community needs
                 open architecture frameworks for simulation, synthesis,
                 and software exploration which support extensibility,
                 scalability, and configurability, alongside an
                 established base of verification tools and supported
                 software. In this paper we present OpenPiton, an open
                 source framework for building scalable architecture
                 research prototypes from 1 core to 500 million cores.
                 OpenPiton is the world's first open source,
                 general-purpose, multithreaded manycore processor and
                 framework. OpenPiton leverages the industry hardened
                 OpenSPARC T1 core with modifications and builds upon it
                 with a scratch-built, scalable uncore creating a
                 flexible, modern manycore design. In addition,
                 OpenPiton provides synthesis and backend scripts for
                 ASIC and FPGA to enable other researchers to bring
                 their designs to implementation. OpenPiton provides a
                 complete verification infrastructure of over 8000
                 tests, is supported by mature software tools, runs
                 full-stack multiuser Debian Linux, and is written in
                 industry standard Verilog. Multiple implementations of
                 OpenPiton have been created including a taped-out
                 25-core implementation in IBM's 32nm process and
                 multiple Xilinx FPGA prototypes.",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J597",
}

@Article{Cao:2016:DBG,
  author =       "Man Cao and Minjia Zhang and Aritra Sengupta and
                 Michael D. Bond",
  title =        "Drinking from both glasses: combining pessimistic and
                 optimistic tracking of cross-thread dependences",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "8",
  pages =        "20:1--20:??",
  month =        aug,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3016078.2851143",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:12 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "It is notoriously challenging to develop parallel
                 software systems that are both scalable and correct.
                 Runtime support for parallelism---such as multithreaded
                 record {\&} replay, data race detectors, transactional
                 memory, and enforcement of stronger memory
                 models---helps achieve these goals, but existing
                 commodity solutions slow programs substantially in
                 order to track (i.e., detect or control) an execution's
                 cross-thread dependences accurately. Prior work tracks
                 cross-thread dependences either ``pessimistically,''
                 slowing every program access, or ``optimistically,''
                 allowing for lightweight instrumentation of most
                 accesses but dramatically slowing accesses involved in
                 cross-thread dependences. This paper seeks to hybridize
                 pessimistic and optimistic tracking, which is
                 challenging because there exists a fundamental mismatch
                 between pessimistic and optimistic tracking. We address
                 this challenge based on insights about how dependence
                 tracking and program synchronization interact, and
                 introduce a novel approach called hybrid tracking.
                 Hybrid tracking is suitable for building efficient
                 runtime support, which we demonstrate by building
                 hybrid-tracking-based versions of a dependence recorder
                 and a region serializability enforcer. An adaptive,
                 profile-based policy makes runtime decisions about
                 switching between pessimistic and optimistic tracking.
                 Our evaluation shows that hybrid tracking enables
                 runtime support to overcome the performance limitations
                 of both pessimistic and optimistic tracking alone.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '16 conference proceedings.",
}

@Article{Chen:2016:TMR,
  author =       "Kuan-Hsun Chen and Jian-Jia Chen and Florian Kriebel
                 and Semeen Rehman and Muhammad Shafique and J{\"o}rg
                 Henkel",
  title =        "Task Mapping for Redundant Multithreading in
                 Multi-Cores with Reliability and Performance
                 Heterogeneity",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "65",
  number =       "11",
  pages =        "3441--3455",
  month =        nov,
  year =         "2016",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2016.2532862",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Tue Oct 11 05:14:24 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Creech:2016:TSS,
  author =       "Timothy Creech and Rajeev Barua",
  title =        "Transparently Space Sharing a Multicore Among Multiple
                 Processes",
  journal =      j-TOPC,
  volume =       "3",
  number =       "3",
  pages =        "17:1--17:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3001910",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Dec 26 17:40:41 MST 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "As hardware becomes increasingly parallel and the
                 availability of scalable parallel software improves,
                 the problem of managing multiple multithreaded
                 applications (processes) becomes important. Malleable
                 processes, which can vary the number of threads used as
                 they run, enable sophisticated and flexible resource
                 management. Although many existing applications
                 parallelized for SMPs with parallel runtimes are in
                 fact already malleable, deployed runtime environments
                 provide no interface nor any strategy for intelligently
                 allocating hardware threads or even preventing
                 oversubscription. Prior research methods either depend
                 on profiling applications ahead of time to make good
                 decisions about allocations or do not account for
                 process efficiency at all, leading to poor performance.
                 None of these prior methods have been adapted widely in
                 practice. This article presents the Scheduling and
                 Allocation with Feedback (SCAF) system: a drop-in
                 runtime solution that supports existing malleable
                 applications in making intelligent allocation decisions
                 based on observed efficiency without any changes to
                 semantics, program modification, offline profiling, or
                 even recompilation. Our existing implementation can
                 control most unmodified OpenMP applications. Other
                 malleable threading libraries can also easily be
                 supported with small modifications without requiring
                 application modification or recompilation. In this
                 work, we present the SCAF daemon and a SCAF-aware port
                 of the GNU OpenMP runtime. We present a new technique
                 for estimating process efficiency purely at runtime
                 using available hardware counters and demonstrate its
                 effectiveness in aiding allocation decisions. We
                 evaluated SCAF using NAS NPB parallel benchmarks on
                 five commodity parallel platforms, enumerating
                 architectural features and their effects on our scheme.
                 We measured the benefit of SCAF in terms of sum of
                 speedups improvement (a common metric for
                 multiprogrammed environments) when running all
                 benchmark pairs concurrently compared to
                 equipartitioning-the best existing competing scheme in
                 the literature. We found that SCAF improves on
                 equipartitioning on four out of five machines, showing
                 a mean improvement factor in sum of speedups of 1.04 to
                 1.11x for benchmark pairs, depending on the machine,
                 and 1.09x on average. Since we are not aware of any
                 widely available tool for equipartitioning, we also
                 compare SCAF against multiprogramming using unmodified
                 OpenMP, which is the only environment available to end
                 users today. SCAF improves on the unmodified OpenMP
                 runtimes for all five machines, with a mean improvement
                 of 1.08 to 2.07x, depending on the machine, and 1.59x
                 on average.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Daloze:2016:ETS,
  author =       "Benoit Daloze and Stefan Marr and Daniele Bonetta and
                 Hanspeter M{\"o}ssenb{\"o}ck",
  title =        "Efficient and thread-safe objects for
                 dynamically-typed languages",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "10",
  pages =        "642--659",
  month =        oct,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3022671.2984001",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:13 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "We are in the multi-core era. Dynamically-typed
                 languages are in widespread use, but their support for
                 multithreading still lags behind. One of the reasons is
                 that the sophisticated techniques they use to
                 efficiently represent their dynamic object models are
                 often unsafe in multithreaded environments. This paper
                 defines safety requirements for dynamic object models
                 in multithreaded environments. Based on these
                 requirements, a language-agnostic and thread-safe
                 object model is designed that maintains the efficiency
                 of sequential approaches. This is achieved by ensuring
                 that field reads do not require synchronization and
                 field updates only need to synchronize on objects
                 shared between threads. Basing our work on
                 JRuby+Truffle, we show that our safe object model has
                 zero overhead on peak performance for thread-local
                 objects and only 3\% average overhead on parallel
                 benchmarks where field updates require synchronization.
                 Thus, it can be a foundation for safe and efficient
                 multithreaded VMs for a wide range of dynamic
                 languages.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "OOPSLA '16 conference proceedings.",
}

@Article{Deniz:2016:UML,
  author =       "Etem Deniz and Alper Sen",
  title =        "Using Machine Learning Techniques to Detect Parallel
                 Patterns of Multi-threaded Applications",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "44",
  number =       "4",
  pages =        "867--900",
  month =        aug,
  year =         "2016",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-015-0396-z",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Tue Sep 20 10:50:00 MDT 2016",
  bibsource =    "http://link.springer.com/journal/10766/44/4;
                 https://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer.com/article/10.1007/s10766-015-0396-z",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Denniston:2016:DH,
  author =       "Tyler Denniston and Shoaib Kamil and Saman
                 Amarasinghe",
  title =        "Distributed {Halide}",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "8",
  pages =        "5:1--5:??",
  month =        aug,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3016078.2851157",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:12 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Many image processing tasks are naturally expressed as
                 a pipeline of small computational kernels known as
                 stencils. Halide is a popular domain-specific language
                 and compiler designed to implement image processing
                 algorithms. Halide uses simple language constructs to
                 express what to compute and a separate scheduling
                 co-language for expressing when and where to perform
                 the computation. This approach has demonstrated
                 performance comparable to or better than hand-optimized
                 code. Until now, however, Halide has been restricted to
                 parallel shared memory execution, limiting its
                 performance for memory-bandwidth-bound pipelines or
                 large-scale image processing tasks. We present an
                 extension to Halide to support distributed-memory
                 parallel execution of complex stencil pipelines. These
                 extensions compose with the existing scheduling
                 constructs in Halide, allowing expression of complex
                 computation and communication strategies. Existing
                 Halide applications can be distributed with minimal
                 changes, allowing programmers to explore the tradeoff
                 between recomputation and communication with little
                 effort. Approximately 10 new lines of code are needed
                 even for a 200 line, 99 stage application. On nine
                 image processing benchmarks, our extensions give up to
                 a 1.4$ \times $ speedup on a single node over regular
                 multithreaded execution with the same number of cores,
                 by mitigating the effects of non-uniform memory access.
                 The distributed benchmarks achieve up to 18$ \times $
                 speedup on a 16 node testing machine and up to 57$
                 \times $ speedup on 64 nodes of the NERSC Cori
                 supercomputer.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '16 conference proceedings.",
}

@Article{Diavastos:2016:ITD,
  author =       "Andreas Diavastos and Pedro Trancoso and Mikel
                 Luj{\'a}n and Ian Watson",
  title =        "Integrating Transactions into the Data-Driven
                 Multi-threading Model Using the {TFlux} Platform",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "44",
  number =       "2",
  pages =        "257--277",
  month =        apr,
  year =         "2016",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-015-0369-2",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Thu Apr 7 12:08:24 MDT 2016",
  bibsource =    "http://link.springer.com/journal/10766/44/2;
                 https://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer.com/article/10.1007/s10766-015-0369-2",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Dublish:2016:CCG,
  author =       "Saumay Dublish and Vijay Nagarajan and Nigel Topham",
  title =        "Cooperative Caching for {GPUs}",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "39:1--39:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3001589",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The rise of general-purpose computing on GPUs has
                 influenced architectural innovation on them. The
                 introduction of an on-chip cache hierarchy is one such
                 innovation. High L1 miss rates on GPUs, however,
                 indicate inefficient cache usage due to myriad factors,
                 such as cache thrashing and extensive multithreading.
                 Such high L1 miss rates in turn place high demands on
                 the shared L2 bandwidth. Extensive congestion in the L2
                 access path therefore results in high memory access
                 latencies. In memory-intensive applications, these
                 latencies get exposed due to a lack of active compute
                 threads to mask such high latencies. In this article,
                 we aim to reduce the pressure on the shared L2
                 bandwidth, thereby reducing the memory access latencies
                 that lie in the critical path. We identify significant
                 replication of data among private L1 caches, presenting
                 an opportunity to reuse data among L1s. We further show
                 how this reuse can be exploited via an L1 Cooperative
                 Caching Network (CCN), thereby reducing the bandwidth
                 demand on L2. In the proposed architecture, we connect
                 the L1 caches with a lightweight ring network to
                 facilitate intercore communication of shared data. We
                 show that this technique reduces traffic to the L2
                 cache by an average of 29\%, freeing up the bandwidth
                 for other accesses. We also show that the CCN reduces
                 the average memory latency by 24\%, thereby reducing
                 core stall cycles by 26\% on average. This translates
                 into an overall performance improvement of 14.7\% on
                 average (and up to 49\%) for applications that exhibit
                 reuse across L1 caches. In doing so, the CCN incurs a
                 nominal area and energy overhead of 1.3\% and 2.5\%,
                 respectively. Notably, the performance improvement with
                 our proposed CCN compares favorably to the performance
                 improvement achieved by simply doubling the number of
                 L2 banks by up to 34\%.",
  acknowledgement = ack-nhfb,
  articleno =    "39",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Evtyushkin:2016:UMC,
  author =       "Dmitry Evtyushkin and Dmitry Ponomarev and Nael
                 Abu-Ghazaleh",
  title =        "Understanding and Mitigating Covert Channels Through
                 Branch Predictors",
  journal =      j-TACO,
  volume =       "13",
  number =       "1",
  pages =        "10:1--10:??",
  month =        apr,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2870636",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Apr 5 16:27:36 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Covert channels through shared processor resources
                 provide secret communication between two malicious
                 processes: the trojan and the spy. In this article, we
                 classify, analyze, and compare covert channels through
                 dynamic branch prediction units in modern processors.
                 Through experiments on a real hardware platform, we
                 compare contention-based channel and the channel that
                 is based on exploiting the branch predictor's residual
                 state. We analyze these channels in SMT and
                 single-threaded environments under both clean and noisy
                 conditions. Our results show that the residual
                 state-based channel provides a cleaner signal and is
                 effective even in noisy execution environments with
                 another application sharing the same physical core with
                 the trojan and the spy. We also estimate the capacity
                 of the branch predictor covert channels and describe a
                 software-only mitigation technique that is based on
                 randomizing the state of the predictor tables on
                 context switches. We show that this protection
                 eliminates all covert channels through the branch
                 prediction unit with minimal impact on performance.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Feliu:2016:BAL,
  author =       "J. Feliu and J. Sahuquillo and S. Petit and J. Duato",
  title =        "Bandwidth-Aware On-Line Scheduling in {SMT}
                 Multicores",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "65",
  number =       "2",
  pages =        "422--434",
  month =        "????",
  year =         "2016",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2015.2428694",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Tue Jan 19 07:06:51 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Harish:2016:PIK,
  author =       "Pawan Harish and Mentar Mahmudi and Beno{\^\i}t {Le
                 Callennec} and Ronan Boulic",
  title =        "Parallel Inverse Kinematics for Multithreaded
                 Architectures",
  journal =      j-TOG,
  volume =       "35",
  number =       "2",
  pages =        "19:1--19:??",
  month =        may,
  year =         "2016",
  CODEN =        "ATGRDF",
  DOI =          "https://doi.org/10.1145/2887740",
  ISSN =         "0730-0301 (print), 1557-7368 (electronic)",
  ISSN-L =       "0730-0301",
  bibdate =      "Mon Jun 20 09:13:19 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tog/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tog.bib",
  abstract =     "In this article, we present a parallel prioritized
                 Jacobian-based inverse kinematics algorithm for
                 multithreaded architectures. We solve damped least
                 squares inverse kinematics using a parallel line search
                 by identifying and sampling critical input parameters.
                 Parallel competing execution paths are spawned for each
                 parameter in order to select the optimum that minimizes
                 the error criteria. Our algorithm is highly scalable
                 and can handle complex articulated bodies at
                 interactive frame rates. We show results on complex
                 skeletons consisting of more than 600 degrees of
                 freedom while being controlled using multiple end
                 effectors. We implement the algorithm both on multicore
                 and GPU architectures and demonstrate how the GPU can
                 further exploit fine-grain parallelism not directly
                 available on a multicore processor. Our implementations
                 are 10 to 150 times faster compared to a
                 state-of-the-art serial implementation while providing
                 higher accuracy. We also demonstrate the scalability of
                 the algorithm over multiple scenarios and explore the
                 GPU implementation in detail.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Graphics",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J778",
}

@Article{Hashemi:2016:EEB,
  author =       "Milad Hashemi and Debbie Marr and Doug Carmean and
                 Yale N. Patt",
  title =        "Efficient Execution of Bursty Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "85--88",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2456013",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The performance of user-facing applications is
                 critical to client platforms. Many of these
                 applications are event-driven and exhibit ``bursty''
                 behavior: the application is generally idle but
                 generates bursts of activity in response to human
                 interaction. We study one example of a bursty
                 application, web-browsers, and produce two important
                 insights: (1) Activity bursts contain false
                 parallelism, bringing many cores out of a deep sleep to
                 inefficiently render a single webpage, and (2) these
                 bursts are highly compute driven, and thus scale nearly
                 linearly with frequency. We show average performance
                 gains/energy reductions of 14\%/17\% respectively on
                 real hardware by statically moving threads from
                 multiple cores to a single core. We then propose
                 dynamic hardware driven thread migration and scheduling
                 enhancements that detect these bursts, leading to
                 further benefits.",
  acknowledgement = ack-nhfb,
  affiliation =  "Hashemi, M (Reprint Author), Univ Texas Austin, Elect
                 \& Comp Engn, Austin, TX 78701 USA. Hashemi, Milad;
                 Patt, Yale N., Univ Texas Austin, Elect \& Comp Engn,
                 Austin, TX 78701 USA. Marr, Debbie, Intel Corp, Intel
                 Labs, Portland, OR USA. Carmean, Doug, Microsoft,
                 Microsoft Res, Seattle, WA USA.",
  author-email = "miladh@hps.utexas.edu debbie.marr@intel.com
                 dcarmean@microsoft.com patt@hps.utexas.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Intel Corporation; Cockrell Foundation; HPS
                 Research Group",
  funding-text = "The authors thank Intel Corporation and the Cockrell
                 Foundation for their continued generous financial
                 support of the HPS Research Group.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Browsers; bursty applications; dynamic hardware;
                 Energy; energy reductions; Hardware; human computer
                 interaction; human interaction; Instruction sets;
                 Internet; Loading; multi-threading; Multicore
                 processing; multiple cores; multiprocessing systems;
                 online front-ends; Operating systems; performance;
                 performance evaluation; performance gains; power aware
                 computing; thread migration; thread scheduling;
                 Web-browsers; Webpage; webpages; webpages, thread
                 scheduling",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Hashemi:2016:EEB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Hu:2016:TDM,
  author =       "Qi Hu and Peng Liu and Michael C. Huang",
  title =        "Threads and Data Mapping: Affinity Analysis for
                 Traffic Reduction",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "133--136",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2451172",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Modern processors spend significant amount of time and
                 energy moving data. With the increase in core count,
                 the relative importance of such latency and energy
                 expenditure will only increase with time. Inter-core
                 communication traffic when executing a multithreaded
                 application is one such source of latency and energy
                 expenditure. This traffic is influenced by the mapping
                 of threads and data onto multicore systems. This paper
                 investigates the impact of threads and data mapping on
                 traffic in a chip-multiprocessor, and exploits the
                 potential for traffic reduction through threads and
                 data mapping. Based on the analysis and estimation of
                 the lowest traffic, we propose a threads and data
                 mapping mechanism to approach the lowest traffic. The
                 mapping takes both the correlation among threads and
                 the affinity of data with individual threads into
                 account, and results in significant traffic reduction
                 and energy savings.",
  acknowledgement = ack-nhfb,
  affiliation =  "Liu, P (Reprint Author), Zhejiang Univ, Coll Informat
                 Sci \& Elect Engn, Hangzhou 310027, Peoples R China.
                 Hu, Qi; Liu, Peng, Zhejiang Univ, Coll Informat Sci \&
                 Elect Engn, Hangzhou 310027, Peoples R China. Huang,
                 Michael C., Univ Rochester, Dept Elect \& Comp Engn,
                 601 Elmwood Ave, Rochester, NY 14627 USA.",
  author-email = "huqi\_isee@zju.edu.cn liupeng@zju.edu.cn
                 michael.huang@rochester.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSFC [61028004]; US National Science
                 Foundation (NSF) [1217662, 1255729]; Open Project
                 Program of the State Key Laboratory of Mathematical
                 Engineering and Advanced Computing [2014A08, 2015A09]",
  funding-text = "This work was supported by NSFC under grant 61028004,
                 and also in part by US National Science Foundation
                 (NSF) under grants 1217662 and 1255729, and the Open
                 Project Program of the State Key Laboratory of
                 Mathematical Engineering and Advanced Computing under
                 grants 2014A08 and 2015A09. P. Liu is the corresponding
                 author.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Mapping; memory; multicore; network-on-chip",
  keywords-plus = "NETWORKS; CACHES; CHIP",
  number-of-cited-references = "11",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Hu:2016:TDM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Huang:2016:MCR,
  author =       "Shiyou Huang and Jeff Huang",
  title =        "Maximal causality reduction for {TSO} and {PSO}",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "10",
  pages =        "447--461",
  month =        oct,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3022671.2984025",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:13 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Verifying concurrent programs is challenging due to
                 the exponentially large thread interleaving space. The
                 problem is exacerbated by relaxed memory models such as
                 Total Store Order (TSO) and Partial Store Order (PSO)
                 which further explode the interleaving space by
                 reordering instructions. A recent advance, Maximal
                 Causality Reduction (MCR), has shown great promise to
                 improve verification effectiveness by maximally
                 reducing redundant explorations. However, the original
                 MCR only works for the Sequential Consistency (SC)
                 memory model, but not for TSO and PSO. In this paper,
                 we develop novel extensions to MCR by solving two key
                 problems under TSO and PSO: (1) generating
                 interleavings that can reach new states by encoding the
                 operational semantics of TSO and PSO with first-order
                 logical constraints and solving them with SMT solvers,
                 and (2) enforcing TSO and PSO interleavings by
                 developing novel replay algorithms that allow
                 executions out of the program order. We show that our
                 approach successfully enables MCR to effectively
                 explore TSO and PSO interleavings. We have compared our
                 approach with a recent Dynamic Partial Order Reduction
                 (DPOR) algorithm for TSO and PSO and a SAT-based
                 stateless model checking approach. Our results show
                 that our approach is much more effective than the other
                 approaches for both state-space exploration and bug
                 finding --- on average it explores 5-10X fewer
                 executions and finds many bugs that the other tools
                 cannot find.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "OOPSLA '16 conference proceedings.",
}

@Article{Huang:2016:PMR,
  author =       "Jeff Huang and Arun K. Rajagopalan",
  title =        "Precise and maximal race detection from incomplete
                 traces",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "10",
  pages =        "462--476",
  month =        oct,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3022671.2984024",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:13 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "We present RDIT, a novel dynamic technique to detect
                 data races in multithreaded programs with incomplete
                 trace information, i.e., in the presence of missing
                 events. RDIT is both precise and maximal: it does not
                 report any false alarms and it detects a maximal set of
                 true races from the observed incomplete trace. RDIT is
                 underpinned by a sound BarrierPair model that abstracts
                 away the missing events by capturing the invocation
                 data of their enclosing methods. By making the least
                 conservative abstraction that a missing method
                 introduces synchronization only when it has a memory
                 address in scope that overlaps with other events or
                 other missing methods, and by formulating maximal
                 thread causality as logical constraints, RDIT
                 guarantees to precisely detect races with maximal
                 capability. RDIT has been applied in seven real-world
                 large concurrent systems and has detected dozens of
                 true races with zero false alarms. Comparatively,
                 existing algorithms such as Happens-Before,
                 Causal-Precedes, and Maximal-Causality which are known
                 to be precise all report many false alarms when missing
                 synchronizations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "OOPSLA '16 conference proceedings.",
}

@Article{Jiang:2016:TLH,
  author =       "Chuntao Jiang and Zhibin Yu and Lieven Eeckhout and
                 Hai Jin and Xiaofei Liao and Chengzhong Xu",
  title =        "Two-Level Hybrid Sampled Simulation of Multithreaded
                 Applications",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "39:1--39:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2818353",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Sampled microarchitectural simulation of
                 single-threaded applications is mature technology for
                 over a decade now. Sampling multithreaded applications,
                 on the other hand, is much more complicated. Not until
                 very recently have researchers proposed solutions for
                 sampled simulation of multithreaded applications.
                 Time-Based Sampling (TBS) samples multithreaded
                 application execution based on time---not instructions
                 as is typically done for single-threaded
                 applications---yielding estimates for a multithreaded
                 application's execution time. In this article, we
                 revisit and analyze previously proposed TBS approaches
                 (periodic and cantor fractal based sampling), and we
                 obtain a number of novel and surprising insights, such
                 as (i) accurately estimating fast-forwarding IPC, that
                 is, performance in-between sampling units, is more
                 important than accurately estimating sample IPC, that
                 is, performance within the sampling units; (ii)
                 fast-forwarding IPC estimation accuracy is determined
                 by both the sampling unit distribution and how to use
                 the sampling units to predict fast-forwarding IPC; and
                 (iii) cantor sampling is more accurate at small
                 sampling unit sizes, whereas periodic is more accurate
                 at large sampling unit sizes. These insights lead to
                 the development of Two-level Hybrid Sampling (THS), a
                 novel sampling methodology for multithreaded
                 applications that combines periodic sampling's accuracy
                 at large time scales (i.e., uniformly selecting
                 coarse-grain sampling units across the entire program
                 execution) with cantor sampling's accuracy at small
                 time scales (i.e., the ability to accurately predict
                 fast-forwarding IPC in-between small sampling units).
                 The clustered occurrence of small sampling units under
                 cantor sampling also enables shortened warmup and thus
                 enhanced simulation speed. Overall, THS achieves an
                 average absolute execution time prediction error of 4\%
                 while yielding an average simulation speedup of 40 $
                 \times $ compared to detailed simulation, which is both
                 more accurate and faster than the current
                 state-of-the-art. Case studies illustrate THS' ability
                 to accurately predict relative performance differences
                 across the design space.",
  acknowledgement = ack-nhfb,
  articleno =    "39",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Jung:2016:LPS,
  author =       "Daejin Jung and Sheng Li and Jung Ho Ahn",
  title =        "Large Pages on Steroids: Small Ideas to Accelerate Big
                 Memory Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "101--104",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2495103",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Utilizing small (e.g., 4 KB) pages incurs frequent TLB
                 misses on modern big memory applications, substantially
                 degrading the performance of the system. Large (e.g., 1
                 GB) pages or direct segments can alleviate this penalty
                 due to page table walks, but at the same time such a
                 strategy exposes the organizational and operational
                 details of modern DRAM-based memory systems to
                 applications. Row-buffer conflicts caused by accesses
                 heading to the same DRAM bank but different rows from
                 multiple threads are regarded as the main culprits
                 behind the very large gaps between peak and achieved
                 main memory throughput, but hardware-based approaches
                 in memory controllers have achieved only limited
                 success whereas existing proposals that change memory
                 allocators cannot be applied to large pages or direct
                 segments. In this paper, we propose a set of
                 application-level techniques to improve the effective
                 main memory bandwidth. The techniques stem from the two
                 key observations that (1) each thread of an application
                 exclusively accesses certain datasets for a short or
                 long period of time, and (2) superfluous memory reads
                 originating from a cache's write allocation policy can
                 be avoided if scatters during the data shuffling pass
                 through intermediate cache-friendly buffers.
                 Experiments with a contemporary x86 server show that
                 combining large pages with the proposed address
                 linearization, bank coloring, and write streaming
                 techniques improves the performance of the three big
                 memory applications of high-throughput key-value store,
                 fast-Fourier transform, and radix sort by 37.6, 22.9,
                 and 68.1 percent, respectively.",
  acknowledgement = ack-nhfb,
  affiliation =  "Jung, D (Reprint Author), Seoul Natl Univ, Dept
                 Transdisciplinary Studies, Seoul, South Korea. Jung,
                 Daejin; Ahn, Jung Ho, Seoul Natl Univ, Dept
                 Transdisciplinary Studies, Seoul, South Korea. Li,
                 Sheng, Intel Labs, Santa Clara, CA USA. Ahn, Jung Ho,
                 Seoul Natl Univ, Big Data Inst, Seoul, South Korea.",
  author-email = "haidj@snu.ac.kr sheng.r.li@intel.com gajh@snu.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Research Foundation of Korea -
                 Korea government [NRF-2014R1A2A1A11052936,
                 NRF-2012M3A9D1054622]",
  funding-text = "The authors thank Jongwook Chung and Jaeyoon Choi on
                 their contributions to application writing and
                 experiments. This work was partially supported by the
                 National Research Foundation of Korea grant funded by
                 the Korea government (NRF-2014R1A2A1A11052936 and
                 NRF-2012M3A9D1054622). Jung Ho Ahn is also with Big
                 Data Institute, Seoul National University.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "address linearization; application-level techniques;
                 Bandwidth; bank coloring; big memory applications;
                 cache storage; cache write allocation policy;
                 cache-friendly buffers; data shuffling; DRAM bank; DRAM
                 chips; DRAM-based memory; fast-Fourier transform;
                 high-throughput key-value store; Instruction sets;
                 large pages; memory allocators; memory bandwidth;
                 memory controllers; Memory management; memory
                 throughput; multi-threading; multiple threads;
                 Performance gain; Physical-to-DRAM address mapping;
                 radix sort; Random access memory; row-buffer conflicts;
                 Servers; superfluous memory reads; write streaming",
  number-of-cited-references = "14",
  ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Jung:2016:LPS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% TACO 12(4), article 55, January 2016: FluidCheck, a redundant
%%% threading checker architecture for soft-error-safe execution on
%%% SMT manycore processors.  End page unknown (55:??).
@Article{Kalayappan:2016:FRT,
  author =       "Rajshekar Kalayappan and Smruti R. Sarangi",
  title =        "{FluidCheck}: a Redundant Threading-Based Approach for
                 Reliable Execution in Manycore Processors",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "55:1--55:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2842620",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Soft errors have become a serious cause of concern
                 with reducing feature sizes. The ability to accommodate
                 complex, Simultaneous Multithreading (SMT) cores on a
                 single chip presents a unique opportunity to achieve
                 reliable execution, safe from soft errors, with low
                 performance penalties. In this context, we present
                 FluidCheck, a checker architecture that allows highly
                 flexible assignment and migration of checking duties
                 across cores. In this article, we present a mechanism
                 to dynamically use the resources of SMT cores for
                 checking the results of other threads, and propose a
                 variety of heuristics for migration of such checker
                 threads across cores. Secondly, to make the process of
                 checking more efficient, we propose a set of
                 architectural enhancements that reduce power
                 consumption, decrease the length of the critical path,
                 and reduce the load on the Network-on-Chip (NoC). Based
                 on our observations, we design a 16 core system for
                 running SPEC2006 based bag-of-tasks applications. Our
                 experiments demonstrate that fully reliable execution
                 can be attained with a mere 27\% slowdown, surpassing
                 traditional redundant threading based techniques by
                 roughly 42\%.",
  acknowledgement = ack-nhfb,
  articleno =    "55",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% Theoretical Computer Science 656 (Part B):173--179, 20 December
%%% 2016.  NOTE(review): no DOI field is recorded for this entry;
%%% only the ScienceDirect URL is present --- confirm DOI upstream.
@Article{Kim:2016:SEA,
  author =       "Youngho Kim and Joong Chae Na and Heejin Park and
                 Jeong Seop Sim",
  title =        "A space-efficient alphabet-independent
                 {Four-Russians}' lookup table and a multithreaded
                 {Four-Russians}' edit distance algorithm",
  journal =      j-THEOR-COMP-SCI,
  volume =       "656 (Part B)",
  number =       "??",
  pages =        "173--179",
  day =          "20",
  month =        dec,
  year =         "2016",
  CODEN =        "TCSCDI",
  ISSN =         "0304-3975 (print), 1879-2294 (electronic)",
  ISSN-L =       "0304-3975",
  bibdate =      "Fri Dec 9 12:17:02 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tcs2015.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0304397516300676",
  acknowledgement = ack-nhfb,
  fjournal =     "Theoretical Computer Science",
  journal-URL =  "http://www.sciencedirect.com/science/journal/03043975/",
}

%%% STTT 18(1):109--120, February 2016: abstraction/refinement of
%%% mathematical functions for SMT-based test-case generation.
@Article{Kutsuna:2016:ARM,
  author =       "Takuro Kutsuna and Yoshinao Ishii",
  title =        "Abstraction and refinement of mathematical functions
                 toward {SMT}-based test-case generation",
  journal =      j-INT-J-SOFTW-TOOLS-TECHNOL-TRANSFER,
  volume =       "18",
  number =       "1",
  pages =        "109--120",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1007/s10009-015-0389-7",
  ISSN =         "1433-2779 (print), 1433-2787 (electronic)",
  ISSN-L =       "1433-2779",
  bibdate =      "Mon Jan 25 08:12:53 MST 2016",
  bibsource =    "http://link.springer.com/journal/10009/18/1;
                 https://www.math.utah.edu/pub/tex/bib/elefunt.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sttt.bib",
  URL =          "http://link.springer.com/article/10.1007/s10009-015-0389-7",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal on Software Tools for Technology
                 Transfer (STTT)",
  journal-URL =  "http://link.springer.com/journal/10009",
}

%%% IEEE Computer Architecture Letters 15(2):73--76, July/December
%%% 2016: quantitative metric (Data Reuse Degree) for data reuse in
%%% SIMT applications.  Entry carries Web of Science export fields
%%% (affiliation, doc-delivery-number, times-cited, etc.).
@Article{Lai:2016:QMD,
  author =       "Bo-Cheng Charles Lai and Luis Garrido Platero and
                 Hsien-Kai Kuo",
  title =        "A Quantitative Method to Data Reuse Patterns of {SIMT}
                 Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "73--76",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2491279",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Understanding data reuse patterns of a computing
                 system is crucial to effective design optimization. The
                 emerging Single Instruction Multiple Threads (SIMT)
                 processor adopts a programming model that is
                 fundamentally disparate from conventional scalar
                 processors. There is a lack of analytical approaches to
                 quantify the data reuse of SIMT applications. This
                 paper presents a quantitative method to study the data
                 reuse inherent to SIMT applications. A metric, Data
                 Reuse Degree, is defined to measure the amount of
                 reused data between memory references, and associate
                 each data reuse degree to a temporal distance
                 representing the virtual time of the execution process.
                 The experiments are performed on an abstracted SIMT
                 processor that considers the programming model and
                 runtime specifics. The experiments illustrate diverse
                 data reuse patterns of SIMT applications and explore
                 the impacts of architectural limitations.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lai, BCC (Reprint Author), Natl Chiao Tung Univ, Dept
                 Elect Engn, Hsinchu 300, Taiwan. Lai, Bo-Cheng Charles,
                 Natl Chiao Tung Univ, Dept Elect Engn, Hsinchu 300,
                 Taiwan. Platero, Luis Garrido, Barcelona Super Comp
                 Ctr, Barcelona, Spain. Kuo, Hsien-Kai, MediaTek Inc,
                 Hsinchu, Taiwan.",
  author-email = "bclai@mail.nctu.edu.tw luis.garrido.platero@gmail.com
                 hsienkai.kuo@gmail.com",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "MOST [104-2221-E-009-079]",
  funding-text = "This project was supported by MOST grant
                 104-2221-E-009-079.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architectural limitations; cache memory; Cache memory;
                 computing system; data analysis; data reuse degree;
                 data reuse patterns; design optimization; execution
                 process; Graphics processing units; Instruction sets;
                 Measurement; Memory management; multi-threading;
                 Parallel architectures; Parallel architectures, cache
                 memory, parallel processing; parallel processing;
                 Parallel processing; programming model; scalar
                 processors; SIMT applications; SIMT processors;
                 single-instruction multiple-threads processors; virtual
                 time",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Lai:2016:QMD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Proceedings of the VLDB Endowment 9(14):1647--1658, October 2016:
%%% HippogriffDB, a GPU-accelerated OLAP system balancing I/O and GPU
%%% bandwidth.  NOTE(review): neither a DOI nor a URL is recorded for
%%% this entry --- check the PVLDB archive.
@Article{Li:2016:HBG,
  author =       "Jing Li and Hung-Wei Tseng and Chunbin Lin and Yannis
                 Papakonstantinou and Steven Swanson",
  title =        "{HippogriffDB}: balancing {I/O} and {GPU} bandwidth in
                 big data analytics",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "9",
  number =       "14",
  pages =        "1647--1658",
  month =        oct,
  year =         "2016",
  CODEN =        "????",
  ISSN =         "2150-8097",
  bibdate =      "Wed Oct 12 10:14:56 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "As data sets grow and conventional processor
                 performance scaling slows, data analytics move towards
                 heterogeneous architectures that incorporate hardware
                 accelerators (notably GPUs) to continue scaling
                 performance. However, existing GPU-based databases fail
                 to deal with big data applications efficiently: their
                 execution model suffers from scalability limitations on
                 GPUs whose memory capacity is limited; existing systems
                 fail to consider the discrepancy between fast GPUs and
                 slow storage, which can counteract the benefit of GPU
                 accelerators. In this paper, we propose HippogriffDB,
                 an efficient, scalable GPU-accelerated OLAP system. It
                 tackles the bandwidth discrepancy using compression and
                 an optimized data transfer path. HippogriffDB stores
                 tables in a compressed format and uses the GPU for
                 decompression, trading GPU cycles for the improved I/O
                 bandwidth. To improve the data transfer efficiency,
                 HippogriffDB introduces a peer-to-peer, multi-threaded
                 data transfer mechanism, directly transferring data
                 from the SSD to the GPU. HippogriffDB adopts a
                 query-over-block execution model that provides
                 scalability using a stream-based approach. The model
                 improves kernel efficiency with the operator fusion and
                 double buffering mechanism. We have implemented
                 HippogriffDB using an NVMe SSD, which talks directly to
                 a commercial GPU. Results on two popular benchmarks
                 demonstrate its scalability and efficiency.
                 HippogriffDB outperforms existing GPU-based databases
                 (YDB) and in-memory data analytics (MonetDB) by 1-2
                 orders of magnitude.",
  acknowledgement = ack-nhfb,
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% IEEE/ACM TCBB 13(5):845--854, September 2016: GPU-oriented
%%% phylogenetic analysis with MrBayes.  NOTE(review): title
%%% superscript restored as tgMC$^3$++ to match the tgMC$^3$
%%% notation used throughout the entry's own abstract.
@Article{Ling:2016:MTH,
  author =       "Cheng Ling and Tsuyoshi Hamada and Jingyang Gao and
                 Guoguang Zhao and Donghong Sun and Weifeng Shi",
  title =        "{MrBayes tgMC$^3$++}: a High Performance and
                 Resource-Efficient {GPU}-Oriented Phylogenetic Analysis
                 Method",
  journal =      j-TCBB,
  volume =       "13",
  number =       "5",
  pages =        "845--854",
  month =        sep,
  year =         "2016",
  CODEN =        "ITCBCY",
  DOI =          "https://doi.org/10.1109/TCBB.2015.2495202",
  ISSN =         "1545-5963 (print), 1557-9964 (electronic)",
  ISSN-L =       "1545-5963",
  bibdate =      "Fri Dec 30 16:19:30 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tcbb.bib",
  abstract =     "MrBayes is a widespread phylogenetic inference tool
                 harnessing empirical evolutionary models and Bayesian
                 statistics. However, the computational cost on the
                 likelihood estimation is very expensive, resulting in
                 undesirably long execution time. Although a number of
                 multi-threaded optimizations have been proposed to
                 speed up MrBayes, there are bottlenecks that severely
                 limit the GPU thread-level parallelism of likelihood
                 estimations. This study proposes a high performance and
                 resource-efficient method for GPU-oriented
                 parallelization of likelihood estimations. Instead of
                 having to rely on empirical programming, the proposed
                 novel decomposition storage model implements high
                 performance data transfers implicitly. In terms of
                 performance improvement, a speedup factor of up to 178
                 can be achieved on the analysis of simulated datasets
                 by four Tesla K40 cards. In comparison to the other
                 publicly available GPU-oriented MrBayes, the tgMC$^3$
                 ++ method proposed herein outperforms the tgMC$^3$
                 v1.0, nMC$^3$ v2.1.1 and oMC$^3$ v1.00 methods by
                 speedup factors of up to 1.6, 1.9 and 2.9,
                 respectively. Moreover, tgMC$^3$ ++ supports more
                 evolutionary models and gamma categories, which
                 previous GPU-oriented methods fail to take into
                 analysis.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE/ACM Transactions on Computational Biology and
                 Bioinformatics",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J954",
}

%%% IEEE/ACM TCBB 13(3):592--598, May 2016: ParaBWT, parallel
%%% space-efficient BWT and suffix-array construction for big genome
%%% data.  NOTE(review): no DOI field is recorded for this entry ---
%%% confirm against the IEEE Xplore record.
@Article{Liu:2016:PSE,
  author =       "Yongchao Liu and Thomas Hankeln and Bertil Schmidt",
  title =        "Parallel and space-efficient construction of
                 {Burrows--Wheeler} transform and suffix array for big
                 genome data",
  journal =      j-TCBB,
  volume =       "13",
  number =       "3",
  pages =        "592--598",
  month =        may,
  year =         "2016",
  CODEN =        "ITCBCY",
  ISSN =         "1545-5963 (print), 1557-9964 (electronic)",
  ISSN-L =       "1545-5963",
  bibdate =      "Mon Aug 29 06:50:39 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tcbb.bib",
  abstract =     "Next-generation sequencing technologies have led to
                 the sequencing of more and more genomes, propelling
                 related research into the era of big data. In this
                 paper, we present ParaBWT, a parallelized
                 Burrows--Wheeler transform (BWT) and suffix array
                 construction algorithm for big genome data. In ParaBWT,
                 we have investigated a progressive construction
                 approach to constructing the BWT of single genome
                 sequences in linear space complexity, but with a small
                 constant factor. This approach has been further
                 parallelized using multi-threading based on a
                 master-slave coprocessing model. After gaining the BWT,
                 the suffix array is constructed in a memory-efficient
                 manner. The performance of ParaBWT has been evaluated
                 using two sequences generated from two human genome
                 assemblies: the Ensembl Homo sapiens assembly and the
                 human reference genome. Our performance comparison to
                 FMD-index and Bwt-disk reveals that on 12 CPU cores,
                 ParaBWT runs up to $ 2.2 \times $ faster than FMD-index
                 and up to $ 99.0 \times $ faster than Bwt-disk. BWT
                 construction algorithms for very long genomic sequences
                 are time consuming and (due to their incremental
                 nature) inherently difficult to parallelize. Thus,
                 their parallelization is challenging and even
                 relatively small speedups like the ones of our method
                 over FMD-index are of high importance to research.
                 ParaBWT is written in C++, and is freely available at
                 http://parabwt.sourceforge.net.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE/ACM Transactions on Computational Biology and
                 Bioinformatics",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J954",
}

%%% TACO 12(4), article 60, January 2016: Sensible Energy Accounting
%%% (SEA) with abstract metering for multicore systems.  End page
%%% unknown (60:??).
@Article{Liu:2016:SEA,
  author =       "Qixiao Liu and Miquel Moreto and Jaume Abella and
                 Francisco J. Cazorla and Daniel A. Jimenez and Mateo
                 Valero",
  title =        "Sensible Energy Accounting with Abstract Metering for
                 Multicore Systems",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "60:1--60:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2842616",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Chip multicore processors (CMPs) are the preferred
                 processing platform across different domains such as
                 data centers, real-time systems, and mobile devices. In
                 all those domains, energy is arguably the most
                 expensive resource in a computing system. Accurately
                 quantifying energy usage in a multicore environment
                 presents a challenge as well as an opportunity for
                 optimization. Standard metering approaches are not
                 capable of delivering consistent results with shared
                 resources, since the same task with the same inputs may
                 have different energy consumption based on the mix of
                 co-running tasks. However, it is reasonable for
                 data-center operators to charge on the basis of
                 estimated energy usage rather than time since energy is
                 more correlated with their actual cost. This article
                 introduces the concept of Sensible Energy Accounting
                 (SEA). For a task running in a multicore system, SEA
                 accurately estimates the energy the task would have
                 consumed running in isolation with a given fraction of
                 the CMP shared resources. We explain the potential
                 benefits of SEA in different domains and describe two
                 hardware techniques to implement it for a shared
                 last-level cache and on-core resources in SMT
                 processors. Moreover, with SEA, an energy-aware
                 scheduler can find a highly efficient on-chip resource
                 assignment, reducing by up to 39\% the total processor
                 energy for a 4-core system.",
  acknowledgement = ack-nhfb,
  articleno =    "60",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 13(1), article 13, April 2016: thread-aware adaptive data
%%% prefetching for multithreaded workloads on multicore systems.
%%% End page unknown (13:??).
@Article{Liu:2016:TAA,
  author =       "Peng Liu and Jiyang Yu and Michael C. Huang",
  title =        "Thread-Aware Adaptive Prefetcher on Multicore Systems:
                 Improving the Performance for Multithreaded Workloads",
  journal =      j-TACO,
  volume =       "13",
  number =       "1",
  pages =        "13:1--13:??",
  month =        apr,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2890505",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Apr 5 16:27:36 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Most processors employ hardware data prefetching
                 techniques to hide memory access latencies. However,
                 the prefetching requests from different threads on a
                 multicore processor can cause severe interference with
                 prefetching and/or demand requests of others. The data
                 prefetching can lead to significant performance
                 degradation due to shared resource contention on shared
                 memory multicore systems. This article proposes a
                 thread-aware data prefetching mechanism based on
                 low-overhead runtime information to tune prefetching
                 modes and aggressiveness, mitigating the resource
                 contention in the memory system. Our solution has three
                 new components: (1) a self-tuning prefetcher that uses
                 runtime feedback to dynamically adjust data prefetching
                 modes and arguments of each thread, (2) a filtering
                 mechanism that informs the hardware about which
                 prefetching request can cause shared data invalidation
                 and should be discarded, and (3) a limiter thread
                 acceleration mechanism to estimate and accelerate the
                 critical thread which has the longest completion time
                 in the parallel region of execution. On a set of
                 multithreaded parallel benchmarks, our thread-aware
                 data prefetching mechanism improves the overall
                 performance of 64-core system by 13\% over a multimode
                 prefetch baseline system with two-level cache
                 organization and conventional modified, exclusive,
                 shared, and invalid-based directory coherence protocol.
                 We compare our approach with the feedback directed
                 prefetching technique and find that it provides 9\%
                 performance improvement on multicore systems, while
                 saving the memory bandwidth consumption.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% ACM TOCS 33(4), article 13, January 2016: Remote Core Locking
%%% (RCL), replacing lock acquisitions with remote procedure calls to
%%% a dedicated server hardware thread.  End page unknown (13:??).
@Article{Lozi:2016:FPL,
  author =       "Jean-Pierre Lozi and Florian David and Ga{\"e}l Thomas
                 and Julia Lawall and Gilles Muller",
  title =        "Fast and Portable Locking for Multicore
                 Architectures",
  journal =      j-TOCS,
  volume =       "33",
  number =       "4",
  pages =        "13:1--13:??",
  month =        jan,
  year =         "2016",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2845079",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 6 06:45:30 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "The scalability of multithreaded applications on
                 current multicore systems is hampered by the
                 performance of lock algorithms, due to the costs of
                 access contention and cache misses. The main
                 contribution presented in this article is a new locking
                 technique, Remote Core Locking (RCL), that aims to
                 accelerate the execution of critical sections in legacy
                 applications on multicore architectures. The idea of
                 RCL is to replace lock acquisitions by optimized remote
                 procedure calls to a dedicated server hardware thread.
                 RCL limits the performance collapse observed with other
                 lock algorithms when many threads try to acquire a lock
                 concurrently and removes the need to transfer
                 lock-protected shared data to the hardware thread
                 acquiring the lock, because such data can typically
                 remain in the server's cache. Other contributions
                 presented in this article include a profiler that
                 identifies the locks that are the bottlenecks in
                 multithreaded applications and that can thus benefit
                 from RCL, and a reengineering tool that transforms
                 POSIX lock acquisitions into RCL locks. Eighteen
                 applications were used to evaluate RCL: the nine
                 applications of the SPLASH-2 benchmark suite, the seven
                 applications of the Phoenix 2 benchmark suite,
                 Memcached, and Berkeley DB with a TPC-C client. Eight
                 of these applications are unable to scale because of
                 locks and benefit from RCL on an x86 machine with four
                 AMD Opteron processors and 48 hardware threads. By
                 using RCL instead of Linux POSIX locks, performance is
                 improved by up to 2.5 times on Memcached, and up to
                 11.6 times on Berkeley DB with the TPC-C client. On a
                 SPARC machine with two Sun Ultrasparc T2+ processors
                 and 128 hardware threads, three applications benefit
                 from RCL. In particular, performance is improved by up
                 to 1.3 times with respect to Solaris POSIX locks on
                 Memcached, and up to 7.9 times on Berkeley DB with the
                 TPC-C client.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

%%% ACM TECS 15(3), article 57, July 2016: vector coprocessor
%%% virtualization for simultaneous multithreading, with vector
%%% register file virtualization.  End page unknown (57:??).
@Article{Lu:2016:VCV,
  author =       "Yaojie Lu and Seyedamin Rooholamin and Sotirios G.
                 Ziavras",
  title =        "Vector Coprocessor Virtualization for Simultaneous
                 Multithreading",
  journal =      j-TECS,
  volume =       "15",
  number =       "3",
  pages =        "57:1--57:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2898364",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Jul 21 17:18:13 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tecs.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Vector coprocessors (VPs), commonly being assigned
                 exclusively to a single thread/core, are not often
                 performance and energy efficient due to mismatches with
                 the vector needs of individual applications. We present
                 in this article an easy-to-implement VP virtualization
                 technique that, when applied, enables a multithreaded
                 VP to simultaneously execute multiple threads of
                 similar or arbitrary vector lengths to achieve improved
                 aggregate utilization. With a vector register file
                 (VRF) virtualization technique invented to dynamically
                 allocate physical vector registers to threads, our VP
                 virtualization approach improves programmer
                 productivity by providing at runtime a distinct
                 physical register name space to each competing thread,
                 thus eliminating the need to solve register-name
                 conflicts statically. We applied our virtualization
                 technique to a multithreaded VP and prototyped an
                 FPGA-based multicore processor system that supports VP
                 sharing as well as power gating for better energy
                 efficiency. Under the dynamic creation of disparate
                 threads, our benchmarking results show impressive VP
                 speedups of up to 333\% and total energy savings of up
                 to 37\% with proper thread scheduling and power gating
                 compared to a similar-sized system that allows VP
                 access to just one thread at a time.",
  acknowledgement = ack-nhfb,
  articleno =    "57",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
}

%%% ACM TOSEM 25(2), article 14, May 2016: Symbiosis, concurrency
%%% debugging via differential schedule projections (DSPs).  End page
%%% unknown (14:??).
@Article{Machado:2016:CDD,
  author =       "Nuno Machado and Daniel Quinta and Brandon Lucia and
                 Lu{\'\i}s Rodrigues",
  title =        "Concurrency Debugging with Differential Schedule
                 Projections",
  journal =      j-TOSEM,
  volume =       "25",
  number =       "2",
  pages =        "14:1--14:??",
  month =        may,
  year =         "2016",
  CODEN =        "ATSMER",
  DOI =          "https://doi.org/10.1145/2885495",
  ISSN =         "1049-331X (print), 1557-7392 (electronic)",
  ISSN-L =       "1049-331X",
  bibdate =      "Mon May 16 16:22:08 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tosem/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tosem.bib",
  abstract =     "We present Symbiosis: a concurrency debugging
                 technique based on novel differential schedule
                 projections (DSPs). A DSP shows the small set of memory
                 operations and dataflows responsible for a failure, as
                 well as a reordering of those elements that avoids the
                 failure. To build a DSP, Symbiosis first generates a
                 full, failing, multithreaded schedule via thread path
                 profiling and symbolic constraint solving. Symbiosis
                 selectively reorders events in the failing schedule to
                 produce a nonfailing, alternate schedule. A DSP reports
                 the ordering and dataflow differences between the
                 failing and nonfailing schedules. Our evaluation on
                 buggy real-world software and benchmarks shows that, in
                 practical time, Symbiosis generates DSPs that both
                 isolate the small fraction of event orders and
                 dataflows responsible for the failure and report which
                 event reorderings prevent failing. In our experiments,
                 DSPs contain 90\% fewer events and 96\% fewer dataflows
                 than the full failure-inducing schedules. We also
                 conducted a user study that shows that, by allowing
                 developers to focus on only a few events, DSPs reduce
                 the amount of time required to understand the bug's
                 root cause and find a valid fix.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Software Engineering and
                 Methodology",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J790",
}

%%% NOTE(review): the published TOPLAS title is ``DRFx: An
%%% Understandable, High Performance, and Flexible Memory Model for
%%% Concurrent Languages'' (the trailing x is a subscript in print);
%%% the earlier ``{drf x}: an ...'' rendering appears to be a
%%% text-extraction artifact.  The abstract below still shows the
%%% same ``drf x'' rendering from the source metadata; it is left
%%% untouched as transcribed data.
@Article{Marino:2016:DXU,
  author =       "Daniel Marino and Abhayendra Singh and Todd Millstein
                 and Madanlal Musuvathi and Satish Narayanasamy",
  title =        "{DRFx}: An Understandable, High Performance, and
                 Flexible Memory Model for Concurrent Languages",
  journal =      j-TOPLAS,
  volume =       "38",
  number =       "4",
  pages =        "16:1--16:??",
  month =        oct,
  year =         "2016",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/2925988",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Tue Oct 18 11:41:44 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toplas.bib",
  abstract =     "The most intuitive memory model for shared-memory
                 multi-threaded programming is sequential consistency
                 (SC), but it disallows the use of many compiler and
                 hardware optimizations and thus affects performance.
                 Data-race-free (DRF) models, such as the C++11 memory
                 model, guarantee SC execution for data-race-free
                 programs. But these models provide no guarantee at all
                 for racy programs, compromising the safety and
                 debuggability of such programs. To address the safety
                 issue, the Java memory model, which is also based on
                 the DRF model, provides a weak semantics for racy
                 executions. However, this semantics is subtle and
                 complex, making it difficult for programmers to reason
                 about their programs and for compiler writers to ensure
                 the correctness of compiler optimizations. We present
                 the drf x memory model, which is simple for programmers
                 to understand and use while still supporting many
                 common optimizations. We introduce a memory model (MM)
                 exception that can be signaled to halt execution. If a
                 program executes without throwing this exception, then
                 drf x guarantees that the execution is SC. If a program
                 throws an MM exception during an execution, then drf x
                 guarantees that the program has a data race. We observe
                 that SC violations can be detected in hardware through
                 a lightweight form of conflict detection. Furthermore,
                 our model safely allows aggressive compiler and
                 hardware optimizations within compiler-designated
                 program regions. We formalize our memory model, prove
                 several properties of this model, describe a compiler
                 and hardware design suitable for drf x, and evaluate
                 the performance overhead due to our compiler and
                 hardware requirements.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

%%% ACM Journal of Experimental Algorithmics article (2016).  The
%%% pages field uses the article-number form 1.9:1--1.9:?? (final
%%% page unknown), matching the articleno field at the end of the
%%% entry; CODEN "????" is this file's placeholder for ``unknown''.
@Article{Marinov:2016:PAF,
  author =       "Martin Marinov and Nicholas Nash and David Gregg",
  title =        "Practical Algorithms for Finding Extremal Sets",
  journal =      j-ACM-J-EXP-ALGORITHMICS,
  volume =       "21",
  number =       "1",
  pages =        "1.9:1--1.9:??",
  month =        nov,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2893184",
  ISSN =         "1084-6654",
  bibdate =      "Fri Nov 4 16:46:55 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jea.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The minimal sets within a collection of sets are
                 defined as the ones that do not have a proper subset
                 within the collection, and the maximal sets are the
                 ones that do not have a proper superset within the
                 collection. Identifying extremal sets is a fundamental
                 problem with a wide range of applications in SAT
                 solvers, data mining, and social network analysis. In
                 this article, we present two novel improvements of the
                 high-quality extremal set identification algorithm,
                 AMS-Lex, described by Bayardo and Panda. The first
                 technique uses memoization to improve the execution
                 time of the single-threaded variant of the AMS-Lex,
                 while our second improvement uses parallel programming
                 methods. In a subset of the presented experiments, our
                 memoized algorithm executes more than 400 times faster
                 than the highly efficient publicly available
                 implementation of AMS-Lex. Moreover, we show that our
                 modified algorithm's speedup is not bounded above by a
                 constant and that it increases as the length of the
                 common prefixes in successive input itemsets increases.
                 We provide experimental results using both real-world
                 and synthetic datasets, and show our multithreaded
                 variant algorithm outperforming AMS-Lex by 3 to 6
                 times. We find that on synthetic input datasets, when
                 executed using 16 CPU cores of a 32-core machine, our
                 multithreaded program executes about as fast as the
                 state-of-the-art parallel GPU-based program using an
                 NVIDIA GTX 580 graphics processing unit.",
  acknowledgement = ack-nhfb,
  articleno =    "1.9",
  fjournal =     "Journal of Experimental Algorithmics (JEA)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J430",
}

%%% IEEE Computer Graphics and Applications article on VTK-m (2016).
%%% NOTE(review): this entry carries a publisher URL but no DOI
%%% field, unlike the neighboring entries; a DOI for IEEE CG&A
%%% 36(3):48--58 likely exists --- verify and add if available.
@Article{Moreland:2016:VMA,
  author =       "Kenneth Moreland and Christopher Sewell and William
                 Usher and Li-ta Lo and Jeremy Meredith and David
                 Pugmire and James Kress and Hendrik Schroots and
                 Kwan-Liu Ma and Hank Childs and Matthew Larsen and
                 Chun-Ming Chen and Robert Maynard and Berk Geveci",
  title =        "{VTK-m}: Accelerating the Visualization Toolkit for
                 Massively Threaded Architectures",
  journal =      j-IEEE-CGA,
  volume =       "36",
  number =       "3",
  pages =        "48--58",
  month =        may # "\slash " # jun,
  year =         "2016",
  CODEN =        "ICGADZ",
  ISSN =         "0272-1716 (print), 1558-1756 (electronic)",
  ISSN-L =       "0272-1716",
  bibdate =      "Wed Oct 5 07:24:20 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecga.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://www.computer.org/csdl/mags/cg/2016/03/mcg2016030048-abs.html",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=38",
}

%%% ACM SIGPLAN Notices article; per the remark field this issue
%%% reprints the PPoPP '16 conference proceedings, hence the
%%% combined-form DOI 10.1145/3016078.2851165.
@Article{Narayanaswamy:2016:VCA,
  author =       "Ganesh Narayanaswamy and Saurabh Joshi and Daniel
                 Kroening",
  title =        "The virtues of conflict: analysing modern
                 concurrency",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "8",
  pages =        "25:1--25:??",
  month =        aug,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3016078.2851165",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:12 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Modern shared memory multiprocessors permit reordering
                 of memory operations for performance reasons. These
                 reorderings are often a source of subtle bugs in
                 programs written for such architectures. Traditional
                 approaches to verify weak memory programs often rely on
                 interleaving semantics, which is prone to state space
                 explosion, and thus severely limits the scalability of
                 the analysis. In recent times, there has been a renewed
                 interest in modelling dynamic executions of weak memory
                 programs using partial orders. However, such an
                 approach typically requires ad-hoc mechanisms to
                 correctly capture the data and control-flow
                 choices/conflicts present in real-world programs. In
                 this work, we propose a novel, conflict-aware,
                 composable, truly concurrent semantics for programs
                 written using C/C++ for modern weak memory
                 architectures. We exploit our symbolic semantics based
                 on general event structures to build an efficient
                 decision procedure that detects assertion violations in
                 bounded multi-threaded programs. Using a large,
                 representative set of benchmarks, we show that our
                 conflict-aware semantics outperforms the
                 state-of-the-art partial-order based approaches.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '16 conference proceedings.",
}

%%% NOTE(review): author name corrected from ``Pedro Tomas'' to
%%% ``Pedro Tom{\'a}s'' (TeX accent form, as this classic-BibTeX
%%% file uses elsewhere for non-ASCII names); verified against the
%%% IEEE/ACM TCBB record for DOI 10.1109/TCBB.2015.2495149.
@Article{Nogueira:2016:BBW,
  author =       "David Nogueira and Pedro Tom{\'a}s and Nuno Roma",
  title =        "{BowMapCL}: {Burrows--Wheeler} Mapping on Multiple
                 Heterogeneous Accelerators",
  journal =      j-TCBB,
  volume =       "13",
  number =       "5",
  pages =        "926--938",
  month =        sep,
  year =         "2016",
  CODEN =        "ITCBCY",
  DOI =          "https://doi.org/10.1109/TCBB.2015.2495149",
  ISSN =         "1545-5963 (print), 1557-9964 (electronic)",
  ISSN-L =       "1545-5963",
  bibdate =      "Fri Dec 30 16:19:30 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tcbb.bib",
  abstract =     "The computational demand of exact-search procedures
                 has pressed the exploitation of parallel processing
                 accelerators to reduce the execution time of many
                 applications. However, this often imposes strict
                 restrictions in terms of the problem size and
                 implementation efforts, mainly due to their possibly
                 distinct architectures. To circumvent this limitation,
                 a new exact-search alignment tool BowMapCL based on the
                 Burrows--Wheeler Transform and FM-Index is presented.
                 Contrasting to other alternatives, BowMapCL is based on
                 a unified implementation using OpenCL, allowing the
                 exploitation of multiple and possibly different devices
                 e.g., NVIDIA, AMD/ATI, and Intel GPUs/APUs.
                 Furthermore, to efficiently exploit such heterogeneous
                 architectures, BowMapCL incorporates several techniques
                 to promote its performance and scalability, including
                 multiple buffering, work-queue task-distribution, and
                 dynamic load-balancing, together with index
                 partitioning, bit-encoding, and sampling. When compared
                 with state-of-the-art tools, the attained results
                 showed that BowMapCL using a single GPU is $ 2 \times $
                 to $ 7.5 \times $ faster than mainstream multi-threaded
                 CPU BWT-based aligners, like Bowtie, BWA, and SOAP2;
                 and up to $ 4 \times $ faster than the best performing
                 state-of-the-art GPU implementations namely, SOAP3 and
                 HPG-BWT. When multiple and completely distinct devices
                 are considered, BowMapCL efficiently scales the offered
                 throughput, ensuring a convenient load-balance of the
                 involved processing in the several distinct devices.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE/ACM Transactions on Computational Biology and
                 Bioinformatics",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J954",
}

%%% ACM TOPLAS article on the CDSChecker model checker for C/C++11
%%% atomics; the title's ``C\slash C++11'' uses the \slash macro so
%%% the style can break the line at the slash if needed.
@Article{Norris:2016:PAM,
  author =       "Brian Norris and Brian Demsky",
  title =        "A Practical Approach for Model Checking {C\slash
                 C++11} Code",
  journal =      j-TOPLAS,
  volume =       "38",
  number =       "3",
  pages =        "10:1--10:??",
  month =        may,
  year =         "2016",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/2806886",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Mon May 2 16:24:58 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toplas.bib",
  abstract =     "Writing low-level concurrent software has
                 traditionally required intimate knowledge of the entire
                 toolchain and often has involved coding in assembly.
                 New language standards have extended C and C++ with
                 support for low-level atomic operations and a weak
                 memory model, enabling developers to write portable and
                 efficient multithreaded code. In this article, we
                 present CDSChecker, a tool for exhaustively exploring
                 the behaviors of concurrent code under the C/C++ memory
                 model. We have used CDSChecker to exhaustively unit
                 test concurrent data structure implementations and have
                 discovered errors in a published implementation of a
                 work-stealing queue and a single producer, single
                 consumer queue.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

%%% Proceedings of the VLDB Endowment article on the TileDB array
%%% storage manager; CODEN "????" is this file's placeholder for an
%%% unknown CODEN.
@Article{Papadopoulos:2016:TAD,
  author =       "Stavros Papadopoulos and Kushal Datta and Samuel
                 Madden and Timothy Mattson",
  title =        "The {TileDB} array data storage manager",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "10",
  number =       "4",
  pages =        "349--360",
  month =        nov,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3025111.3025117",
  ISSN =         "2150-8097",
  bibdate =      "Sat Feb 25 09:01:51 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "We present a novel storage manager for
                 multi-dimensional arrays that arise in scientific
                 applications, which is part of a larger scientific data
                 management system called TileDB. In contrast to
                 existing solutions, TileDB is optimized for both dense
                 and sparse arrays. Its key idea is to organize array
                 elements into ordered collections called fragments.
                 Each fragment is dense or sparse, and groups contiguous
                 array elements into data tiles of fixed capacity. The
                 organization into fragments turns random writes into
                 sequential writes, and, coupled with a novel read
                 algorithm, leads to very efficient reads. TileDB
                 enables parallelization via multi-threading and
                 multi-processing, offering thread-/process-safety and
                 atomicity via lightweight locking. We show that TileDB
                 delivers comparable performance to the HDF5 dense array
                 storage manager, while providing much faster random
                 writes. We also show that TileDB offers substantially
                 faster reads and writes than the SciDB array database
                 system with both dense and sparse arrays. Finally, we
                 demonstrate that TileDB is considerably faster than
                 adaptations of the Vertica relational column-store for
                 dense array storage management, and at least as fast
                 for the case of sparse arrays.",
  acknowledgement = ack-nhfb,
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% ACM TACO article on concurrent JavaScript parsing; pages use the
%%% article-number form 41:1--41:?? with the final page unknown,
%%% matching the articleno field below.
@Article{Park:2016:CJP,
  author =       "Hyukwoo Park and Myungsu Cha and Soo-Mook Moon",
  title =        "Concurrent {JavaScript} Parsing for Faster Loading of
                 {Web} Apps",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "41:1--41:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3004281",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "JavaScript is a dynamic language mainly used as a
                 client-side web script. Nowadays, web is evolving into
                 an application platform with its web apps, and
                 JavaScript increasingly undertakes complex computations
                 and interactive user interfaces, requiring a
                 high-performance JavaScript engine. There have been
                 many optimizations for efficient JavaScript engines,
                 but one component that has not been optimized much is
                 JavaScript parsing. A JavaScript function needs to be
                 parsed before being executed, and the parsing overhead
                 takes a substantial portion of JavaScript execution
                 time for web apps, especially during app loading. This
                 article proposes concurrent parsing of JavaScript,
                 which performs the parsing of JavaScript functions in
                 advance on different threads, while the main thread is
                 executing the parsed JavaScript functions. This can
                 hide the parsing overhead from the main execution
                 thread, reducing the JavaScript execution time, thus
                 reducing the overall app loading time. More
                 specifically, we separated JavaScript parsing and made
                 it run on different threads without violating the
                 execution semantics of JavaScript. We also designed an
                 efficient multi-threaded parsing architecture, which
                 reduces the synchronization overhead and schedules the
                 parsing requests appropriately. Finally, we explored
                 two methods of choosing the target functions for
                 concurrent parsing: one based on profiled information
                 and the other based on speculative heuristics. We
                 performed experiments on the WebKit browser with the
                 JSC engine for real web apps. The result shows that the
                 proposed concurrent parsing can improve the JavaScript
                 performance during app loading by as much as 64\% and
                 by 39.7\% on average. This improves the whole app
                 loading performance tangibly, by as much as 32.7\% and
                 by 18.2\%, on average.",
  acknowledgement = ack-nhfb,
  articleno =    "41",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% ACM TACO article on the Tumbler load balancer; the abstract uses
%%% the \trademark macro after ``Oracle Solaris 11'', which must be
%%% defined by the consuming style or preamble.
@Article{Pusukuri:2016:TEL,
  author =       "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N.
                 Bhuyan",
  title =        "{Tumbler}: an Effective Load-Balancing Technique for
                 Multi-{CPU} Multicore Systems",
  journal =      j-TACO,
  volume =       "12",
  number =       "4",
  pages =        "36:1--36:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2827698",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Feb 16 15:36:38 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Schedulers used by modern OSs (e.g., Oracle Solaris
                 11{\trademark} and GNU/Linux) balance load by balancing
                 the number of threads in run queues of different cores.
                 While this approach is effective for a single CPU
                 multicore system, we show that it can lead to a
                 significant load imbalance across CPUs of a multi-CPU
                 multicore system. Because different threads of a
                 multithreaded application often exhibit different
                 levels of CPU utilization, load cannot be measured in
                 terms of the number of threads alone. We propose
                 Tumbler that migrates the threads of a multithreaded
                 program across multiple CPUs to balance the load across
                 the CPUs. While Tumbler distributes the threads equally
                 across the CPUs, its assignment of threads to CPUs is
                 aimed at minimizing the variation in utilization of
                 different CPUs to achieve load balance. We evaluated
                 Tumbler using a wide variety of 35 multithreaded
                 applications, and our experimental results show that
                 Tumbler outperforms both Oracle Solaris 11{\trademark}
                 and GNU/Linux.",
  acknowledgement = ack-nhfb,
  articleno =    "36",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% ACM SIGPLAN Notices article; per the remark field this issue
%%% reprints the VEE '16 conference proceedings, hence the
%%% combined-form DOI 10.1145/3007611.2892248.
@Article{Qian:2016:EFS,
  author =       "Junjie Qian and Witawas Srisa-an and Sharad Seth and
                 Hong Jiang and Du Li and Pan Yi",
  title =        "Exploiting {FIFO} Scheduler to Improve Parallel
                 Garbage Collection Performance",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "7",
  pages =        "109--121",
  month =        jul,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3007611.2892248",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:12 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Recent studies have found that parallel garbage
                 collection performs worse with more CPUs and more
                 collector threads. As part of this work, we further
                 investigate this phenomenon and find that poor
                 scalability is worst in highly scalable Java
                 applications. Our investigation to find the causes
                 clearly reveals that efficient multi-threading in an
                 application can prolong the average object lifespan,
                 which results in less effective garbage collection. We
                 also find that prolonging lifespan is the direct result
                 of Linux's Completely Fair Scheduler due to its
                 round-robin like behavior that can increase the heap
                 contention between the application threads. Instead, if
                 we use pseudo first-in-first-out to schedule
                 application threads in large multicore systems, the
                 garbage collection scalability is significantly
                 improved while the time spent in garbage collection is
                 reduced by as much as 21\%. The average execution time
                 of the 24 Java applications used in our study is also
                 reduced by 11\%. Based on this observation, we propose
                 two approaches to optimally select scheduling policies
                 based on application scalability profile. Our first
                 approach uses the profile information from one
                 execution to tune the subsequent executions. Our second
                 approach dynamically collects profile information and
                 performs policy selection during execution.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "VEE '16 conference proceedings.",
}

%%% ACM SIGPLAN Notices article (PPoPP '16 proceedings, per the
%%% remark field).  NOTE(review): the abstract contains bare
%%% parenthesized numbers such as ``(13)'' and ``(5)'' that look
%%% like bracketed-citation residue from text extraction of the
%%% source abstract; left as transcribed --- verify against the
%%% publisher's copy before ``fixing''.
@Article{Qian:2016:ODG,
  author =       "Xuehai Qian and Koushik Sen and Paul Hargrove and
                 Costin Iancu",
  title =        "{OPR}: deterministic group replay for one-sided
                 communication",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "8",
  pages =        "47:1--47:??",
  month =        aug,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3016078.2851179",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:12 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "The ability to reproduce a parallel execution is
                 desirable for debugging and program reliability
                 purposes. In debugging (13), the programmer needs to
                 manually step back in time, while for resilience (6)
                 this is automatically performed by the application upon
                 failure. To be useful, replay has to faithfully
                 reproduce the original execution. For parallel programs
                 the main challenge is inferring and maintaining the
                 order of conflicting operations (data races).
                 Deterministic record and replay (R{\&}R) techniques
                 have been developed for multithreaded shared memory
                 programs (5), as well as distributed memory programs
                 (14). Our main interest is techniques for large scale
                 scientific (3; 4) programming models.",
  acknowledgement = ack-nhfb,
  articleno =    "47",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '16 conference proceedings.",
}

%%% IEEE Transactions on Computers article; month "????" is this
%%% file's placeholder for an unknown month.  NOTE(review): authors
%%% are recorded initials-only and without diacritics (e.g.,
%%% Radojkovic, Moreto, Verdu), unlike the full, accented names used
%%% elsewhere in this file --- verify against the IEEE record and
%%% expand if possible.
@Article{Radojkovic:2016:TAM,
  author =       "P. Radojkovic and P. M. Carpenter and M. Moreto and V.
                 Cakarevic and J. Verdu and A. Pajuelo and F. J. Cazorla
                 and M. Nemirovsky and M. Valero",
  title =        "Thread Assignment in Multicore\slash Multithreaded
                 Processors: A Statistical Approach",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "65",
  number =       "1",
  pages =        "256--269",
  month =        "????",
  year =         "2016",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2015.2417533",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Tue Dec 15 09:36:24 MST 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

%%% ACM SIGPLAN Notices article; per the remark field this issue
%%% reprints the OOPSLA '16 proceedings, hence the combined-form
%%% DOI 10.1145/3022671.2984040.
@Article{Samak:2016:DSF,
  author =       "Malavika Samak and Omer Tripp and Murali Krishna
                 Ramanathan",
  title =        "Directed synthesis of failing concurrent executions",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "10",
  pages =        "430--446",
  month =        oct,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3022671.2984040",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:13 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Detecting concurrency-induced bugs in multithreaded
                 libraries can be challenging due to the intricacies
                 associated with their manifestation. This includes
                 invocation of multiple methods, synthesis of inputs to
                 the methods to reach the failing location, and crafting
                 of thread interleavings that cause the erroneous
                 behavior. Neither fuzzing-based testing techniques nor
                 over-approximate static analyses are well positioned to
                 detect such subtle defects while retaining high
                 accuracy alongside satisfactory coverage. In this
                 paper, we propose a directed, iterative and scalable
                 testing engine that combines the strengths of static
                 and dynamic analysis to help synthesize concurrent
                 executions to expose complex concurrency-induced bugs.
                 Our engine accepts as input the library, its client
                 (either sequential or concurrent) and a specification
                 of correctness. Then, it iteratively refines the client
                 to generate an execution that can break the input
                 specification. Each step of the iterative process
                 includes statically identifying sub-goals towards the
                 goal of failing the specification, generating a plan
                 toward meeting these goals, and merging of the paths
                 traversed dynamically with the plan computed statically
                 via constraint solving to generate a new client. The
                 engine reports full reproduction scenarios, guaranteed
                 to be true, for the bugs it finds. We have created a
                 prototype of our approach named MINION. We validated
                 MINION by applying it to well-tested concurrent classes
                 from popular Java libraries, including the latest
                 versions of OpenJDK and Google-Guava. We were able to
                 detect 31 real crashes across 10 classes in a total of
                 23 minutes, including previously unknown bugs.
                 Comparison with three other tools reveals that
                 combined, they report only 9 of the 31 crashes (and no
                 other crashes beyond MINION). This is because several
                 of these bugs manifest under deeply nested path
                 conditions (observed maximum of 11), deep nesting of
                 method invocations (observed maximum of 6) and multiple
                 refinement iterations to generate the crash-inducing
                 client.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "OOPSLA '16 conference proceedings.",
}

@Article{Sleiman:2016:ESO,
  author =       "Faissal M. Sleiman and Thomas F. Wenisch",
  title =        "Efficiently scaling out-of-order cores for
                 simultaneous multithreading",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "431--443",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001183",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Simultaneous multithreading (SMT) out-of-order cores
                 waste a significant portion of structural out-of-order
                 core resources on instructions that do not need them.
                 These resources eliminate false ordering dependences.
                 However, because thread interleaving spreads dependent
                 instructions, nearly half of instructions dynamically
                 issue in program order after all false dependences have
                 resolved. These in-sequence instructions interleave
                 with other reordered instructions at a fine granularity
                 within the instruction window. We develop a technique
                 to efficiently scale in-flight instructions through a
                 hybrid out-of-order/in-order microarchitecture, which
                 can dispatch instructions to efficient in-order
                 scheduling mechanisms---using a FIFO issue queue called
                 the shelf ---on an instruction-by-instruction basis.
                 Instructions dispatched to the shelf do not allocate
                 out-of-order core resources in the reorder buffer,
                 issue queue, physical registers, or load-store queues.
                 We measure opportunity for such hybrid
                 microarchitectures and design and evaluate a practical
                 dispatch mechanism targeted at 4-threaded cores. Adding
                 a shelf to a baseline 4-thread system with 64-entry ROB
                 improves normalized system throughput by 11.5\% (up to
                 19.2\% at best) and energy-delay product by 10.9\% (up
                 to 17.5\% at best).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Tian:2016:ETR,
  author =       "Zhenzhou Tian and Ting Liu and Qinghua Zheng and Ming
                 Fan and Eryue Zhuang and Zijiang Yang",
  title =        "Exploiting thread-related system calls for plagiarism
                 detection of multithreaded programs",
  journal =      j-J-SYST-SOFTW,
  volume =       "119",
  number =       "??",
  pages =        "136--148",
  month =        sep,
  year =         "2016",
  CODEN =        "JSSODM",
  ISSN =         "0164-1212 (print), 1873-1228 (electronic)",
  ISSN-L =       "0164-1212",
  bibdate =      "Sat Jul 16 18:10:04 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jsystsoftw.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0164121216300838",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Systems and Software",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01641212/",
}

@Article{Vale:2016:PDT,
  author =       "Tiago M. Vale and Jo{\~a}o A. Silva and Ricardo J.
                 Dias and Jo{\~a}o M. Louren{\c{c}}o",
  title =        "{Pot}: Deterministic Transactional Execution",
  journal =      j-TACO,
  volume =       "13",
  number =       "4",
  pages =        "52:1--52:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3017993",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Dec 28 16:24:46 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article presents Pot, a system that leverages the
                 concept of preordered transactions to achieve
                 deterministic multithreaded execution of programs that
                 use Transactional Memory. Preordered transactions
                 eliminate the root cause of nondeterminism in
                 transactional execution: they provide the illusion of
                 executing in a deterministic serial order, unlike
                 traditional transactions that appear to execute in a
                 nondeterministic order that can change from execution
                 to execution. Pot uses a new concurrency control
                 protocol that exploits the serialization order to
                 distinguish between fast and speculative transaction
                 execution modes in order to mitigate the overhead of
                 imposing a deterministic order. We build two Pot
                 prototypes: one using STM and another using
                 off-the-shelf HTM. To the best of our knowledge, Pot
                 enables deterministic execution of programs using
                 off-the-shelf HTM for the first time. An experimental
                 evaluation shows that Pot achieves deterministic
                 execution of TM programs with low overhead, sometimes
                 even outperforming nondeterministic executions, and
                 clearly outperforming the state of the art.",
  acknowledgement = ack-nhfb,
  articleno =    "52",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{VanZee:2016:BFE,
  author =       "Field G. {Van Zee} and Tyler M. Smith and Bryan Marker
                 and Tze Meng Low and Robert A. {Van De Geijn} and
                 Francisco D. Igual and Mikhail Smelyanskiy and Xianyi
                 Zhang and Michael Kistler and Vernon Austel and John A.
                 Gunnels and Lee Killough",
  title =        "The {BLIS} Framework: Experiments in Portability",
  journal =      j-TOMS,
  volume =       "42",
  number =       "2",
  pages =        "12:1--12:19",
  month =        jun,
  year =         "2016",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/2755561",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Fri Jun 3 18:52:21 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toms.bib",
  abstract =     "BLIS is a new software framework for instantiating
                 high-performance BLAS-like dense linear algebra
                 libraries. We demonstrate how BLIS acts as a
                 productivity multiplier by using it to implement the
                 level-3 BLAS on a variety of current architectures. The
                 systems for which we demonstrate the framework include
                 state-of-the-art general-purpose, low-power, and
                 many-core architectures. We show, with very little
                 effort, how the BLIS framework yields sequential and
                 parallel implementations that are competitive with the
                 performance of ATLAS, OpenBLAS (an effort to maintain
                 and extend the GotoBLAS), and commercial vendor
                 implementations such as AMD's ACML, IBM's ESSL, and
                 Intel's MKL libraries. Although most of this article
                 focuses on single-core implementation, we also provide
                 compelling results that suggest the framework's
                 leverage extends to the multithreaded domain.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Verdu:2016:PSA,
  author =       "Javier Verdu and Alex Pajuelo",
  title =        "Performance Scalability Analysis of {JavaScript}
                 Applications with {Web Workers}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "105--108",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2494585",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Web applications are getting closer to the performance
                 of native applications taking advantage of new
                 standard-based technologies. The recent HTML5 standard
                 includes, among others, the Web Workers API that allows
                 executing JavaScript applications on multiple threads,
                 or workers. However, the internals of the browser's
                 JavaScript virtual machine does not expose direct
                 relation between workers and running threads in the
                 browser and the utilization of logical cores in the
                 processor. As a result, developers do not know how
                 performance actually scales on different environments
                 and therefore what is the optimal number of workers on
                 parallel JavaScript codes. This paper presents the
                 first performance scalability analysis of parallel web
                 apps with multiple workers. We focus on two case
                 studies representative of different worker execution
                 models. Our analyses show performance scaling on
                 different parallel processor microarchitectures and on
                 three major web browsers in the market. Besides, we
                 study the impact of co-running applications on the web
                 app performance. The results provide insights for
                 future approaches to automatically find out the optimal
                 number of workers that provide the best tradeoff
                 between performance and resource usage to preserve
                 system responsiveness and user experience, especially
                 on environments with unexpected changes on system
                 workload.",
  acknowledgement = ack-nhfb,
  affiliation =  "Verdu, J (Reprint Author), BarcelonaTECH UPC, Dept
                 Comp Architecture, Barcelona, Spain. Verdu, Javier;
                 Pajuelo, Alex, BarcelonaTECH UPC, Dept Comp
                 Architecture, Barcelona, Spain.",
  author-email = "jverdu@ac.upc.edu mpajuelo@ac.upc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Spanish Ministry of Economy and
                 Competitiveness (MINECO) [TIN2012-34557]",
  funding-text = "This work has been supported by the Spanish Ministry
                 of Economy and Competitiveness (MINECO) under contract
                 TIN2012-34557.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "HTML5; javascript; multithreading; parallelism; web
                 apps; web workers",
  number-of-cited-references = "12",
  oa =           "Green Published",
  ORCID-numbers = "Pajuelo, Alex/0000-0002-5510-6860 Verdu Mula,
                 Javier/0000-0003-4485-2419",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Verdu:2016:PSA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Yao:2016:OCO,
  author =       "Yuan Yao and Zhonghai Lu",
  title =        "Opportunistic competition overhead reduction for
                 expediting critical section in {NoC} based {CMPs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "279--290",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001167",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "With the degree of parallelism increasing, performance
                 of multi-threaded shared variable applications is not
                 only limited by serialized critical section execution,
                 but also by the serialized competition overhead for
                 threads to get access to critical section. As the
                 number of concurrent threads grows, such competition
                 overhead may exceed the time spent in critical section
                 itself, and become the dominating factor limiting the
                 performance of parallel applications. In modern
                 operating systems, queue spinlock, which comprises a
                 low-overhead spinning phase and a high-overhead
                 sleeping phase, is often used to lock critical
                 sections. In the paper, we show that this advanced
                 locking solution may create very high competition
                 overhead for multithreaded applications executing in
                 NoC-based CMPs. Then we propose a software-hardware
                 cooperative mechanism that can opportunistically
                 maximize the chance that a thread wins the critical
                 section access in the low-overhead spinning phase,
                 thereby reducing the competition overhead. At the OS
                 primitives level, we monitor the remaining times of
                 retry (RTR) in a thread's spinning phase, which
                 reflects in how long the thread must enter into the
                 high-overhead sleep mode. At the hardware level, we
                 integrate the RTR information into the packets of
                 locking requests, and let the NoC prioritize locking
                 request packets according to the RTR information. The
                 principle is that the smaller RTR a locking request
                 packet carries, the higher priority it gets and thus
                 quicker delivery. We evaluate our opportunistic
                 competition overhead reduction technique with
                 cycle-accurate full-system simulations in GEM5 using
                 PARSEC (11 programs) and SPEC OMP2012 (14 programs)
                 benchmarks. Compared to the original queue spinlock
                 implementation, experimental results show that our
                 method can effectively increase the opportunity of
                 threads entering the critical section in low-overhead
                 spinning phase, reducing the competition overhead
                 averagely by 39.9\% (maximally by 61.8\%) and
                 accelerating the execution of the Region-of-Interest
                 averagely by 14.4\% (maximally by 24.5\%) across all 25
                 benchmark programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Yiapanis:2016:CDS,
  author =       "Paraskevas Yiapanis and Gavin Brown and Mikel
                 Luj{\'a}n",
  title =        "Compiler-Driven Software Speculation for Thread-Level
                 Parallelism",
  journal =      j-TOPLAS,
  volume =       "38",
  number =       "2",
  pages =        "5:1--5:??",
  month =        jan,
  year =         "2016",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/2821505",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Tue Jan 5 16:31:06 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toplas.bib",
  abstract =     "Current parallelizing compilers can tackle
                 applications exercising regular access patterns on
                 arrays or affine indices, where data dependencies can
                 be expressed in a linear form. Unfortunately, there are
                 cases that independence between statements of code
                 cannot be guaranteed and thus the compiler
                 conservatively produces sequential code. Programs that
                 involve extensive pointer use, irregular access
                 patterns, and loops with unknown number of iterations
                 are examples of such cases. This limits the extraction
                 of parallelism in cases where dependencies are rarely
                 or never triggered at runtime. Speculative parallelism
                 refers to methods employed during program execution
                 that aim to produce a valid parallel execution schedule
                 for programs immune to static parallelization. The
                 motivation for this article is to review recent
                 developments in the area of compiler-driven software
                 speculation for thread-level parallelism and how they
                 came about. The article is divided into two parts. In
                 the first part the fundamentals of speculative
                 parallelization for thread-level parallelism are
                 explained along with a design choice categorization for
                 implementing such systems. Design choices include the
                 ways speculative data is handled, how data dependence
                 violations are detected and resolved, how the correct
                 data are made visible to other threads, or how
                 speculative threads are scheduled. The second part is
                 structured around those design choices providing the
                 advances and trends in the literature with reference to
                 key developments in the area. Although the focus of the
                 article is in software speculative parallelization, a
                 section is dedicated for providing the interested
                 reader with pointers and references for exploring
                 similar topics such as hardware thread-level
                 speculation, transactional memory, and automatic
                 parallelization.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

@Article{Yu:2016:DLR,
  author =       "Hairong Yu and Guohui Li and Jianjun Li and Lihchyun
                 Shu",
  title =        "{DO$_{\rm cyclical}$}: a Latency-Resistant Cyclic
                 Multi-Threading Approach for Automatic Program
                 Parallelization",
  journal =      j-COMP-J,
  volume =       "59",
  number =       "8",
  pages =        "1155--1173",
  month =        aug,
  year =         "2016",
  CODEN =        "CMPJA6",
  DOI =          "https://doi.org/10.1093/comjnl/bxv125",
  ISSN =         "0010-4620 (print), 1460-2067 (electronic)",
  ISSN-L =       "0010-4620",
  bibdate =      "Tue Aug 30 07:10:50 MDT 2016",
  bibsource =    "http://comjnl.oxfordjournals.org/content/59/8.toc;
                 https://www.math.utah.edu/pub/tex/bib/compj2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://comjnl.oxfordjournals.org/content/59/8/1155",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Journal",
  journal-URL =  "http://comjnl.oxfordjournals.org/",
  onlinedate =   "January 14, 2016",
}

@Article{Zhang:2016:SAN,
  author =       "Mingzhe Zhang and Francis C. M. Lau and Cho-Li Wang
                 and Luwei Cheng and Haibo Chen",
  title =        "Scalable adaptive {NUMA}-aware lock: combining local
                 locking and remote locking for efficient concurrency",
  journal =      j-SIGPLAN,
  volume =       "51",
  number =       "8",
  pages =        "50:1--50:??",
  month =        aug,
  year =         "2016",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3016078.2851176",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:12 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Scalable locking is a key building block for scalable
                 multi-threaded software. Its performance is especially
                 critical in multi-socket, multi-core machines with
                 non-uniform memory access (NUMA). Previous schemes such
                 as local locking and remote locking only perform well
                 under a certain level of contention, and often require
                 non-trivial tuning for a particular configuration.
                 Besides, for large NUMA systems, because of unmanaged
                 lock server's nomination, current distance-first NUMA
                 policies cannot perform satisfactorily. In this work,
                 we propose SANL, a locking scheme that can deliver high
                 performance under various contention levels by
                 adaptively switching between the local and the remote
                 lock scheme. Furthermore, we introduce a new NUMA
                 policy for the remote lock that jointly considers node
                 distances and server utilization when choosing lock
                 servers. A comparison with seven representative locking
                 schemes shows that SANL outperforms the others in most
                 contention situations. In one group test, SANL is 3.7
                 times faster than RCL lock and 17 times faster than
                 POSIX mutex.",
  acknowledgement = ack-nhfb,
  articleno =    "50",
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '16 conference proceedings.",
}

@Article{Zhang:2016:TED,
  author =       "Tong Zhang and Dongyoon Lee and Changhee Jung",
  title =        "{TxRace}: Efficient Data Race Detection Using
                 Commodity Hardware Transactional Memory",
  journal =      j-OPER-SYS-REV,
  volume =       "50",
  number =       "2",
  pages =        "159--173",
  month =        jun,
  year =         "2016",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/2954680.2872384",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Thu Jun 9 17:03:34 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/opersysrev.bib",
  abstract =     "Detecting data races is important for debugging
                 shared-memory multithreaded programs, but the high
                 runtime overhead prevents the wide use of dynamic data
                 race detectors. This paper presents TxRace, a new
                 software data race detector that leverages commodity
                 hardware transactional memory (HTM) to speed up data
                 race detection. TxRace instruments a multithreaded
                 program to transform synchronization-free regions into
                 transactions, and exploits the conflict detection
                 mechanism of HTM for lightweight data race detection at
                 runtime. However, the limitations of the current
                 best-effort commodity HTMs expose several challenges in
                 using them for data race detection: (1) lack of ability
                 to pinpoint racy instructions, (2) false positives
                 caused by cache line granularity of conflict detection,
                 and (3) transactional aborts for non-conflict reasons
                 (e.g., capacity or unknown). To overcome these
                 challenges, TxRace performs lightweight HTM-based data
                 race detection at first, and occasionally switches to
                 slow yet precise data race detection only for the small
                 fraction of execution intervals in which potential
                 races are reported by HTM. According to the
                 experimental results, TxRace reduces the average
                 runtime overhead of dynamic data race detection from
                 11.68x to 4.65x with only a small number of false
                 negatives.",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J597",
}

@Article{Areias:2017:SDP,
  author =       "Miguel Areias and Ricardo Rocha",
  title =        "On scaling dynamic programming problems with a
                 multithreaded tabling {Prolog} system",
  journal =      j-J-SYST-SOFTW,
  volume =       "125",
  number =       "??",
  pages =        "417--426",
  month =        mar,
  year =         "2017",
  CODEN =        "JSSODM",
  ISSN =         "0164-1212 (print), 1873-1228 (electronic)",
  ISSN-L =       "0164-1212",
  bibdate =      "Sat Feb 4 12:20:39 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jsystsoftw.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0164121216300929",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Systems and Software",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01641212/",
}

@Article{Arteaga:2017:GFG,
  author =       "Jaime Arteaga and St{\'e}phane Zuckerman and Guang R.
                 Gao",
  title =        "Generating Fine-Grain Multithreaded Applications Using
                 a Multigrain Approach",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "47:1--47:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3155288",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The recent evolution in hardware landscape, aimed at
                 producing high-performance computing systems capable of
                 reaching extreme-scale performance, has reignited the
                 interest in fine-grain multithreading, particularly at
                 the intranode level. Indeed, popular parallel
                 programming environments, such as OpenMP, which
                 features a simple interface for the parallelization of
                 programs, are now incorporating fine-grain constructs.
                 However, since coarse-grain directives are still
                 heavily used, the OpenMP runtime is forced to support
                 both coarse- and fine-grain models of execution,
                 potentially reducing the advantages obtained when
                 executing an application in a fully fine-grain
                 environment. To evaluate the type of applications that
                 benefit from executing in a unified fine-grain program
                 execution model, this article presents a multigrain
                 parallel programming environment for the generation of
                 fine-grain multithreaded applications from programs
                 featuring OpenMP's API, allowing OpenMP programs to be
                 run on top of a fine-grain event-driven program
                 execution model. Experimental results with five
                 scientific benchmarks show that fine-grain
                 applications, generated by and run on our environment
                 with two runtimes implementing a fine-grain
                 event-driven program execution model, are competitive
                 and can outperform their OpenMP counterparts,
                 especially for data-intensive workloads with irregular
                 and dynamic parallelism, reaching speedups as high as
                 2.6$ \times $ for Graph500 and 51$ \times $ for NAS
                 Data Cube.",
  acknowledgement = ack-nhfb,
  articleno =    "47",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Bender:2017:TLM,
  author =       "Michael A. Bender and Jonathan W. Berry and Simon D.
                 Hammond and K. Scott Hemmert and Samuel McCauley and
                 Branden Moore and Benjamin Moseley and Cynthia A.
                 Phillips and David Resnick and Arun Rodrigues",
  title =        "Two-level main memory co-design: Multi-threaded
                 algorithmic primitives, analysis, and simulation",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "102",
  number =       "??",
  pages =        "213--228",
  month =        apr,
  year =         "2017",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Wed Jan 25 14:20:18 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S074373151630185X",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315/",
}

@Book{Blandy:2017:PR,
  author =       "Jim Blandy and Jason Orendorff",
  title =        "Programming {Rust}",
  publisher =    pub-ORA-MEDIA,
  address =      pub-ORA-MEDIA:adr,
  pages =        "xx + 598",
  year =         "2017",
  ISBN =         "1-4919-2728-3 (paperback), 1-4919-2727-5,
                 1-4919-2723-2 (e-book), 1-4919-2725-9 (e-book)",
  ISBN-13 =      "978-1-4919-2728-1 (paperback), 978-1-4919-2727-4,
                 978-1-4919-2723-6 (e-book), 978-1-4919-2725-0
                 (e-book)",
  LCCN =         "QA76.73.R88 B53 2017",
  bibdate =      "Mon Dec 9 15:37:10 MST 2019",
  bibsource =    "fsz3950.oclc.org:210/WorldCat;
                 https://www.math.utah.edu/pub/tex/bib/master.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/ora.bib",
  URL =          "http://proquest.safaribooksonline.com/9781491927274",
  abstract =     "Rust is a new systems programming language that
                 combines the performance and low-level control of C and
                 C++ with memory safety and thread safety. Rust's
                 modern, flexible types ensure your program is free of
                 null pointer dereferences, double frees, dangling
                 pointers, and similar bugs, all at compile time,
                 without runtime overhead. In multithreaded code, Rust
                 catches data races at compile time, making concurrency
                 much easier to use. Written by two experienced systems
                 programmers, this book explains how Rust manages to
                 bridge the gap between performance and safety, and how
                 you can take advantage of it. Topics include: How Rust
                 represents values in memory (with diagrams) Complete
                 explanations of ownership, moves, borrows, and
                 lifetimes Cargo, rustdoc, unit tests, and how to
                 publish your code on crates.io, Rust's public package
                 repository High-level features like generic code,
                 closures, collections, and iterators that make Rust
                 productive and flexible Concurrency in Rust: threads,
                 mutexes, channels, and atomics, all much safer to use
                 than in C or C++ Unsafe code, and how to preserve the
                 integrity of ordinary code that uses it. Extended
                 examples illustrating how pieces of the language fit
                 together.",
  acknowledgement = ack-nhfb,
  libnote =      "Not in my library.",
  subject =      "UNIX (Computer file); UNIX (Computer file); C
                 (Computer program language); Text editors (Computer
                 programs); Software engineering; C (Computer program
                 language); Software engineering.; Text editors
                 (Computer programs)",
  tableofcontents = "Preface \\
                 Who Should Read This Book \\
                 Why We Wrote This Book \\
                 Navigating This Book \\
                 Conventions Used in This Book \\
                 Using Code Examples \\
                 O'Reilly Safari \\
                 How to Contact Us \\
                 Acknowledgments \\
                 1. Why Rust? \\
                 Type Safety \\
                 2. A Tour of Rust \\
                 Downloading and Installing Rust \\
                 A Simple Function \\
                 Writing and Running Unit Tests \\
                 Handling Command-Line Arguments \\
                 A Simple Web Server \\
                 Concurrency \\
                 What the Mandelbrot Set Actually Is \\
                 Parsing Pair Command-Line Arguments \\
                 Mapping from Pixels to Complex Numbers \\
                 Plotting the Set \\
                 Writing Image Files \\
                 A Concurrent Mandelbrot Program \\
                 Running the Mandelbrot Plotter \\
                 Safety Is Invisible \\
                 3. Basic Types \\
                 Machine Types \\
                 Integer Types \\
                 Floating-Point Types \\
                 The bool Type \\
                 Characters \\
                 Tuples \\
                 Pointer Types \\
                 References \\
                 Boxes \\
                 Raw Pointers \\
                 Arrays, Vectors, and Slices \\
                 Arrays \\
                 Vectors \\
                 Slices \\
                 String Types \\
                 String Literals \\
                 Byte Strings \\
                 Strings in Memory \\
                 String \\
                 Using Strings \\
                 Other String-Like Types \\
                 Beyond the Basics \\
                 4. Ownership \\
                 Ownership \\
                 Moves \\
                 More Operations That Move \\
                 Moves and Control Flow \\
                 Moves and Indexed Content \\
                 Copy Types: The Exception to Moves \\
                 Rc and Arc: Shared Ownership \\
                 5. References \\
                 References as Values \\
                 Rust References Versus C++ References \\
                 Assigning References \\
                 References to References \\
                 Comparing References \\
                 References Are Never Null \\
                 Borrowing References to Arbitrary Expressions \\
                 References to Slices and Trait Objects \\
                 Reference Safety \\
                 Borrowing a Local Variable \\
                 Receiving References as Parameters \\
                 Passing References as Arguments \\
                 Returning References \\
                 Structs Containing References \\
                 Distinct Lifetime Parameters \\
                 Omitting Lifetime Parameters \\
                 Sharing Versus Mutation \\
                 Taking Arms Against a Sea of Objects \\
                 6. Expressions \\
                 An Expression Language \\
                 Blocks and Semicolons \\
                 Declarations \\
                 if and match \\
                 if let \\
                 Loops \\
                 return Expressions \\
                 Why Rust Has loop \\
                 Function and Method Calls \\
                 Fields and Elements \\
                 Reference Operators \\
                 Arithmetic, Bitwise, Comparison, and Logical Operators
                 \\
                 Assignment \\
                 Type Casts \\
                 Closures \\
                 Precedence and Associativity \\
                 Onward \\
                 7. Error Handling \\
                 Panic \\
                 Unwinding \\
                 Aborting \\
                 Result \\
                 Catching Errors \\
                 Result Type Aliases \\
                 Printing Errors \\
                 Propagating Errors \\
                 Working with Multiple Error Types \\
                 Dealing with Errors That Can't Happen \\
                 Ignoring Errors \\
                 Handling Errors in main() \\
                 Declaring a Custom Error Type \\
                 Why Results? \\
                 8. Crates and Modules \\
                 Crates \\
                 Build Profiles \\
                 Modules \\
                 Modules in Separate Files \\
                 Paths and Imports \\
                 The Standard Prelude \\
                 Items, the Building Blocks of Rust \\
                 Turning a Program into a Library \\
                 The src/bin Directory \\
                 Attributes \\
                 Tests and Documentation \\
                 Integration Tests \\
                 Documentation \\
                 Doc-Tests \\
                 Specifying Dependencies \\
                 Versions \\
                 Cargo.lock \\
                 Publishing Crates to crates.io \\
                 Workspaces \\
                 More Nice Things \\
                 9. Structs \\
                 Named-Field Structs \\
                 Tuple-Like Structs \\
                 Unit-Like Structs \\
                 Struct Layout \\
                 Defining Methods with impl \\
                 Generic Structs \\
                 Structs with Lifetime Parameters \\
                 Deriving Common Traits for Struct Types \\
                 Interior Mutability \\
                 10. Enums and Patterns \\
                 Enums \\
                 Enums with Data \\
                 Enums in Memory \\
                 Rich Data Structures Using Enums \\
                 Generic Enums \\
                 Patterns \\
                 Literals, Variables, and Wildcards in Patterns \\
                 Tuple and Struct Patterns \\
                 Reference Patterns \\
                 Matching Multiple Possibilities \\
                 Pattern Guards \\
                 @ patterns \\
                 Where Patterns Are Allowed \\
                 Populating a Binary Tree \\
                 The Big Picture \\
                 11. Traits and Generics \\
                 Using Traits \\
                 Trait Objects \\
                 Trait Object Layout \\
                 Generic Functions \\
                 Which to Use \\
                 Defining and Implementing Traits \\
                 Default Methods \\
                 Traits and Other People's Types \\
                 Self in Traits \\
                 Subtraits \\
                 Static Methods \\
                 Fully Qualified Method Calls \\
                 Traits That Define Relationships Between Types \\
                 Associated Types (or How Iterators Work) \\
                 Generic Traits (or How Operator Overloading Works) \\
                 Buddy Traits (or How rand::random() Works) \\
                 Reverse-Engineering Bounds \\
                 Conclusion \\
                 12. Operator Overloading \\
                 Arithmetic and Bitwise Operators \\
                 Unary Operators \\
                 Binary Operators \\
                 Compound Assignment Operators \\
                 Equality Tests \\
                 Ordered Comparisons \\
                 Index and IndexMut \\
                 Other Operators \\
                 13. Utility Traits \\
                 Drop \\
                 Sized \\
                 Clone \\
                 Copy \\
                 Deref and DerefMut \\
                 Default \\
                 AsRef and AsMut \\
                 Borrow and BorrowMut \\
                 From and Into \\
                 ToOwned \\
                 Borrow and ToOwned at Work: The Humble Cow \\
                 14. Closures \\
                 Capturing Variables \\
                 Closures That Borrow \\
                 Closures That Steal \\
                 Function and Closure Types \\
                 Closure Performance \\
                 Closures and Safety \\
                 Closures That Kill \\
                 FnOnce \\
                 FnMut \\
                 Callbacks \\
                 Using Closures Effectively \\
                 15. Iterators \\
                 The Iterator and IntoIterator Traits \\
                 Creating Iterators \\
                 iter and iter_mut Methods \\
                 IntoIterator Implementations \\
                 drain Methods \\
                 Other Iterator Sources \\
                 Iterator Adapters \\
                 map and filter \\
                 filter_map and flat_map \\
                 scan \\
                 take and take_while \\
                 skip and skip_while \\
                 peekable \\
                 fuse \\
                 Reversible Iterators and rev \\
                 inspect \\
                 chain \\
                 enumerate \\
                 zip \\
                 by_ref \\
                 cloned \\
                 cycle \\
                 Consuming Iterators \\
                 Simple Accumulation: count, sum, product \\
                 max, min \\
                 max_by, min_by \\
                 max_by_key, min_by_key \\
                 Comparing Item Sequences \\
                 any and all \\
                 position, rposition, and ExactSizeIterator \\
                 fold \\
                 nth \\
                 last \\
                 find \\
                 Building Collections: collect and FromIterator \\
                 The Extend Trait \\
                 partition \\
                 Implementing Your Own Iterators \\
                 16. Collections \\
                 Overview \\
                 Vec<T> \\
                 Accessing Elements \\
                 Iteration \\
                 Growing and Shrinking Vectors \\
                 Joining \\
                 Splitting \\
                 Swapping \\
                 Sorting and Searching \\
                 Comparing Slices \\
                 Random Elements \\
                 Rust Rules Out Invalidation Errors \\
                 VecDeque<T> \\
                 LinkedList<T> \\
                 BinaryHeap<T> \\
                 HashMap<K, V> and BTreeMap<K, V> \\
                 Entries \\
                 Map Iteration \\
                 HashSet<T> and BTreeSet<T> \\
                 Set Iteration \\
                 When Equal Values Are Different \\
                 Whole-Set Operations \\
                 Hashing \\
                 Using a Custom Hashing Algorithm \\
                 Beyond the Standard Collections \\
                 17. Strings and Text \\
                 Some Unicode Background \\
                 ASCII, Latin-1, and Unicode \\
                 UTF-8 \\
                 Text Directionality \\
                 Characters (char) \\
                 Classifying Characters \\
                 Handling Digits \\
                 Case Conversion for Characters \\
                 Conversions to and from Integers \\
                 String and str \\
                 Creating String Values \\
                 Simple Inspection \\
                 Appending and Inserting Text \\
                 Removing Text \\
                 Conventions for Searching and Iterating \\
                 Patterns for Searching Text \\
                 Searching and Replacing \\
                 Iterating over Text \\
                 Trimming \\
                 Case Conversion for Strings \\
                 Parsing Other Types from Strings \\
                 Converting Other Types to Strings \\
                 Borrowing as Other Text-Like Types \\
                 Accessing Text as UTF-8 \\
                 Producing Text from UTF-8 Data \\
                 Putting Off Allocation \\
                 Strings as Generic Collections \\
                 Formatting Values \\
                 Formatting Text Values \\
                 Formatting Numbers \\
                 Formatting Other Types \\
                 Formatting Values for Debugging \\
                 Formatting Pointers for Debugging \\
                 Referring to Arguments by Index or Name \\
                 Dynamic Widths and Precisions \\
                 Formatting Your Own Types \\
                 Using the Formatting Language in Your Own Code \\
                 Regular Expressions \\
                 Basic Regex Use \\
                 Building Regex Values Lazily \\
                 Normalization \\
                 Normalization Forms \\
                 The unicode-normalization Crate \\
                 18. Input and Output \\
                 Readers and Writers \\
                 Readers \\
                 Buffered Readers \\
                 Reading Lines \\
                 Collecting Lines \\
                 Writers \\
                 Files \\
                 Seeking \\
                 Other Reader and Writer Types \\
                 Binary Data, Compression, and Serialization \\
                 Files and Directories \\
                 OsStr and Path \\
                 Path and PathBuf Methods \\
                 Filesystem Access Functions \\
                 Reading Directories \\
                 Platform-Specific Features \\
                 Networking \\
                 19. Concurrency \\
                 Fork-Join Parallelism \\
                 spawn and join \\
                 Error Handling Across Threads \\
                 Sharing Immutable Data Across Threads \\
                 Rayon \\
                 Revisiting the Mandelbrot Set \\
                 Channels \\
                 Sending Values \\
                 Receiving Values \\
                 Running the Pipeline \\
                 Channel Features and Performance \\
                 Thread Safety: Send and Sync \\
                 Piping Almost Any Iterator to a Channel \\
                 Beyond Pipelines \\
                 Shared Mutable State \\
                 What Is a Mutex? \\
                 Mutex<T> \\
                 mut and Mutex \\
                 Why Mutexes Are Not Always a Good Idea \\
                 Deadlock \\
                 Poisoned Mutexes \\
                 Multi-Consumer Channels Using Mutexes \\
                 Read/Write Locks (RwLock<T>) \\
                 Condition Variables (Condvar) \\
                 Atomics \\
                 Global Variables \\
                 What Hacking Concurrent Code in Rust Is Like \\
                 20. Macros \\
                 Macro Basics \\
                 Basics of Macro Expansion \\
                 Unintended Consequences \\
                 Repetition \\
                 Built-In Macros \\
                 Debugging Macros \\
                 The json! Macro \\
                 Fragment Types \\
                 Recursion in Macros \\
                 Using Traits with Macros \\
                 Scoping and Hygiene \\
                 Importing and Exporting Macros \\
                 Avoiding Syntax Errors During Matching \\
                 Beyond macro_rules! \\
                 21. Unsafe Code \\
                 Unsafe from What? \\
                 Unsafe Blocks \\
                 Example: An Efficient ASCII String Type \\
                 Unsafe Functions \\
                 Unsafe Block or Unsafe Function? \\
                 Undefined Behavior \\
                 Unsafe Traits \\
                 Raw Pointers \\
                 Dereferencing Raw Pointers Safely \\
                 Example: RefWithFlag \\
                 Nullable Pointers \\
                 Type Sizes and Alignments \\
                 Pointer Arithmetic \\
                 Moving into and out of Memory \\
                 Example: GapBuffer \\
                 Panic Safety in Unsafe Code \\
                 Foreign Functions: Calling C and C++ from Rust \\
                 Finding Common Data Representations \\
                 Declaring Foreign Functions and Variables \\
                 Using Functions from Libraries \\
                 A Raw Interface to libgit2 \\
                 A Safe Interface to libgit2 \\
                 Conclusion \\
                 Index",
}

@Article{Bujanovic:2017:HBA,
  author =       "Zvonimir Bujanovi{\'c} and Lars Karlsson and Daniel
                 Kressner",
  title =        "A {Householder}-based algorithm for
                 {Hessenberg}-triangular reduction",
  journal =      "arxiv.org",
  volume =       "??",
  number =       "??",
  pages =        "??--??",
  day =          "23",
  month =        oct,
  year =         "2017",
  bibdate =      "Fri Dec 21 10:00:58 2018",
  bibsource =    "https://www.math.utah.edu/pub/bibnet/authors/h/householder-alston-s.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://arxiv.org/abs/1710.08538",
  abstract =     "The QZ algorithm for computing eigenvalues and
                 eigenvectors of a matrix pencil $A - \lambda B$ requires that the
                 matrices first be reduced to Hessenberg-triangular (HT)
                 form. The current method of choice for HT reduction
                 relies entirely on Givens rotations regrouped and
                 accumulated into small dense matrices which are
                 subsequently applied using matrix multiplication
                 routines. A non-vanishing fraction of the total flop
                 count must nevertheless still be performed as sequences
                 of overlapping Givens rotations alternately applied
                 from the left and from the right. The many data
                 dependencies associated with this computational pattern
                 leads to inefficient use of the processor and poor
                 scalability. In this paper, we therefore introduce a
                 fundamentally different approach that relies entirely
                 on (large) Householder reflectors partially accumulated
                 into block reflectors, by using (compact) WY
                 representations. Even though the new algorithm requires
                 more floating point operations than the state of the
                 art algorithm, extensive experiments on both real and
                 synthetic data indicate that it is still competitive,
                 even in a sequential setting. The new algorithm is
                 conjectured to have better parallel scalability, an
                 idea which is partially supported by early small-scale
                 experiments using multi-threaded BLAS. The design and
                 evaluation of a parallel formulation is future work.",
  acknowledgement = ack-nhfb,
}

@Article{Cao:2017:HRD,
  author =       "Man Cao and Minjia Zhang and Aritra Sengupta and
                 Swarnendu Biswas and Michael D. Bond",
  title =        "Hybridizing and Relaxing Dependence Tracking for
                 Efficient Parallel Runtime Support",
  journal =      j-TOPC,
  volume =       "4",
  number =       "2",
  pages =        "9:1--9:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3108138",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "It is notoriously challenging to develop parallel
                 software systems that are both scalable and correct.
                 Runtime support for parallelism --- such as multithreaded
                 record and replay, data race detectors, transactional
                 memory, and enforcement of stronger memory models --- helps
                 achieve these goals, but existing commodity solutions
                 slow programs substantially to track (i.e., detect or
                 control) an execution's cross-thread dependencies
                 accurately. Prior work tracks cross-thread dependencies
                 either ``pessimistically,'' slowing every program
                 access, or ``optimistically,'' allowing for lightweight
                 instrumentation of most accesses but dramatically
                 slowing accesses that are conflicting (i.e., involved
                 in cross-thread dependencies). This article presents
                 two novel approaches that seek to improve the
                 performance of dependence tracking. Hybrid tracking
                 (HT) hybridizes pessimistic and optimistic tracking by
                 overcoming a fundamental mismatch between these two
                 kinds of tracking. HT uses an adaptive, profile-based
                 policy to make runtime decisions about switching
                 between pessimistic and optimistic tracking. Relaxed
                 tracking (RT) attempts to reduce optimistic tracking's
                 overhead on conflicting accesses by tracking
                 dependencies in a ``relaxed'' way --- meaning that not all
                 dependencies are tracked accurately --- while still
                 preserving both program semantics and runtime support's
                 correctness. To demonstrate the usefulness and
                 potential of HT and RT, we build runtime support based
                 on the two approaches. Our evaluation shows that both
                 approaches offer performance advantages over existing
                 approaches, but there exist challenges and
                 opportunities for further improvement. HT and RT are
                 distinct solutions to the same problem. It is easier to
                 build runtime support based on HT than on RT, although
                 RT does not incur the overhead of online profiling.
                 This article presents the two approaches together to
                 inform and inspire future designs for efficient
                 parallel runtime support.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Catalan:2017:TEM,
  author =       "Sandra Catal{\'a}n and Francisco D. Igual and Rafael
                 Mayo and Rafael Rodr{\'\i}guez-S{\'a}nchez and Enrique
                 S. Quintana-Ort{\'\i}",
  title =        "Time and energy modeling of a high-performance
                 multi-threaded {Cholesky} factorization",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "73",
  number =       "1",
  pages =        "139--151",
  month =        jan,
  year =         "2017",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-016-1654-6",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Jun 24 10:31:31 MDT 2017",
  bibsource =    "http://link.springer.com/journal/11227/73/1;
                 https://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Chen:2017:IGP,
  author =       "Li-Jhan Chen and Hsiang-Yun Cheng and Po-Han Wang and
                 Chia-Lin Yang",
  title =        "Improving {GPGPU} Performance via Cache Locality Aware
                 Thread Block Scheduling",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "127--131",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2693371",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Modern GPGPUs support the concurrent execution of
                 thousands of threads to provide an energy-efficient
                 platform. However, the massive multi-threading of
                 GPGPUs incurs serious cache contention, as the cache
                 lines brought by one thread can easily be evicted by
                 other threads in the small shared cache. In this paper,
                 we propose a software-hardware cooperative approach
                 that exploits the spatial locality among different
                 thread blocks to better utilize the precious cache
                 capacity. Through dynamic locality estimation and
                 thread block scheduling, we can capture more
                 performance improvement opportunities than prior work
                 that only explores the spatial locality between
                 consecutive thread blocks. Evaluations across diverse
                 GPGPU applications show that, on average, our
                 locality-aware scheduler provides 25 and 9 percent
                 performance improvement over the commonly-employed
                 round-robin scheduler and the state-of-the-art
                 scheduler, respectively.",
  acknowledgement = ack-nhfb,
  affiliation =  "Chen, LJ (Reprint Author), Natl Taiwan Univ, Taipei
                 10617, Taiwan. Chen, Li-Jhan; Wang, Po-Han; Yang,
                 Chia-Lin, Natl Taiwan Univ, Taipei 10617, Taiwan.
                 Cheng, Hsiang-Yun, Acad Sinica, Taipei 11529, Taiwan.",
  author-email = "r03922026@csie.ntu.edu.tw hycheng@citi.sinica.edu.tw
                 f96922002@csie.ntu.edu.tw yangc@csie.ntu.edu.tw",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Ministry of Science and Technology of
                 Taiwan [MOST-105-2221-E-002-156-MY2,
                 MOST-105-2622-8-002-002, MOST-105-2218-E-002-025];
                 MediaTek Inc., Hsin-chu, Taiwan",
  funding-text = "This work is supported in part by research grants from
                 the Ministry of Science and Technology of Taiwan
                 (MOST-105-2221-E-002-156-MY2, MOST-105-2622-8-002-002,
                 and MOST-105-2218-E-002-025), and sponsored by MediaTek
                 Inc., Hsin-chu, Taiwan.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache locality; GPGPU; thread block scheduling",
  number-of-cited-references = "18",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Chen:2017:IGP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Cui:2017:MTA,
  author =       "Huanqing Cui and Jian Niu and Chuanai Zhou and Minglei
                 Shu",
  title =        "A Multi-Threading Algorithm to Detect and Remove
                 Cycles in Vertex- and Arc-Weighted Digraph",
  journal =      j-ALGORITHMS-BASEL,
  volume =       "10",
  number =       "4",
  month =        dec,
  year =         "2017",
  CODEN =        "ALGOCH",
  DOI =          "https://doi.org/10.3390/a10040115",
  ISSN =         "1999-4893 (electronic)",
  ISSN-L =       "1999-4893",
  bibdate =      "Fri May 3 13:50:13 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/algorithms.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://www.mdpi.com/1999-4893/10/4/115",
  acknowledgement = ack-nhfb,
  articleno =    "115",
  fjournal =     "Algorithms (Basel)",
  journal-URL =  "https://www.mdpi.com/journal/algorithms",
  ORCID-numbers = "Huanqing Cui/0000-0002-9251-680X",
  pagecount =    "??",
  pubdates =     "Received: 28 August 2017 / Revised: 26 September 2017
                 / Accepted: 9 October 2017 / Published: 10 October
                 2017",
}

@Article{Dang:2017:ECB,
  author =       "Hoang-Vu Dang and Marc Snir and William Gropp",
  title =        "Eliminating contention bottlenecks in multithreaded
                 {MPI}",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "69",
  number =       "??",
  pages =        "1--23",
  month =        nov,
  year =         "2017",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Tue Oct 24 15:15:02 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819117301187",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

%% Journal article: International Journal of Parallel Programming
%% 45(6) (December 2017), pp. 1326--1365.
@Article{Dutta:2017:SVC,
  author =       "Sudakshina Dutta and Dipankar Sarkar and Arvind
                 Rawat",
  title =        "Synchronization Validation for Cross-Thread
                 Dependences in Parallel Programs",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "45",
  number =       "6",
  pages =        "1326--1365",
  month =        dec,
  year =         "2017",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-016-0467-9",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Sat Nov 18 09:27:28 MST 2017",
  bibsource =    "http://link.springer.com/journal/10766/45/6;
                 https://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

%% Journal article: ACM SIGPLAN Notices 52(6) (June 2017), pp.
%% 540--555; PLDI '17 conference proceedings issue (see remark).
%% [review] Fixed abstract typo: ``focuses on class of loops'' -->
%% ``focuses on a class of loops''.
@Article{Farzan:2017:SDC,
  author =       "Azadeh Farzan and Victor Nicolet",
  title =        "Synthesis of divide and conquer parallelism for
                 loops",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "6",
  pages =        "540--555",
  month =        jun,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3140587.3062355",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:17 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Divide-and-conquer is a common parallel programming
                 skeleton supported by many cross-platform multithreaded
                 libraries, and most commonly used by programmers for
                 parallelization. The challenges of producing (manually
                 or automatically) a correct divide-and-conquer parallel
                 program from a given sequential code are two-fold: (1)
                 assuming that a good solution exists where individual
                 worker threads execute a code identical to the
                 sequential one, the programmer has to provide the extra
                 code for dividing the tasks and combining the partial
                 results (i.e. joins), and (2) the sequential code may
                 not be suitable for divide-and-conquer parallelization
                 as is, and may need to be modified to become a part of
                 a good solution. We address both challenges in this
                 paper. We present an automated synthesis technique to
                 synthesize correct joins and an algorithm for modifying
                 the sequential code to make it suitable for
                 parallelization when necessary. This paper focuses on a
                 class of loops that traverse a read-only collection and
                 compute a scalar function over that collection. We
                 present theoretical results for when the necessary
                 modifications to sequential code are possible,
                 theoretical guarantees for the algorithmic solutions
                 presented here, and experimental evaluation of the
                 approach's success in practice and the quality of the
                 produced parallel programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '17 conference proceedings.",
}

%% Journal article: IEEE Transactions on Computers 66(5) (May 2017),
%% pp. 905--911.
%% NOTE(review): published title may read ``Perf\&Fair'' rather than
%% ``Perf Fair'' --- confirm against DOI 10.1109/TC.2016.2620977
%% before changing.
@Article{Feliu:2017:PFP,
  author =       "J. Feliu and J. Sahuquillo and S. Petit and J. Duato",
  title =        "{Perf Fair}: A Progress-Aware Scheduler to Enhance
                 Performance and Fairness in {SMT} Multicores",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "66",
  number =       "5",
  pages =        "905--911",
  month =        may,
  year =         "2017",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2016.2620977",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Thu Apr 6 07:46:06 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
  keywords =     "Bandwidth; Estimation; fairness; Interference;
                 multicore; Multicore processing; performance
                 estimation; Processor scheduling; Program processors;
                 Resource management; Scheduling; SMT",
}

%% Journal article: Proceedings of the VLDB Endowment 10(12) (August
%% 2017), pp. 1682--1693.
@Article{Gasiunas:2017:FBA,
  author =       "Vaidas Gasiunas and David Dominguez-Sal and Ralph
                 Acker and Aharon Avitzur and Ilan Bronshtein and Rushan
                 Chen and Eli Ginot and Norbert Martinez-Bazan and
                 Michael M{\"u}ller and Alexander Nozdrin and Weijie Ou
                 and Nir Pachter and Dima Sivov and Eliezer Levy",
  title =        "Fiber-based architecture for {NFV} cloud databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "10",
  number =       "12",
  pages =        "1682--1693",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3137765.3137774",
  ISSN =         "2150-8097",
  bibdate =      "Tue Oct 10 17:16:19 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The telco industry is gradually shifting from using
                 monolithic software packages deployed on custom
                 hardware to using modular virtualized software
                 functions deployed on cloudified data centers using
                 commodity hardware. This transformation is referred to
                 as Network Function Virtualization (NFV). The
                 scalability of the databases (DBs) underlying the
                 virtual network functions is the cornerstone for
                 reaping the benefits from the NFV transformation. This
                 paper presents an industrial experience of applying
                 shared-nothing techniques in order to achieve the
                 scalability of a DB in an NFV setup. The special
                 combination of requirements in NFV DBs are not easily
                 met with conventional execution models. Therefore, we
                 designed a special shared-nothing architecture that is
                 based on cooperative multi-tasking using user-level
                 threads (fibers). We further show that the fiber-based
                 approach outperforms the approach built using
                 conventional multi-threading and meets the variable
                 deployment needs of the NFV transformation.
                 Furthermore, fibers yield a simpler-to-maintain
                 software and enable controlling a trade-off between
                 long-duration computations and real-time requests.",
  acknowledgement = ack-nhfb,
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%% Journal article: ACM Transactions on Architecture and Code
%% Optimization (TACO) 14(4) (December 2017), article 54.
%% [review] Fixed abstract typo: dropped the spurious ``and'' in
%% ``... used by each application and is essential''.
@Article{Georgakoudis:2017:SSA,
  author =       "Giorgis Georgakoudis and Hans Vandierendonck and Peter
                 Thoman and Bronis R. {De Supinski} and Thomas Fahringer
                 and Dimitrios S. Nikolopoulos",
  title =        "{SCALO}: Scalability-Aware Parallelism Orchestration
                 for Multi-Threaded Workloads",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "54:1--54:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3158643",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Shared memory machines continue to increase in scale
                 by adding more parallelism through additional cores and
                 complex memory hierarchies. Often, executing multiple
                 applications concurrently, dividing among them hardware
                 threads, provides greater efficiency rather than
                 executing a single application with large thread
                 counts. However, contention for shared resources can
                 limit the improvement of concurrent application
                 execution: orchestrating the number of threads used by
                 each application is essential. In this article, we
                 contribute SCALO, a solution to orchestrate concurrent
                 application execution to increase throughput. SCALO
                 monitors co-executing applications at runtime to
                 evaluate their scalability. Its optimizing thread
                 allocator analyzes these scalability estimates to adapt
                 the parallelism of each program. Unlike previous
                 approaches, SCALO differs by including dynamic
                 contention effects on scalability and by controlling
                 the parallelism during the execution of parallel
                 regions. Thus, it improves throughput when other
                 state-of-the-art approaches fail and outperforms them
                 by up to 40\% when they succeed.",
  acknowledgement = ack-nhfb,
  articleno =    "54",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%% Journal article: ACM Transactions on Architecture and Code
%% Optimization (TACO) 14(1) (April 2017), article 8.
@Article{Georgiou:2017:ETD,
  author =       "Kyriakos Georgiou and Steve Kerrison and Zbigniew
                 Chamski and Kerstin Eder",
  title =        "Energy Transparency for Deeply Embedded Programs",
  journal =      j-TACO,
  volume =       "14",
  number =       "1",
  pages =        "8:1--8:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3046679",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jul 24 18:00:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Energy transparency is a concept that makes a
                 program's energy consumption visible, from hardware up
                 to software, through the different system layers. Such
                 transparency can enable energy optimizations at each
                 layer and between layers, as well as help both
                 programmers and operating systems make energy-aware
                 decisions. In this article, we focus on deeply embedded
                 devices, typically used for Internet of Things (IoT)
                 applications, and demonstrate how to enable energy
                 transparency through existing static resource analysis
                 (SRA) techniques and a new target-agnostic profiling
                 technique, without hardware energy measurements. Our
                 novel mapping technique enables software energy
                 consumption estimations at a higher level than the
                 Instruction Set Architecture (ISA), namely the LLVM
                 intermediate representation (IR) level, and therefore
                 introduces energy transparency directly to the LLVM
                 optimizer. We apply our energy estimation techniques to
                 a comprehensive set of benchmarks, including single-
                 and multithreaded embedded programs from two commonly
                 used concurrency patterns: task farms and pipelines.
                 Using SRA, our LLVM IR results demonstrate a high
                 accuracy with a deviation in the range of 1\% from the
                 ISA SRA. Our profiling technique captures the actual
                 energy consumption at the LLVM IR level with an average
                 error of 3\%.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%% Journal article: ACM Transactions on Embedded Computing Systems
%% 16(5s) (October 2017), article 123.
@Article{Gupta:2017:DDP,
  author =       "Ujjwal Gupta and Chetan Arvind Patil and Ganapati Bhat
                 and Prabhat Mishra and Umit Y. Ogras",
  title =        "{DyPO}: Dynamic {Pareto}-Optimal Configuration
                 Selection for Heterogeneous {MpSoCs}",
  journal =      j-TECS,
  volume =       "16",
  number =       "5s",
  pages =        "123:1--123:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3126530",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Oct 17 18:16:33 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tecs.bib",
  abstract =     "Modern multiprocessor systems-on-chip (MpSoCs) offer
                 tremendous power and performance optimization
                 opportunities by tuning thousands of potential voltage,
                 frequency and core configurations. As the workload
                 phases change at runtime, different configurations may
                 become optimal with respect to power, performance or
                 other metrics. Identifying the optimal configuration at
                 runtime is infeasible due to the large number of
                 workloads and configurations. This paper proposes a
                 novel methodology that can find the Pareto-optimal
                 configurations at runtime as a function of the
                 workload. To achieve this, we perform an extensive
                 offline characterization to find classifiers that map
                 performance counters to optimal configurations. Then,
                 we use these classifiers and performance counters at
                 runtime to choose Pareto-optimal configurations. We
                 evaluate the proposed methodology by maximizing the
                 performance per watt for 18 single- and multi-threaded
                 applications. Our experiments demonstrate an average
                 increase of 93\%, 81\% and 6\% in performance per watt
                 compared to the interactive, on demand and powersave
                 governors, respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "123",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
}

%% Journal article: ACM Transactions on Design Automation of
%% Electronic Systems 22(2) (March 2017), article 30.
%% [review] Braced {Scale} and {Cap} in the title so that BibTeX
%% sentence-casing styles cannot downcase the tool name (matches the
%% file's convention, cf. {SCALO}, {DyPO}, {Perf Fair}).
@Article{Hankendi:2017:SCS,
  author =       "Can Hankendi and Ayse Kivilcim Coskun",
  title =        "{Scale} \& {Cap}: Scaling-Aware Resource Management for
                 Consolidated Multi-threaded Applications",
  journal =      j-TODAES,
  volume =       "22",
  number =       "2",
  pages =        "30:1--30:??",
  month =        mar,
  year =         "2017",
  CODEN =        "ATASFO",
  DOI =          "https://doi.org/10.1145/2994145",
  ISSN =         "1084-4309 (print), 1557-7309 (electronic)",
  ISSN-L =       "1084-4309",
  bibdate =      "Fri Jul 21 10:49:30 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/todaes/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/todaes.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "As the number of cores per server node increases,
                 designing multi-threaded applications has become
                 essential to efficiently utilize the available hardware
                 parallelism. Many application domains have started to
                 adopt multi-threaded programming; thus, efficient
                 management of multi-threaded applications has become a
                 significant research problem. Efficient execution of
                 multi-threaded workloads on cloud environments, where
                 applications are often consolidated by means of
                 virtualization, relies on understanding the
                 multi-threaded specific characteristics of the
                 applications. Furthermore, energy cost and power
                 delivery limitations require data center server nodes
                 to work under power caps, which bring additional
                 challenges to runtime management of consolidated
                 multi-threaded applications. This article proposes a
                 dynamic resource allocation technique for consolidated
                 multi-threaded applications for power-constrained
                 environments. Our technique takes into account
                 application characteristics specific to multi-threaded
                 applications, such as power and performance scaling, to
                 make resource distribution decisions at runtime to
                 improve the overall performance, while accurately
                 tracking dynamic power caps. We implement and evaluate
                 our technique on state-of-the-art servers and show that
                 the proposed technique improves the application
                 performance by up to 21\% under power caps compared to
                 a default resource manager.",
  acknowledgement = ack-nhfb,
  articleno =    "30",
  fjournal =     "ACM Transactions on Design Automation of Electronic
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J776",
}

%% Journal article: ACM Transactions on Architecture and Code
%% Optimization (TACO) 14(3) (September 2017), article 27.
%% [review] Fixed likely abstract typo ``x8-like'' --> ``x86-like''
%% (contrasted with ``general load/store'' architectures).
@Article{Hroub:2017:EGC,
  author =       "Ayman Hroub and M. E. S. Elrabaa and M. F. Mudawar and
                 A. Khayyat",
  title =        "Efficient Generation of Compact Execution Traces for
                 Multicore Architectural Simulations",
  journal =      j-TACO,
  volume =       "14",
  number =       "3",
  pages =        "27:1--27:??",
  month =        sep,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3106342",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Sep 6 17:12:05 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Requiring no functional simulation, trace-driven
                 simulation has the potential of achieving faster
                 simulation speeds than execution-driven simulation of
                 multicore architectures. An efficient, on-the-fly,
                 high-fidelity trace generation method for multithreaded
                 applications is reported. The generated trace is
                 encoded in an instruction-like binary format that can
                 be directly ``interpreted'' by a timing simulator to
                 simulate a general load/store or x86-like architecture.
                 A complete tool suite that has been developed and used
                 for evaluation of the proposed method showed that it
                 produces smaller traces over existing trace compression
                 methods while retaining good fidelity including all
                 threading- and synchronization-related events.",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%% Journal article: ACM SIGAPP Applied Computing Review 17(2) (August
%% 2017), pp. 27--35. Abstract is the truncated (``\ldots'') publisher
%% snippet, as recorded.
@Article{Jung:2017:LSD,
  author =       "Sungbo Jung and Dar-Jen Chang and Juw Won Park",
  title =        "Large scale document inversion using a multi-threaded
                 computing system",
  journal =      j-SIGAPP,
  volume =       "17",
  number =       "2",
  pages =        "27--35",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3131080.3131083",
  ISSN =         "1559-6915 (print), 1931-0161 (electronic)",
  ISSN-L =       "1559-6915",
  bibdate =      "Thu Jan 23 10:25:03 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigapp.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3131080.3131083",
  abstract =     "Current microprocessor architecture is moving towards
                 multi-core/multi-threaded systems. This trend has led
                 to a surge of interest in using multi-threaded
                 computing devices, such as the Graphics Processing Unit
                 (GPU), for general purpose computing. We \ldots{}",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGAPP Applied Computing Review",
  journal-URL =  "https://dl.acm.org/loi/sigapp",
}

%% Book: No Starch Press, 2017; xxvii + 519 pp.
%% [review] Removed a stray closing quote (``''``) left dangling at
%% the end of the abstract, and a stray control-space (``\ '') before
%% a line break in the table of contents.
@Book{Klabnik:2017:RPL,
  author =       "Steve Klabnik and Carol Nichols",
  title =        "The {Rust} Programming Language",
  publisher =    pub-NO-STARCH,
  address =      pub-NO-STARCH:adr,
  pages =        "xxvii + 519",
  year =         "2017",
  ISBN =         "1-59327-828-4 (paperback), 1-59327-851-9 (e-pub)",
  ISBN-13 =      "978-1-59327-828-1 (paperback), 978-1-59327-851-9
                 (e-pub)",
  LCCN =         "QA76.73.R87 K53 2018",
  bibdate =      "Thu Oct 31 18:42:15 MDT 2019",
  bibsource =    "fsz3950.oclc.org:210/WorldCat;
                 https://www.math.utah.edu/pub/tex/bib/master.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "\booktitle{The Rust Programming Language} is the
                 official book on Rust; a community-developed, systems
                 programming language that runs blazingly fast, prevents
                 segfaults, and guarantees thread safety. Rust's memory
                 safety guarantees, enforced at compile time, safeguard
                 your programs against the many problems that pervade
                 other systems languages. Rust offers the control and
                 performance of a low-level language with the helpful
                 abstractions of a high level one, and does this all
                 without having a garbage collector. These
                 characteristics make Rust useful for embedding in other
                 languages, programs with specific space and time
                 requirements, and writing low-level code, like device
                 drivers and operating systems. \booktitle{The Rust
                 Programming Language} begins with a quick hands-on
                 project to introduce the basics, then explores key
                 concepts in depth, such as ownership, the type system,
                 error handling, and fearless concurrency. Detailed
                 explanations of Rust-oriented takes on topics like
                 pattern matching, iterators, and smart pointers combine
                 with examples and exercises to take you from theory to
                 practice. In addition to its thorough coverage of more
                 granular topics, \booktitle{The Rust Programming
                 Language} will show you how to: * Grasp important
                 concepts unique to Rust like ownership, borrowing, and
                 lifetimes; * Use Cargo, Rust's built-in package
                 manager, to build your code, including downloading and
                 building dependencies; * Effectively use Rust's
                 zero-cost abstractions and learn to build your own.
                 Developed with help from the community, \booktitle{The
                 Rust Programming Language} is your official guide to
                 becoming a productive Rust programmer. The official
                 guide to Rust, a community-developed, systems
                 programming language. Begins with a hands-on project to
                 introduce the basics, then explores key concepts in
                 depth",
  acknowledgement = ack-nhfb,
  libnote =      "Not in my library.",
  subject =      "Computer programming; Programming languages
                 (Electronic computers); Computer programming.;
                 Programming languages (Electronic computers)",
  tableofcontents = "Foreword / by Nicholas Matsakis and Aaron Turon \\
                 Introduction \\
                 1: Getting Started \\
                 2: A Quick Tutorial \\
                 Guessing Game \\
                 3: Common Programming Concepts \\
                 4: Understanding Ownership \\
                 5: Structs \\
                 6: Enums and Pattern Matching \\
                 7: Modules \\
                 8: Common Collections \\
                 9: Error Handling \\
                 10: Generic Types, Traits, and Lifetimes \\
                 11: Testing \\
                 12: An Input\slash Output Project \\
                 13: Functional Language Features in Rust \\
                 Iterators and Closures \\
                 14: More about Cargo and Crates io \\
                 15: Smart Pointers \\
                 16: Concurrency \\
                 17: Is Rust Object Oriented? \\
                 18: Patterns \\
                 19: More About Lifetimes \\
                 20: Advanced Type System Features \\
                 Appendix A: Keywords \\
                 Appendix B: Operators \\
                 Appendix C: Derivable Traits \\
                 Appendix D: Nightly Rust \\
                 Nightly Rust \\
                 Glossary",
}

%% Journal article: ACM Transactions on Intelligent Systems and
%% Technology (TIST) 8(4) (July 2017), article 55.
@Article{Kleinmann:2017:ACS,
  author =       "Amit Kleinmann and Avishai Wool",
  title =        "Automatic Construction of Statechart-Based Anomaly
                 Detection Models for Multi-Threaded Industrial Control
                 Systems",
  journal =      j-TIST,
  volume =       "8",
  number =       "4",
  pages =        "55:1--55:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3011018",
  ISSN =         "2157-6904 (print), 2157-6912 (electronic)",
  ISSN-L =       "2157-6904",
  bibdate =      "Sat Dec 23 10:12:41 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tist.bib",
  abstract =     "Traffic of Industrial Control System (ICS) between the
                 Human Machine Interface (HMI) and the Programmable
                 Logic Controller (PLC) is known to be highly periodic.
                 However, it is sometimes multiplexed, due to
                 asynchronous scheduling. Modeling the network traffic
                 patterns of multiplexed ICS streams using Deterministic
                 Finite Automata (DFA) for anomaly detection typically
                 produces a very large DFA and a high false-alarm rate.
                 In this article, we introduce a new modeling approach
                 that addresses this gap. Our Statechart DFA modeling
                 includes multiple DFAs, one per cyclic pattern,
                 together with a DFA-selector that de-multiplexes the
                 incoming traffic into sub-channels and sends them to
                 their respective DFAs. We demonstrate how to
                 automatically construct the statechart from a captured
                 traffic stream. Our unsupervised learning algorithms
                 first build a Discrete-Time Markov Chain (DTMC) from
                 the stream. Next, we split the symbols into sets, one
                 per multiplexed cycle, based on symbol frequencies and
                 node degrees in the DTMC graph. Then, we create a
                 sub-graph for each cycle and extract Euler cycles for
                 each sub-graph. The final statechart is comprised of
                 one DFA per Euler cycle. The algorithms allow for
                 non-unique symbols, which appear in more than one
                 cycle, and also for symbols that appear more than once
                 in a cycle. We evaluated our solution on traces from a
                 production ICS using the Siemens S7-0x72 protocol. We
                 also stress-tested our algorithms on a collection of
                 synthetically-generated traces that simulated
                 multiplexed ICS traces with varying levels of symbol
                 uniqueness and time overlap. The algorithms were able
                 to split the symbols into sets with 99.6\% accuracy.
                 The resulting statechart modeled the traces with a
                 median false-alarm rate of as low as 0.483\%. In all
                 but the most extreme scenarios, the Statechart model
                 drastically reduced both the false-alarm rate and the
                 learned model size in comparison with the naive
                 single-DFA model.",
  acknowledgement = ack-nhfb,
  articleno =    "55",
  fjournal =     "ACM Transactions on Intelligent Systems and Technology
                 (TIST)",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1318",
}

%% Journal article: ACM Transactions on Computational Logic 18(1)
%% (April 2017), article 3.
@Article{Kojima:2017:HLG,
  author =       "Kensuke Kojima and Atsushi Igarashi",
  title =        "A {Hoare} Logic for {GPU} Kernels",
  journal =      j-TOCL,
  volume =       "18",
  number =       "1",
  pages =        "3:1--3:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3001834",
  ISSN =         "1529-3785 (print), 1557-945X (electronic)",
  ISSN-L =       "1529-3785",
  bibdate =      "Thu Apr 13 17:53:54 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocl/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocl.bib",
  abstract =     "We study a Hoare Logic to reason about parallel
                 programs executed on graphics processing units (GPUs),
                 called GPU kernels. During the execution of GPU
                 kernels, multiple threads execute in lockstep, that is,
                 execute the same instruction simultaneously. When the
                 control branches, the two branches are executed
                 sequentially, but during the execution of each branch
                 only those threads that take it are enabled; after the
                 control converges, all the threads are enabled and
                 again execute in lockstep. In this article, we first
                 consider a semantics in which all threads execute in
                 lockstep (this semantics simplifies the actual
                 execution model of GPUs) and adapt Hoare Logic to this
                 setting by augmenting the usual Hoare triples with an
                 additional component representing the set of enabled
                 threads. It is determined that the soundness and
                 relative completeness of the logic do not hold for all
                 programs; a difficulty arises from the fact that one
                 thread can invalidate the loop termination condition of
                 another thread through shared memory. We overcome this
                 difficulty by identifying an appropriate class of
                 programs for which the soundness and relative
                 completeness hold. Additionally, we discuss thread
                 interleaving, which is present in the actual execution
                 of GPUs but not in the lockstep semantics mentioned
                 above. We show that if a program is race free, then the
                 lockstep and interleaving semantics produce the same
                 result. This implies that our logic is sound and
                 relatively complete for race-free programs, even if the
                 thread interleaving is taken into account.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Computational Logic",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J773",
}

@Article{Komosinski:2017:MCE,
  author =       "Maciej Komosinski and Szymon Ulatowski",
  title =        "Multithreaded computing in evolutionary design and in
                 artificial life simulations",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "73",
  number =       "5",
  pages =        "2214--2228",
  month =        may,
  year =         "2017",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-016-1923-4",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Jun 24 10:31:33 MDT 2017",
  bibsource =    "http://link.springer.com/journal/11227/73/5;
                 https://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer.com/content/pdf/10.1007/s11227-016-1923-4.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Kopczynski:2017:LSS,
  author =       "Eryk Kopczy{\'n}ski and Szymon Toru{\'n}czyk",
  title =        "{LOIS}: syntax and semantics",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "1",
  pages =        "586--598",
  month =        jan,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3093333.3009876",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "We present the semantics of an imperative programming
                 language called LOIS (Looping Over Infinite Sets),
                 which allows iterating through certain infinite sets,
                 in finite time. Our semantics intuitively correspond to
                 execution of infinitely many threads in parallel. This
                 allows to merge the power of abstract mathematical
                 constructions into imperative programming. Infinite
                 sets are internally represented using first order
                 formulas over some underlying logical structure, and
                 SMT solvers are employed to evaluate programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "POPL '17 conference proceedings.",
}

@Article{Lee:2017:MVN,
  author =       "Doowon Lee and Valeria Bertacco",
  title =        "{MTraceCheck}: Validating Non-Deterministic Behavior
                 of Memory Consistency Models in Post-Silicon
                 Validation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "201--213",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080235",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This work presents a minimally-intrusive,
                 high-performance, post-silicon validation framework for
                 validating memory consistency in multi-core systems.
                 Our framework generates constrained-random tests that
                 are instrumented with observability-enhancing code for
                 memory consistency verification. For each test, we
                 generate a set of compact signatures reflecting the
                 memory-ordering patterns observed over many executions
                 of the test, with each of the signatures corresponding
                 to a unique memory-ordering pattern. We then leverage
                 an efficient and novel analysis to quickly determine if
                 the observed execution patterns represented by each
                 unique signature abide by the memory consistency model.
                 Our analysis derives its efficiency by exploiting the
                 structural similarities among the patterns observed. We
                 evaluated our framework, MTraceCheck, on two platforms:
                 an x86-based desktop and an ARM-based SoC platform,
                 both running multi-threaded test programs in a
                 bare-metal environment. We show that MTraceCheck
                 reduces the perturbation introduced by the
                 memory-ordering monitoring activity by 93\% on average,
                 compared to a baseline register flushing approach that
                 saves the register's state after each load operation.
                 We also reduce the computation requirements of our
                 consistency checking analysis by 81\% on average,
                 compared to a conventional topological sorting
                 solution. We finally demonstrate the effectiveness of
                 MTraceCheck on buggy designs, by evaluating multiple
                 case studies where it successfully exposes subtle bugs
                 in a full-system simulation environment.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
}

@Article{Li:2017:EML,
  author =       "Cha V. Li and Vinicius Petrucci and Daniel Moss{\'e}",
  title =        "Exploring Machine Learning for Thread Characterization
                 on Heterogeneous Multiprocessors",
  journal =      j-OPER-SYS-REV,
  volume =       "51",
  number =       "1",
  pages =        "113--123",
  month =        aug,
  year =         "2017",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/3139645.3139664",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Fri Sep 15 10:37:05 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/opersysrev.bib",
  abstract =     "We introduce a thread characterization method that
                 explores hardware performance counters and machine
                 learning techniques to automate estimating workload
                 execution on heterogeneous processors. We show that our
                 characterization scheme achieves higher accuracy when
                 predicting performance indicators, such as instructions
                 per cycle and last-level cache misses, commonly used to
                 determine the mapping of threads to processor types at
                 runtime. We also show that support vector regression
                 achieves higher accuracy when compared to linear
                 regression, and has very low (1\%) overhead. The
                 results presented in this paper can provide a
                 foundation for advanced investigations and interesting
                 new directions in intelligent thread scheduling and
                 power management on multiprocessors.",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J597",
}

@Article{Li:2017:GGB,
  author =       "Yuxiang Li and Yinliang Zhao and Qiangsheng Wu",
  title =        "{GbA}: a graph-based thread partition approach in
                 speculative multithreading",
  journal =      j-CCPE,
  volume =       "29",
  number =       "21",
  pages =        "??--??",
  day =          "10",
  month =        nov,
  year =         "2017",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.4294",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat Dec 30 09:11:58 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Lin:2017:MSP,
  author =       "Zhongwei Lin and Carl Tropper and Robert A. McDougal
                 and Mohammad Nazrul Ishlam Patoary and William W.
                 Lytton and Yiping Yao and Michael L. Hines",
  title =        "Multithreaded Stochastic {PDES} for Reactions and
                 Diffusions in Neurons",
  journal =      j-TOMACS,
  volume =       "27",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2017",
  CODEN =        "ATMCEZ",
  DOI =          "https://doi.org/10.1145/2987373",
  ISSN =         "1049-3301 (print), 1558-1195 (electronic)",
  ISSN-L =       "1049-3301",
  bibdate =      "Tue Jul 11 15:41:32 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomacs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomacs.bib",
  abstract =     "Cells exhibit stochastic behavior when the number of
                 molecules is small. Hence a stochastic
                 reaction-diffusion simulator capable of working at
                 scale can provide a more accurate view of molecular
                 dynamics within the cell. This article describes a
                 parallel discrete event simulator, Neuron Time
                 Warp-Multi Thread (NTW-MT), developed for the
                 simulation of reaction diffusion models of neurons. To
                 the best of our knowledge, this is the first parallel
                 discrete event simulator oriented toward stochastic
                 simulation of chemical reactions in a neuron. The
                 simulator was developed as part of the NEURON project.
                 NTW-MT is optimistic and thread based, which attempts
                 to capitalize on multicore architectures used in high
                 performance machines. It makes use of a multilevel
                 queue for the pending event set and a single rollback
                 message in place of individual antimessages to disperse
                 contention and decrease the overhead of processing
                 rollbacks. Global Virtual Time is computed
                 asynchronously both within and among processes to get
                 rid of the overhead for synchronizing threads. Memory
                 usage is managed in order to avoid locking and
                 unlocking when allocating and deallocating memory and
                 to maximize cache locality. We verified our simulator
                 on a calcium buffer model. We examined its performance
                 on a calcium wave model, comparing it to the
                 performance of a process based optimistic simulator and
                 a threaded simulator which uses a single priority queue
                 for each thread. Our multithreaded simulator is shown
                 to achieve superior performance to these simulators.
                 Finally, we demonstrated the scalability of our
                 simulator on a larger Calcium-Induced Calcium Release
                 (CICR) model and a more detailed CICR model.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Modeling and Computer Simulation",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J781",
}

@Article{Luo:2017:TDS,
  author =       "Hao Luo and Pengcheng Li and Chen Ding",
  title =        "Thread Data Sharing in Cache: Theory and Measurement",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "8",
  pages =        "103--115",
  month =        aug,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3155284.3018759",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Dec 1 18:56:12 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "On modern multi-core processors, independent workloads
                 often interfere with each other by competing for shared
                 cache space. However, for multi-threaded workloads,
                 where a single copy of data can be accessed by multiple
                 threads, the threads can cooperatively share cache.
                 Because data sharing consolidates the collective
                 working set of threads, the effective size of shared
                 cache becomes larger than it would have been when data
                 are not shared. This paper presents a new theory of
                 data sharing. It includes (1) a new metric called the
                 shared footprint to mathematically compute the amount
                 of data shared by any group of threads in any size
                 cache, and (2) a linear-time algorithm to measure
                 shared footprint by scanning the memory trace of a
                 multi-threaded program. The paper presents the
                 practical implementation and evaluates the new theory
                 using 14 PARSEC and SPEC OMP benchmarks, including an
                 example use of shared footprint in program
                 optimization.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '17 conference proceedings.",
}

@Article{Marquez:2017:MCH,
  author =       "David Gonzalez Marquez and Adrian Cristal Kestelman
                 and Esteban Mocskos",
  title =        "{Mth}: Codesigned Hardware\slash Software Support for
                 Fine Grain Threads",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "1",
  pages =        "64--67",
  month =        jan # "\slash " # jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2606383",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multi-core processors are ubiquitous in all market
                 segments from embedded to high performance computing,
                 but only few applications can efficiently utilize them.
                 Existing parallel frameworks aim to support
                 thread-level parallelism in applications, but the
                 imposed overhead prevents their usage for small problem
                 instances. This work presents Micro-threads (Mth) a
                 hardware-software proposal focused on a shared thread
                 management model enabling the use of parallel resources
                 in applications that have small chunks of parallel code
                 or small problem inputs by a combination of software
                 and hardware: delegation of the resource control to the
                 application, an improved mechanism to store and fill
                 processor's context, and an efficient synchronization
                 system. Four sample applications are used to test our
                 proposal: HSL filter (trivially parallel), FFT Radix2
                 (recursive algorithm), LU decomposition (barrier every
                 cycle) and Dantzig algorithm (graph based, matrix
                 manipulation). The results encourage the use of Mth and
                 could smooth the use of multiple cores for applications
                 that currently can not take advantage of the
                 proliferation of the available parallel resources in
                 each chip.",
  acknowledgement = ack-nhfb,
  affiliation =  "Marquez, DG (Reprint Author), Univ Buenos Aires, Fac
                 Ciencias Exactas \& Nat, Dept Comp Sci, C1428EGA,
                 RA-1053 Buenos Aires, DF, Argentina. Marquez, David
                 Gonzalez; Mocskos, Esteban, Univ Buenos Aires, Fac
                 Ciencias Exactas \& Nat, Dept Comp Sci, C1428EGA,
                 RA-1053 Buenos Aires, DF, Argentina. Mocskos, Esteban,
                 CSC CONICET, C1425FQD, RA-2390 Buenos Aires, DF,
                 Argentina. Kestelman, Adrian Cristal, CSIC, IIIA,
                 Barcelona Supercomp Ctr, ES-08034 Barcelona, Spain.
                 Kestelman, Adrian Cristal, Univ Politecn Cataluna, Dept
                 Comp Architecture, ES-08034 Barcelona, Spain.",
  author-email = "dmarquez@dc.uba.ar adrian.cristal@bsc.es
                 emocskos@dc.uba.ar",
  da =           "2019-06-20",
  doc-delivery-number = "EY5PB",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Universidad de Buenos Aires [UBACyT
                 20020130200096BA]; CONICET [PIP 11220110100379]",
  funding-text = "This work was partially funded by grants from
                 Universidad de Buenos Aires (UBACyT 20020130200096BA)
                 and CONICET (PIP 11220110100379). The authors thank
                 specially Osman Unsal for reading this article with
                 fruitful criticism.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "multicore processing; multithreading; Parallel
                 architectures; parallel programming",
  keywords-plus = "PARALLELISM",
  number-of-cited-references = "11",
  ORCID-numbers = "Mocskos, Esteban/0000-0002-6473-7672",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Marquez:2017:MCH",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Matheou:2017:DDC,
  author =       "George Matheou and Paraskevas Evripidou",
  title =        "Data-Driven Concurrency for High Performance
                 Computing",
  journal =      j-TACO,
  volume =       "14",
  number =       "4",
  pages =        "53:1--53:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3162014",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Dec 22 18:25:55 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In this work, we utilize dynamic dataflow/data-driven
                 techniques to improve the performance of high
                 performance computing (HPC) systems. The proposed
                 techniques are implemented and evaluated through an
                 efficient, portable, and robust programming framework
                 that enables data-driven concurrency on HPC systems.
                 The proposed framework is based on data-driven
                 multithreading (DDM), a hybrid control-flow/dataflow
                 model that schedules threads based on data availability
                 on sequential processors. The proposed framework was
                 evaluated using several benchmarks, with different
                 characteristics, on two different systems: a 4-node AMD
                 system with a total of 128 cores and a 64-node Intel
                 HPC system with a total of 768 cores. The performance
                 evaluation shows that the proposed framework scales
                 well and tolerates scheduling overheads and memory
                 latencies effectively. We also compare our framework to
                 MPI, DDM-VM, and OmpSs@Cluster. The comparison results
                 show that the proposed framework obtains comparable or
                 better performance.",
  acknowledgement = ack-nhfb,
  articleno =    "53",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Meier:2017:PVM,
  author =       "Remigius Meier and Armin Rigo and Thomas R. Gross",
  title =        "Parallel virtual machines with {RPython}",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "2",
  pages =        "48--59",
  month =        feb,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3093334.2989233",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:15 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "The RPython framework takes an interpreter for a
                 dynamic language as its input and produces a Virtual
                 Machine (VM) for that language. RPython is being
                 used to develop PyPy, a high-performance Python
                 interpreter. However, the produced VM does not support
                 parallel execution since the framework relies on a
                 Global Interpreter Lock (GIL): PyPy serialises the
                 execution of multi-threaded Python programs. We
                 describe the rationale and design of a new parallel
                 execution model for RPython that allows the generation
                 of parallel virtual machines while leaving the language
                 semantics unchanged. This model then allows different
                 implementations of concurrency control, and we discuss
                 an implementation based on a GIL and an implementation
                 based on Software Transactional Memory (STM). To
                 evaluate the benefits of either choice, we adapt PyPy
                 to work with both implementations (GIL and STM). The
                 evaluation shows that PyPy with STM improves the
                 runtime of a set of multi-threaded Python programs over
                 PyPy with a GIL by factors in the range of 1.87 $
                 \times $ up to 5.96 $ \times $ when executing on a
                 processor with 8 cores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "DLS '16 conference proceedings.",
}

@Article{Nazarpour:2017:CPS,
  author =       "Hosein Nazarpour and Yli{\`e}s Falcone and Saddek
                 Bensalem and Marius Bozga",
  title =        "Concurrency-preserving and sound monitoring of
                 multi-threaded component-based systems: theory,
                 algorithms, implementation, and evaluation",
  journal =      j-FORM-ASP-COMPUT,
  volume =       "29",
  number =       "6",
  pages =        "951--986",
  month =        nov,
  year =         "2017",
  CODEN =        "FACME5",
  DOI =          "https://doi.org/10.1007/s00165-017-0422-6",
  ISSN =         "0934-5043 (print), 1433-299X (electronic)",
  ISSN-L =       "0934-5043",
  bibdate =      "Thu Nov 23 07:37:44 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/formaspcomput.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer.com/article/10.1007/s00165-017-0422-6",
  acknowledgement = ack-nhfb,
  fjournal =     "Formal Aspects of Computing",
  journal-URL =  "http://link.springer.com/journal/165",
}

@Article{Nutaro:2017:HAA,
  author =       "James Nutaro and Bernard Zeigler",
  title =        "How to apply {Amdahl}'s law to multithreaded multicore
                 processors",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "107",
  number =       "??",
  pages =        "1--2",
  month =        sep,
  year =         "2017",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Sat Aug 19 13:10:31 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0743731517300941",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Park:2017:HHC,
  author =       "Jaehyun Park and Seungcheol Baek and Hyung Gyu Lee and
                 Chrysostomos Nicopoulos and Vinson Young and Junghee
                 Lee and Jongman Kim",
  title =        "{HoPE}: Hot-Cacheline Prediction for Dynamic Early
                 Decompression in Compressed {LLCs}",
  journal =      j-TODAES,
  volume =       "22",
  number =       "3",
  pages =        "40:1--40:??",
  month =        may,
  year =         "2017",
  CODEN =        "ATASFO",
  DOI =          "https://doi.org/10.1145/2999538",
  ISSN =         "1084-4309 (print), 1557-7309 (electronic)",
  ISSN-L =       "1084-4309",
  bibdate =      "Fri Jul 21 10:49:30 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/todaes/;
                 https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/todaes.bib",
  abstract =     "Data compression plays a pivotal role in improving
                 system performance and reducing energy consumption,
                 because it increases the logical effective capacity of
                 a compressed memory system without physically
                 increasing the memory size. However, data compression
                 techniques incur some cost, such as non-negligible
                 compression and decompression overhead. This overhead
                 becomes more severe if compression is used in the
                 cache. In this article, we aim to minimize the read-hit
                 decompression penalty in compressed Last-Level Caches
                 (LLCs) by speculatively decompressing frequently used
                 cachelines. To this end, we propose a Hot-cacheline
                 Prediction and Early decompression (HoPE) mechanism
                 that consists of three synergistic techniques:
                 Hot-cacheline Prediction (HP), Early Decompression
                 (ED), and Hit-history-based Insertion (HBI). HP and HBI
                 efficiently identify the hot compressed cachelines,
                 while ED selectively decompresses hot cachelines, based
                 on their size information. Unlike previous approaches,
                 the HoPE framework considers the performance
                 balance/tradeoff between the increased effective cache
                 capacity and the decompression penalty. To evaluate the
                 effectiveness of the proposed HoPE mechanism, we run
                 extensive simulations on memory traces obtained from
                 multi-threaded benchmarks running on a full-system
                 simulation framework. We observe significant
                 performance improvements over compressed cache schemes
                 employing the conventional Least-Recently Used (LRU)
                 replacement policy, the Dynamic Re-Reference Interval
                 Prediction (DRRIP) scheme, and the Effective Capacity
                 Maximizer (ECM) compressed cache management mechanism.
                 Specifically, HoPE exhibits system performance
                 improvements of approximately 11\%, on average, over
                 LRU, 8\% over DRRIP, and 7\% over ECM by reducing the
                 read-hit decompression penalty by around 65\%, over a
                 wide range of applications.",
  acknowledgement = ack-nhfb,
  articleno =    "40",
  fjournal =     "ACM Transactions on Design Automation of Electronic
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J776",
}

@Article{Pathania:2017:DTM,
  author =       "Anuj Pathania and Vanchinathan Venkataramani and
                 Muhammad Shafique and Tulika Mitra and J{\"o}rg
                 Henkel",
  title =        "Defragmentation of Tasks in Many-Core Architecture",
  journal =      j-TACO,
  volume =       "14",
  number =       "1",
  pages =        "2:1--2:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3050437",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jul 24 18:00:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Many-cores can execute multiple multithreaded tasks in
                 parallel. A task performs most efficiently when it is
                 executed over a spatially connected and compact subset
                 of cores so that performance loss due to communication
                 overhead imposed by the task's threads spread across
                 the allocated cores is minimal. Over a span of time,
                 unallocated cores can get scattered all over the
                 many-core, creating fragments in the task mapping.
                 These fragments can prevent efficient contiguous
                 mapping of incoming new tasks leading to loss of
                 performance. This problem can be alleviated by using a
                 task defragmenter, which consolidates smaller fragments
                 into larger fragments wherein the incoming tasks can be
                 efficiently executed. Optimal defragmentation of a
                 many-core is an NP-hard problem in the general case.
                 Therefore, we simplify the original problem to a
                 problem that can be solved optimally in polynomial
                 time. In this work, we introduce a concept of
                 exponentially separable mapping (ESM), which defines a
                 set of task mapping constraints on a many-core. We
                 prove that an ESM enforcing many-core can be
                 defragmented optimally in polynomial time.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Pereira:2017:SBC,
  author =       "Phillipe Pereira and Higo Albuquerque and Isabela da
                 Silva and Hendrio Marques and Felipe Monteiro and
                 Ricardo Ferreira and Lucas Cordeiro",
  title =        "{SMT}-based context-bounded model checking for {CUDA}
                 programs",
  journal =      j-CCPE,
  volume =       "29",
  number =       "22",
  pages =        "??--??",
  day =          "25",
  month =        nov,
  year =         "2017",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.3934",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat Dec 30 09:11:59 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@Article{Radulovic:2017:LLI,
  author =       "Milan B. Radulovi{\'c} and Sylvain Girbal and Milo V.
                 Tomasevi{\'c}",
  title =        "Low-level implementation of the {SISC} protocol for
                 thread-level speculation on a multi-core architecture",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "67",
  number =       "??",
  pages =        "1--19",
  month =        sep,
  year =         "2017",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Wed Aug 9 14:49:25 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819117300972",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Reiche:2017:AVI,
  author =       "Oliver Reiche and Christof Kobylko and Frank Hannig
                 and J{\"u}rgen Teich",
  title =        "Auto-vectorization for image processing {DSLs}",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "4",
  pages =        "21--30",
  month =        may,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3140582.3081039",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Sep 16 10:18:15 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "The parallelization of programs and distributing their
                 workloads to multiple threads can be a challenging
                 task. In addition to multi-threading, harnessing vector
                 units in CPUs proves highly desirable. However,
                 employing vector units to speed up programs can be
                 quite tedious. Either a program developer solely relies
                 on the auto-vectorization capabilities of the compiler
                 or he manually applies vector intrinsics, which is
                 extremely error-prone, difficult to maintain, and not
                 portable at all. Based on whole-function vectorization,
                 a method to replace control flow with data flow, we
                 propose auto-vectorization techniques for image
                 processing DSLs in the context of source-to-source
                 compilation. The approach does not require the input to
                 be available in SSA form. Moreover, we formulate
                 constraints under which the vectorization analysis and
                 code transformations may be greatly simplified in the
                 context of image processing DSLs. As part of our
                 methodology, we present control flow to data flow
                 transformation as a source-to-source translation.
                 Moreover, we propose a method to efficiently analyze
                 algorithms with mixed bit-width data types to determine
                 the optimal SIMD width, independently of the target
                 instruction set. The techniques are integrated into an
                 open source DSL framework. Subsequently, the
                 vectorization capabilities are compared to a variety of
                 existing state-of-the-art C/C++ compilers. A geometric
                 mean speedup of up to 3.14 is observed for benchmarks
                 taken from ISPC and image processing, compared to
                 non-vectorized executions.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "LCTES '17 conference proceedings.",
}

@Article{Saarikivi:2017:MTS,
  author =       "Olli Saarikivi and Hern{\'a}n Ponce-De-Le{\'o}n and
                 Kari K{\"a}hk{\"o}nen and Keijo Heljanko and Javier
                 Esparza",
  title =        "Minimizing Test Suites with Unfoldings of
                 Multithreaded Programs",
  journal =      j-TECS,
  volume =       "16",
  number =       "2",
  pages =        "45:1--45:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3012281",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Mon Jul 24 09:51:12 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tecs.bib",
  abstract =     "This article focuses on computing minimal test suites
                 for multithreaded programs. Based on previous work on
                 test case generation for multithreaded programs using
                 unfoldings, this article shows how this unfolding can
                 be used to generate minimal test suites covering all
                 local states of the program. Generating such minimal
                 test suites is shown to be NP-complete in the size of
                 the unfolding. We propose an SMT encoding for this
                 problem and two methods based on heuristics which only
                 approximate the solution, but scale better in practice.
                 Finally, we apply our methods to compute the minimal
                 test suites for several benchmarks.",
  acknowledgement = ack-nhfb,
  articleno =    "45",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
}

@Article{Sanderson:2017:PGP,
  author =       "Conrad Sanderson and Ryan Curtin",
  title =        "\pkg{gmm\_diag} and \pkg{gmm\_full}: {C++} classes for
                 multi-threaded {Gaussian} mixture models and
                 Expectation-Maximisation",
  journal =      j-J-OPEN-SOURCE-SOFT,
  volume =       "2",
  number =       "18",
  pages =        "365:1--365:2",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.21105/joss.00365",
  ISSN =         "2475-9066",
  ISSN-L =       "2475-9066",
  bibdate =      "Thu Sep 13 08:09:35 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/joss.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://joss.theoj.org/papers/10.21105/joss.00365",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Open Source Software",
  journal-URL =  "http://joss.theoj.org/;
                 https://github.com/openjournals/joss-papers/",
  onlinedate =   "16 October 2017",
  ORCID-numbers = "Conrad Sanderson / 0000-0002-0049-4501; Ryan Curtin /
                 0000-0002-9903-8214",
}

@Article{Schafer:2017:PHL,
  author =       "Benjamin Carrion Schafer",
  title =        "Parallel High-Level Synthesis Design Space Exploration
                 for Behavioral {IPs} of Exact Latencies",
  journal =      j-TODAES,
  volume =       "22",
  number =       "4",
  pages =        "65:1--65:??",
  month =        jul,
  year =         "2017",
  CODEN =        "ATASFO",
  DOI =          "https://doi.org/10.1145/3041219",
  ISSN =         "1084-4309 (print), 1557-7309 (electronic)",
  ISSN-L =       "1084-4309",
  bibdate =      "Mon Jan 22 09:03:32 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/todaes.bib",
  abstract =     "This works presents a Design Space Exploration (DSE)
                 method for Behavioral IPs (BIPs) given in ANSI-C or
                 SystemC to find the smallest micro-architecture for a
                 specific target latency. Previous work on High-Level
                 Synthesis (HLS) DSE mainly focused on finding a
                 tradeoff curve with Pareto-optimal designs. HLS is,
                 however, a single process (component) synthesis method.
                 Very often, the latency of the components requires a
                 specific fixed latency when inserted within a larger
                 system. This work presents a fast multi-threaded method
                 to find the smallest micro-architecture for a given BIP
                 and target latency by discriminating between all
                 different exploration knobs and exploring these
                 concurrently. Experimental results show that our
                 proposed method is very effective and comprehensive
                 results compare the quality of results vs. the speedup
                  of our proposed explorer.",
  acknowledgement = ack-nhfb,
  articleno =    "65",
  fjournal =     "ACM Transactions on Design Automation of Electronic
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J776",
}

@Article{Tian:2017:RSP,
  author =       "Z. Tian and T. Liu and Q. Zheng and E. Zhuang and M.
                 Fan and Z. Yang",
  title =        "Reviving Sequential Program Birthmarking for
                 Multithreaded Software Plagiarism Detection",
  journal =      j-IEEE-TRANS-SOFTW-ENG,
  volume =       "PP",
  number =       "99",
  pages =        "1--1",
  month =        "????",
  year =         "2017",
  CODEN =        "IESEDJ",
  DOI =          "https://doi.org/10.1109/TSE.2017.2688383",
  ISSN =         "0098-5589 (print), 1939-3520 (electronic)",
  ISSN-L =       "0098-5589",
  bibdate =      "Thu Feb 1 19:49:24 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7888597",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Software Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32",
}

@Article{Turakhia:2017:TPE,
  author =       "Yatish Turakhia and Guangshuo Liu and Siddharth Garg
                 and Diana Marculescu",
  title =        "Thread Progress Equalization: Dynamically Adaptive
                 Power-Constrained Performance Optimization of
                 Multi-Threaded Applications",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "66",
  number =       "4",
  pages =        "731--744",
  month =        "????",
  year =         "2017",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2016.2608951",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Sat Mar 11 14:24:09 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Utterback:2017:POR,
  author =       "Robert Utterback and Kunal Agrawal and I-Ting Angelina
                 Lee and Milind Kulkarni",
  title =        "Processor-Oblivious Record and Replay",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "8",
  pages =        "145--161",
  month =        aug,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3155284.3018764",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Dec 1 18:56:12 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Record-and-replay systems are useful tools for
                 debugging non-deterministic parallel programs by first
                 recording an execution and then replaying that
                 execution to produce the same access pattern. Existing
                 record-and-replay systems generally target thread-based
                 execution models, and record the behaviors and
                 interleavings of individual threads. Dynamic
                 multithreaded languages and libraries, such as the Cilk
                 family, OpenMP, TBB, etc., do not have a notion of
                 threads. Instead, these languages provide a
                 processor-oblivious model of programming, where
                 programs expose task-parallelism using high-level
                 constructs such as spawn/sync without regard to the
                 number of threads/cores available to run the program.
                 Thread-based record-and-replay would violate the
                 processor-oblivious nature of these programs, as they
                 incorporate the number of threads into the recorded
                 information, constraining the replayed execution to the
                 same number of threads. In this paper, we present a
                 processor-oblivious record-and-replay scheme for such
                 languages where record and replay can use different
                 number of processors and both are scheduled using work
                 stealing. We provide theoretical guarantees for our
                 record and replay scheme --- namely that record is
                 optimal for programs with one lock and replay is
                 near-optimal for all cases. In addition, we implemented
                 this scheme in the Cilk Plus runtime system and our
                 evaluation indicates that processor-obliviousness does
                 not cause substantial overheads.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '17 conference proceedings.",
}

@Article{Wang:2017:JRJ,
  author =       "Kaiyuan Wang and Sarfraz Khurshid and Milos Gligoric",
  title =        "{JPR}: Replaying {JPF} Traces Using Standard {JVM}",
  journal =      j-SIGSOFT,
  volume =       "42",
  number =       "4",
  pages =        "1--5",
  month =        oct,
  year =         "2017",
  CODEN =        "SFENDP",
  DOI =          "https://doi.org/10.1145/3149485.3149494",
  ISSN =         "0163-5948 (print), 1943-5843 (electronic)",
  ISSN-L =       "0163-5948",
  bibdate =      "Wed Aug 1 17:16:48 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigsoft2010.bib",
  abstract =     "Java PathFinder (JPF) is a backtrackable Java Virtual
                 Machine (JVM), which is implemented in Java and runs on
                 a standard JVM (e.g., Oracle HotSpot). Thus, a JPF
                 developer can use off-the-shelf Java debuggers (e.g.,
                 jdb) when debugging code that makes up JPF. JPF
                 explores all non-deterministic executions of a given
                 target program and monitors for property violations. To
                 facilitate debugging of the target program, JPF can
                 capture and replay the execution trace that leads to a
                 property violation. While the deterministic replay is
                 invaluable, the replay with JPF does not allow the
                 developer to attach an off-the-shelf Java debugger to
                 the target program (e.g., step through the application
                 code, set breakpoints, etc.). We present a technique,
                 dubbed JPR, to improve the debugging experience of the
                 JPF captured traces by migrating the JPF traces to a
                 new format that can be executed using the standard JVM.
                 JPR annotates each JPF trace, during the capture phase,
                 with extra data (e.g., instruction index, instruction
                 count, etc.); the annotated trace is then used to
                 instrument Java bytecode to enforce the same execution
                 trace on a standard JVM. JPR is compatible with various
                 optimizations, e.g., state matching and partial-order
                 reduction. We evaluated JPR on all multithreaded Java
                 programs in the official JPF distribution. Our results
                 show that JPR successfully replayed all JPF traces on
                 the standard JVM with reasonable overhead during both
                 recording and replaying.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGSOFT Software Engineering Notes",
  journal-URL =  "https://dl.acm.org/citation.cfm?id=J728",
}

@Article{Yeh:2017:PFG,
  author =       "Tsung Tai Yeh and Amit Sabne and Putt Sakdhnagool and
                 Rudolf Eigenmann and Timothy G. Rogers",
  title =        "{Pagoda}: Fine-Grained {GPU} Resource Virtualization
                 for Narrow Tasks",
  journal =      j-SIGPLAN,
  volume =       "52",
  number =       "8",
  pages =        "221--234",
  month =        aug,
  year =         "2017",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3155284.3018754",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Dec 1 18:56:12 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Massively multithreaded GPUs achieve high throughput
                 by running thousands of threads in parallel. To fully
                 utilize the hardware, workloads spawn work to the GPU
                 in bulk by launching large tasks, where each task is a
                 kernel that contains thousands of threads that occupy
                 the entire GPU. GPUs face severe underutilization and
                 their performance benefits vanish if the tasks are
                 narrow, i.e., they contain {$<$} 500 threads.
                 Latency-sensitive applications in network, signal, and
                 image processing that generate a large number of tasks
                 with relatively small inputs are examples of such
                 limited parallelism. This paper presents Pagoda, a
                 runtime system that virtualizes GPU resources, using an
                 OS-like daemon kernel called MasterKernel. Tasks are
                 spawned from the CPU onto Pagoda as they become
                 available, and are scheduled by the MasterKernel at the
                 warp granularity. Experimental results demonstrate that
                 Pagoda achieves a geometric mean speedup of 5.70x over
                 PThreads running on a 20-core CPU, 1.51x over
                 CUDA-HyperQ, and 1.69x over GeMTC, the state-of-
                 the-art runtime GPU task scheduling system.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PPoPP '17 conference proceedings.",
}

@Article{Adams:2018:TTV,
  author =       "Joel C. Adams and Patrick A. Crain and Christopher P.
                 Dilley and Christiaan D. Hazlett and Elizabeth R.
                 Koning and Serita M. Nelesen and Javin B. Unger and
                 Mark B. Vande Stel",
  title =        "{TSGL}: A tool for visualizing multithreaded
                 behavior",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "118 (part 1)",
  number =       "??",
  pages =        "233--246",
  month =        aug,
  year =         "2018",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1016/j.jpdc.2018.02.025",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Sat May 12 16:27:31 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://www.sciencedirect.com/science/article/pii/S0743731518301035",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{AlBarakat:2018:MFM,
  author =       "Laith M. AlBarakat and Paul V. Gratz and Daniel A.
                  Jimenez",
  title =        "{MTB-Fetch}: Multithreading Aware Hardware Prefetching
                 for Chip Multiprocessors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "175--178",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2847345",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "To fully exploit the scaling performance in Chip
                 Multiprocessors, applications must be divided into
                 semi-independent processes that can run concurrently on
                 multiple cores within a system. One major class of such
                 applications, shared-memory, multi-threaded
                 applications, requires programmers insert thread
                 synchronization primitives (i.e., locks, barriers, and
                 condition variables) in their critical sections to
                 synchronize data access between processes. For this
                 class of applications, scaling performance requires
                 balanced per-thread workloads with little time spent in
                 critical sections. In practice, however, threads often
                 waste significant time waiting to acquire
                 locks/barriers in their critical sections, leading to
                 thread imbalance and poor performance scaling.
                 Moreover, critical sections often stall data
                 prefetchers that mitigate the effects of long critical
                 section stalls by ensuring data is preloaded in the
                 core caches when the critical section is complete. In
                 this paper we examine a pure hardware technique to
                 enable safe data prefetching beyond synchronization
                 points in CMPs. We show that successful prefetching
                 beyond synchronization points requires overcoming two
                 significant challenges in existing prefetching
                 techniques. First, we find that typical data
                 prefetchers are designed to trigger prefetches based on
                  current misses. This approach works well for
                 traditional, continuously executing, single-threaded
                 applications. However, when a thread stalls on a
                 synchronization point, it typically does not produce
                 any new memory references to trigger a prefetcher.
                 Second, even in the event that a prefetch were to be
                 correctly directed to read beyond a synchronization
                 point, it will likely prefetch shared data from another
                 core before this data has been written. While this
                  prefetch would be considered ``accurate'' it is
                  highly undesirable, because such a prefetch would lead
                  to three extra ``ping-pong'' movements back and forth
                 between private caches in the producing and consuming
                 cores, incurring more latency and energy overhead than
                 without prefetching. We develop a new data prefetcher,
                 Multi-Thread B-Fetch (MTBFetch), built as an extension
                 to a previous single-threaded data prefetcher. MTBFetch
                 addresses both issues in prefetching for shared memory
                 multi-threaded workloads. MTB-Fetch achieves a speedup
                 of 9.3 percent for multi-threaded applications with
                 little additional hardware.",
  acknowledgement = ack-nhfb,
  affiliation =  "AlBarakat, LM (Reprint Author), Texas A\&M Univ, Dept
                 Elect \& Comp Engn, College Stn, TX 77843 USA.
                 AlBarakat, Laith M.; Gratz, Paul, V, Texas A\&M Univ,
                 Dept Elect \& Comp Engn, College Stn, TX 77843 USA.
                 Jimenez, Daniel A., Texas A\&M Univ, Dept Comp Sci \&
                 Engn, College Stn, TX 77843 USA.",
  author-email = "lalbarakat@tamu.edu pgratz@tamu.edu
                 djimenez@cse.tamu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation
                 [I/UCRC-1439722, CCF-1649242, CCF-1216604/1332598];
                 Intel Corp.",
  funding-text = "We thank the National Science Foundation, which
                 partially supported this work through grants
                 I/UCRC-1439722, CCF-1649242 and CCF-1216604/1332598 and
                 Intel Corp. for their generous support.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Chip multiprocessor; hardware prefetching;
                 multi-threading; shared memory",
  keywords-plus = "PROCESSORS",
  number-of-cited-references = "17",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "AlBarakat:2018:MFM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Amer:2018:LCM,
  author =       "Abdelhalim Amer and Huiwei Lu and Pavan Balaji and
                 Milind Chabbi and Yanjie Wei and Jeff Hammond and
                 Satoshi Matsuoka",
  title =        "Lock Contention Management in Multithreaded {MPI}",
  journal =      j-TOPC,
  volume =       "5",
  number =       "3",
  pages =        "12:1--12:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3275443",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3275443",
  abstract =     "In this article, we investigate contention management
                 in lock-based thread-safe MPI libraries. Specifically,
                 we make two assumptions: (1) locks are the only form of
                 synchronization when protecting communication paths;
                 and (2) contention occurs, and thus serialization is
                 unavoidable. Our work distinguishes between lock
                 acquisitions with respect to work being performed
                 inside a critical section; productive vs. unproductive.
                 Waiting for message reception without doing anything
                 else inside a critical section is an example of
                 unproductive lock acquisition. We show that the
                 high-throughput nature of modern scalable locking
                 protocols translates into better communication progress
                 for throughput-intensive MPI communication but
                 negatively impacts latency-sensitive communication
                 because of overzealous unproductive lock acquisition.
                 To reduce unproductive lock acquisitions, we devised a
                 method that promotes threads with productive work using
                 a generic two-level priority locking protocol. Our
                 results show that using a high-throughput protocol for
                 productive work and a fair protocol for less productive
                 code paths ensures the best tradeoff for fine-grained
                 communication, whereas a fair protocol is sufficient
                 for more coarse-grained communication. Although these
                 efforts have been rewarding, scalability degradation
                 remains significant. We discuss techniques that diverge
                 from the pure locking model and offer the potential to
                 further improve scalability.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Chen:2018:ESE,
  author =       "Kuan-Chung Chen and Chung-Ho Chen",
  title =        "Enabling {SIMT} Execution Model on Homogeneous
                 Multi-Core System",
  journal =      j-TACO,
  volume =       "15",
  number =       "1",
  pages =        "6:1--6:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3177960",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:19:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Single-instruction multiple-thread (SIMT) machine
                 emerges as a primary computing device in
                 high-performance computing, since the SIMT execution
                 paradigm can exploit data-level parallelism
                 effectively. This article explores the SIMT execution
                 potential on homogeneous multi-core processors, which
                 generally run in multiple-instruction multiple-data
                 (MIMD) mode when utilizing the multi-core resources. We
                 address three architecture issues in enabling SIMT
                 execution model on multi-core processor, including
                 multithreading execution model, kernel thread context
                 placement, and thread divergence. For the SIMT
                 execution model, we propose a fine-grained
                 multithreading mechanism on an ARM-based multi-core
                 system. Each of the processor cores stores the kernel
                 thread contexts in its L1 data cache for per-cycle
                 thread-switching requirement. For divergence-intensive
                 kernels, an Inner Conditional Statement First
                 (ICS-First) mechanism helps early re-convergence to
                 occur and significantly improves the performance. The
                 experiment results show that effectiveness in
                 data-parallel processing reduces on average 36\%
                 dynamic instructions, and boosts the SIMT executions to
                 achieve on average 1.52$ \times $ and up to 5$ \times $
                 speedups over the MIMD counterpart for OpenCL
                 benchmarks for single issue in-order processor cores.
                 By using the explicit vectorization optimization on the
                 kernels, the SIMT model gains further benefits from the
                 SIMD extension and achieves 1.71$ \times $ speedup over
                 the MIMD approach. The SIMT model using in-order
                 superscalar processor cores outperforms the MIMD model
                 that uses superscalar out-of-order processor cores by
                 40\%. The results show that, to exploit data-level
                 parallelism, enabling the SIMT model on homogeneous
                 multi-core processors is important.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Chen:2018:ROM,
  author =       "Kuan-Hsun Chen and Georg von der Br{\"u}ggen and
                 Jian-Jia Chen",
  title =        "Reliability Optimization on Multi-Core Systems with
                 Multi-Tasking and Redundant Multi-Threading",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "67",
  number =       "4",
  pages =        "484--497",
  month =        apr,
  year =         "2018",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2017.2769044",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Thu Mar 15 08:52:31 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/document/8094023/",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Chin:2018:EAN,
  author =       "Wei-Sheng Chin and Bo-Wen Yuan and Meng-Yuan Yang and
                 Chih-Jen Lin",
  title =        "An Efficient Alternating {Newton} Method for Learning
                 Factorization Machines",
  journal =      j-TIST,
  volume =       "9",
  number =       "6",
  pages =        "72:1--72:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3230710",
  ISSN =         "2157-6904 (print), 2157-6912 (electronic)",
  ISSN-L =       "2157-6904",
  bibdate =      "Thu Nov 15 16:23:08 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tist.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3230710",
  abstract =     "To date, factorization machines (FMs) have emerged as
                 a powerful model in many applications. In this work, we
                 study the training of FM with the logistic loss for
                 binary classification, which is a nonlinear extension
                 of the linear model with the logistic loss (i.e.,
                 logistic regression). For the training of large-scale
                 logistic regression, Newton methods have been shown to
                 be an effective approach, but it is difficult to apply
                 such methods to FM because of the nonconvexity. We
                 consider a modification of FM that is multiblock convex
                 and propose an alternating minimization algorithm based
                 on Newton methods. Some novel optimization techniques
                 are introduced to reduce the running time. Our
                 experiments demonstrate that the proposed algorithm is
                 more efficient than stochastic gradient algorithms and
                 coordinate descent methods. The parallelism of our
                 method is also investigated for the acceleration in
                 multithreading environments.",
  acknowledgement = ack-nhfb,
  articleno =    "72",
  fjournal =     "ACM Transactions on Intelligent Systems and Technology
                 (TIST)",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1318",
}

@Article{Deiana:2018:UPN,
  author =       "Enrico A. Deiana and Vincent St-Amour and Peter A.
                 Dinda and Nikos Hardavellas and Simone Campanoni",
  title =        "Unconventional Parallelization of Nondeterministic
                 Applications",
  journal =      j-SIGPLAN,
  volume =       "53",
  number =       "2",
  pages =        "432--447",
  month =        feb,
  year =         "2018",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3296957.3173181",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Oct 16 14:12:56 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "The demand for thread-level-parallelism (TLP) on
                 commodity processors is endless as it is essential for
                 gaining performance and saving energy. However, TLP in
                 today's programs is limited by dependences that must be
                 satisfied at run time. We have found that for
                 nondeterministic programs, some of these actual
                 dependences can be satisfied with alternative data that
                 can be generated in parallel, thus boosting the
                 program's TLP. Satisfying these dependences with
                 alternative data nonetheless produces final outputs
                 that match those of the original nondeterministic
                 program. To demonstrate the practicality of our
                 technique, we describe the design, implementation, and
                 evaluation of our compilers, autotuner, profiler, and
                 runtime, which are enabled by our proposed C++
                 programming language extensions. The resulting system
                 boosts the performance of six well-known
                 nondeterministic and multi-threaded benchmarks by
                 158.2\% (geometric mean) on a 28-core Intel-based
                 platform.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ASPLOS '18 proceedings.",
}

@Article{DeLozier:2018:SSO,
  author =       "Christian DeLozier and Ariel Eizenberg and Brandon
                 Lucia and Joseph Devietti",
  title =        "{SOFRITAS}: Serializable Ordering-Free Regions for
                 Increasing Thread Atomicity Scalably",
  journal =      j-SIGPLAN,
  volume =       "53",
  number =       "2",
  pages =        "286--300",
  month =        feb,
  year =         "2018",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3296957.3173192",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Oct 16 14:12:56 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Correctly synchronizing multithreaded programs is
                 challenging and errors can lead to program failures
                 such as atomicity violations. Existing strong memory
                 consistency models rule out some possible failures, but
                 are limited by depending on programmer-defined locking
                 code. We present the new Ordering-Free Region (OFR)
                 serializability consistency model that ensures
                 atomicity for OFRs, which are spans of dynamic
                 instructions between consecutive ordering constructs
                 (e.g., barriers), without breaking atomicity at lock
                 operations. Our platform, Serializable Ordering-Free
                 Regions for Increasing Thread Atomicity Scalably
                 (SOFRITAS), ensures a C/C++ program's execution is
                 equivalent to a serialization of OFRs by default. We
                 build two systems that realize the SOFRITAS idea: a
                 concurrency bug finding tool for testing called
                 SOFRITEST, and a production runtime system called
                 SOPRO. SOFRITEST uses OFRs to find concurrency bugs,
                 including a multi-critical-section atomicity violation
                 in memcached that weaker consistency models will miss.
                 If OFR's are too coarse-grained, SOFRITEST suggests
                 refinement annotations automatically. Our software-only
                 SOPRO implementation has high performance, scales well
                 with increased parallelism, and prevents failures
                 despite bugs in locking code. SOFRITAS has an average
                 overhead of just 1.59x on a single-threaded execution
                 and 1.51x on sixteen threads, despite pthreads' much
                 weaker memory model.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ASPLOS '18 proceedings.",
}

@Article{DePestel:2018:RRP,
  author =       "Sander {De Pestel} and Sam {Van den Steen} and Shoaib
                 Akram and Lieven Eeckhout",
  title =        "{RPPM}: Rapid Performance Prediction of Multithreaded
                 Applications on Multicore Hardware",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "183--186",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2849983",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper proposes RPPM which, based on a
                 microarchitecture-independent profile of a
                 multithreaded application, predicts its performance on
                 a previously unseen multicore platform. RPPM breaks up
                 multithreaded program execution into epochs based on
                 synchronization primitives, and then predicts per-epoch
                 active execution times for each thread and
                 synchronization overhead to arrive at a prediction for
                 overall application performance. RPPM predicts
                 performance within 12 percent on average (27 percent
                 max error) compared to cycle-level simulation. We
                 present a case study to illustrate that RPPM can be
                 used for making accurate multicore design trade-offs
                 early in the design cycle.",
  acknowledgement = ack-nhfb,
  affiliation =  "De Pestel, S (Reprint Author), Univ Ghent, B-9000
                 Ghent, Belgium. De Pestel, Sander; Van den Steen, Sam;
                 Akram, Shoaib; Eeckhout, Lieven, Univ Ghent, B-9000
                 Ghent, Belgium.",
  author-email = "sander.depestel@ugent.be sam.vandensteen@ugent.be
                 shoaib.akram@ugent.be lieven.eeckhout@ugent.be",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Agency for Innovation by Science and
                 Technology in Flanders (IWT); European Research Council
                 (ERC) [741097]",
  funding-text = "Sander De Pestel is supported through a doctoral
                 fellowship by the Agency for Innovation by Science and
                 Technology in Flanders (IWT). Additional support is
                 provided through the European Research Council (ERC)
                 Advanced Grant agreement no. 741097.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "micro-architecture; Modeling; multi-threaded;
                 performance",
  number-of-cited-references = "12",
  ORCID-numbers = "Van den Steen, Sam/0000-0003-3630-2214",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Pestel:2018:RRP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Deveci:2018:MSM,
  author =       "Mehmet Deveci and Christian Trott and Sivasankaran
                 Rajamanickam",
  title =        "Multithreaded sparse matrix--matrix multiplication for
                 many-core and {GPU} architectures",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "78",
  number =       "??",
  pages =        "33--46",
  month =        oct,
  year =         "2018",
  CODEN =        "PACOEJ",
  DOI =          "https://doi.org/10.1016/j.parco.2018.06.009",
  ISSN =         "0167-8191 (print), 1872-7336 (electronic)",
  ISSN-L =       "0167-8191",
  bibdate =      "Mon Jan 7 15:25:20 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167819118301923",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/01678191",
}

@Article{Ding:2018:IOC,
  author =       "Bailu Ding and Lucja Kot and Johannes Gehrke",
  title =        "Improving optimistic concurrency control through
                 transaction batching and operation reordering",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "12",
  number =       "2",
  pages =        "169--182",
  month =        oct,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3282495.3282502",
  ISSN =         "2150-8097",
  bibdate =      "Wed Jan 2 18:29:48 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "OLTP systems can often improve throughput by batching
                 transactions and processing them as a group. Batching
                 has been used for optimizations such as message packing
                 and group commits; however, there is little research on
                 the benefits of a holistic approach to batching across
                 a transaction's entire life cycle. In this paper, we
                 present a framework to incorporate batching at multiple
                 stages of transaction execution for OLTP systems based
                 on optimistic concurrency control. Storage batching
                 enables reordering of transaction reads and writes at
                 the storage layer, reducing conflicts on the same
                 object. Validator batching enables reordering of
                 transactions before validation, reducing conflicts
                 between transactions. Dependencies between transactions
                 make transaction reordering a non-trivial problem, and
                 we propose several efficient and practical algorithms
                 that can be customized to various transaction
                 precedence policies such as reducing tail latency. We
                 also show how to reorder transactions with a
                 thread-aware policy in multi-threaded OLTP architecture
                 without a centralized validator. In-depth experiments
                 on a research prototype, an open-source OLTP system,
                 and a production OLTP system show that our techniques
                 increase transaction throughput by up to 2.2x and
                 reduce their tail latency by up to 71\% compared with
                 the state-of-the-art systems on workloads with high
                 data contention.",
  acknowledgement = ack-nhfb,
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Drechsler:2018:TSR,
  author =       "Joscha Drechsler and Ragnar Mogk and Guido Salvaneschi
                 and Mira Mezini",
  title =        "Thread-safe reactive programming",
  journal =      j-PACMPL,
  volume =       "2",
  number =       "OOPSLA",
  pages =        "107:1--107:30",
  month =        oct,
  year =         "2018",
  DOI =          "https://doi.org/10.1145/3276477",
  bibdate =      "Sat Aug 8 07:56:30 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pacmpl.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3276477",
  abstract =     "The execution of an application written in a reactive
                 language involves transfer of data and control flow
                 between imperative and reactive abstractions at
                 well-defined points. In a multi-threaded environment,
                 multiple such interactions may execute \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "107",
  fjournal =     "Proceedings of the ACM on Programming Languages",
  journal-URL =  "https://pacmpl.acm.org/",
}

@Article{Fix:2018:HMT,
  author =       "Jordan Fix and Nayana P. Nagendra and Sotiris
                 Apostolakis and Hansen Zhang and Sophie Qiu and David
                 I. August",
  title =        "Hardware Multithreaded Transactions",
  journal =      j-SIGPLAN,
  volume =       "53",
  number =       "2",
  pages =        "15--29",
  month =        feb,
  year =         "2018",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3296957.3173172",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Oct 16 14:12:56 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Speculation with transactional memory systems helps
                 programmers and compilers produce profitable
                 thread-level parallel programs. Prior work shows that
                 supporting transactions that can span multiple threads,
                 rather than requiring transactions be contained within
                 a single thread, enables new types of speculative
                 parallelization techniques for both programmers and
                 parallelizing compilers. Unfortunately, software
                 support for multi-threaded transactions (MTXs) comes
                 with significant additional inter-thread communication
                 overhead for speculation validation. This overhead can
                 make otherwise good parallelization unprofitable for
                 programs with sizeable read and write sets. Some
                 programs using these prior software MTXs overcame this
                 problem through significant efforts by expert
                 programmers to minimize these sets and optimize
                 communication, capabilities which compiler technology
                 has been unable to equivalently achieve. Instead, this
                 paper makes speculative parallelization less laborious
                 and more feasible through low-overhead speculation
                 validation, presenting the first complete design,
                 implementation, and evaluation of hardware MTXs. Even
                 with maximal speculation validation of every load and
                 store inside transactions of tens to hundreds of
                 millions of instructions, profitable parallelization of
                 complex programs can be achieved. Across 8 benchmarks,
                 this system achieves a geomean speedup of 99\% over
                 sequential execution on a multicore machine with 4
                 cores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "ASPLOS '18 proceedings.",
}

@Article{Forsell:2018:RMM,
  author =       "Martti Forsell and Jussi Roivainen and Ville
                 Lepp{\"a}nen",
  title =        "{REPLICA MBTAC}: multithreaded dual-mode processor",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "74",
  number =       "5",
  pages =        "1911--1933",
  month =        may,
  year =         "2018",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-017-2199-z",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Thu Oct 10 15:31:12 MDT 2019",
  bibsource =    "http://link.springer.com/journal/11227/74/5;
                 https://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Gerbessiotis:2018:SIS,
  author =       "Alexandros V. Gerbessiotis",
  title =        "A Study of Integer Sorting on Multicores",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "28",
  number =       "04",
  pages =        "??--??",
  month =        dec,
  year =         "2018",
  DOI =          "https://doi.org/10.1142/S0129626418500147",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Mon Mar 29 12:30:05 MDT 2021",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib",
  URL =          "https://www.worldscientific.com/doi/10.1142/S0129626418500147",
  abstract =     "Integer sorting on multicores and GPUs can be realized
                 by a variety of approaches that include variants of
                 distribution-based methods such as radix-sort,
                 comparison-oriented algorithms such as deterministic
                 regular sampling and random sampling parallel sorting,
                 and network-based algorithms such as Batcher's bitonic
                 sorting algorithm. In this work we present an
                 experimental study of integer sorting on multicore
                 processors. We have implemented serial and parallel
                 radix-sort for various radixes, deterministic regular
                 oversampling, and random oversampling parallel sorting,
                 including new variants of ours, and also some
                 previously little explored or unexplored variants of
                 bitonic-sort and odd-even transposition sort. The study
                 uses multithreading and multiprocessing parallel
                 programming libraries with the same C language code
                 working under Open MPI, MulticoreBSP, and BSPlib. We
                 first provide some general high-level observations on
                 the performance of these implementations. If we can
                 conclude anything is that accurate prediction of
                 performance by taking into consideration architecture
                 dependent features such as the structure and
                 characteristics of multiple memory hierarchies is
                 difficult and more often than not untenable. To some
                 degree this is affected by the overhead imposed by the
                 high-level library used in the programming effort.
                 Another objective is to model the performance of these
                 algorithms and their implementations under the MBSP
                 (Multi-memory BSP) model. Despite the limitations
                 mentioned above, we can still draw some reliable
                 conclusions and reason about the performance of these
                 implementations using the MBSP model, thus making MBSP
                 useful and usable.",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Gu:2018:CCA,
  author =       "Ronghui Gu and Zhong Shao and Jieung Kim and Xiongnan
                 (Newman) Wu and J{\'e}r{\'e}mie Koenig and Vilhelm
                 Sj{\"o}berg and Hao Chen and David Costanzo and Tahina
                 Ramananandro",
  title =        "Certified concurrent abstraction layers",
  journal =      j-SIGPLAN,
  volume =       "53",
  number =       "4",
  pages =        "646--661",
  month =        apr,
  year =         "2018",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3296979.3192381",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Oct 16 14:12:57 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Concurrent abstraction layers are ubiquitous in modern
                 computer systems because of the pervasiveness of
                 multithreaded programming and multicore hardware.
                 Abstraction layers are used to hide the implementation
                 details (e.g., fine-grained synchronization) and reduce
                 the complex dependencies among components at different
                 levels of abstraction. Despite their obvious
                 importance, concurrent abstraction layers have not been
                 treated formally. This severely limits the
                 applicability of layer-based techniques and makes it
                 difficult to scale verification across multiple
                 concurrent layers. In this paper, we present CCAL---a
                 fully mechanized programming toolkit developed under
                 the CertiKOS project---for specifying, composing,
                 compiling, and linking certified concurrent abstraction
                 layers. CCAL consists of three technical novelties: a
                 new game-theoretical, strategy-based compositional
                 semantic model for concurrency (and its associated
                 program verifiers), a set of formal linking theorems
                 for composing multithreaded and multicore concurrent
                 layers, and a new CompCertX compiler that supports
                 certified thread-safe compilation and linking. The CCAL
                 toolkit is implemented in Coq and supports layered
                 concurrent programming in both C and assembly. It has
                 been successfully applied to build a fully certified
                 concurrent OS kernel with fine-grained locking.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '18 proceedings.",
}

@Article{Hukerikar:2018:RIA,
  author =       "Saurabh Hukerikar and Keita Teranishi and Pedro C.
                 Diniz and Robert F. Lucas",
  title =        "{RedThreads}: An Interface for Application-Level Fault
                 Detection\slash Correction Through Adaptive Redundant
                 Multithreading",
  journal =      j-INT-J-PARALLEL-PROG,
  volume =       "46",
  number =       "2",
  pages =        "225--251",
  month =        apr,
  year =         "2018",
  CODEN =        "IJPPE5",
  DOI =          "https://doi.org/10.1007/s10766-017-0492-3",
  ISSN =         "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L =       "0885-7458",
  bibdate =      "Fri Oct 11 08:37:50 MDT 2019",
  bibsource =    "http://link.springer.com/journal/10766/46/2;
                 https://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Parallel Programming",
  journal-URL =  "http://link.springer.com/journal/10766",
}

@Article{Iliakis:2018:DMS,
  author =       "Konstantinos Iliakis and Sotirios Xydis and Dimitrios
                 Soudris",
  title =        "Decoupled {MapReduce} for Shared-Memory Multi-Core
                 Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "143--146",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2827929",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Modern multi-core processors exhibit high integration
                 densities, e.g., up to several tens of cores. Multiple
                 programming frameworks have emerged to facilitate the
                 development of highly parallel applications. The
                 MapReduce programming model, after having demonstrated
                 its usability in the area of distributed computing
                 systems, has been adapted to the needs of shared-memory
                 multi-processors showing promising results in
                 comparison with conventional multi-threaded libraries,
                 e.g., pthreads. In this paper we enhance the
                 traditional MapReduce architecture by decoupling the
                 map and combine phases in order to boost parallel
                 execution. We show that combiners' memory intensive
                 features limit the system's degree of parallelism, thus
                 resulting in sub-optimal hardware utilization, leaving
                 space for further performance improvements. The
                 proposed decoupled MapReduce architecture is evaluated
                 into a NUMA server platform, showing that the adoption
                 of the De-MapR runtime enables more efficient hardware
                 utilization and competent run-time improvements. We
                 demonstrate that the proposed solution achieves
                 execution speedups of up to 2.46x compared to a
                 state-of-the-art, shared-memory MapReduce library.",
  acknowledgement = ack-nhfb,
  affiliation =  "Iliakis, K (Reprint Author), Natl Tech Univ Athens,
                 Zografos 15780, Greece. Iliakis, Konstantinos; Xydis,
                 Sotirios; Soudris, Dimitrios, Natl Tech Univ Athens,
                 Zografos 15780, Greece.",
  author-email = "konstantinos.iliakis@cern.ch sxydis@microlab.ntua.gr
                 dsoudris@microlab.ntua.gr",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "MapReduce; multi-cores; runtime systems",
  number-of-cited-references = "13",
  ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847",
  research-areas = "Computer Science",
  researcherid-numbers = "Soudris, Dimitrios/O-8843-2019",
  times-cited =  "0",
  unique-id =    "Iliakis:2018:DMS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Jacobs:2018:MTV,
  author =       "Bart Jacobs and Dragan Bosnacki and Ruurd Kuiper",
  title =        "Modular Termination Verification of Single-Threaded
                 and Multithreaded Programs",
  journal =      j-TOPLAS,
  volume =       "40",
  number =       "3",
  pages =        "12:1--12:??",
  month =        aug,
  year =         "2018",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/3210258",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Thu Oct 18 12:01:50 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toplas.bib",
  abstract =     "We propose an approach for the modular specification
                 and verification of total correctness properties of
                 object-oriented programs. The core of our approach is a
                 specification style that prescribes a way to assign a
                 level expression to each method such that each callee's
                 level is below the caller's, even in the presence of
                 dynamic binding. The specification style yields
                 specifications that properly hide implementation
                 details. The main idea is to use multisets of method
                 names as levels, and to associate with each object
                 levels that abstractly reflect the way the object is
                 built from other objects. A method's level is then
                 defined in terms of the method's own name and the
                 levels associated with the objects passed as arguments.
                 We first present the specification style in the context
                 of programs that do not modify object fields. We then
                 combine it with separation logic and abstract predicate
                 families to obtain an approach for programs with heap
                 mutation. In a third step, we address concurrency, by
                 incorporating an existing approach for verifying
                 deadlock freedom of channels and locks. Our main
                 contribution here is to achieve information hiding by
                 using the proposed termination levels for lock ordering
                 as well. Also, we introduce call permissions to enable
                 elegant verification of termination of programs where
                 threads cause work in other threads, such as in thread
                 pools or fine-grained concurrent algorithms involving
                 compare-and-swap loops. We explain how our approach can
                 be used also to verify the liveness of nonterminating
                 programs.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

@Article{Kahkonen:2018:TPC,
  author =       "Kari K{\"a}hk{\"o}nen and Keijo Heljanko",
  title =        "Testing Programs with Contextual Unfoldings",
  journal =      j-TECS,
  volume =       "17",
  number =       "1",
  pages =        "23:1--23:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2810000",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Oct 17 18:16:34 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tecs.bib",
  abstract =     "In this article, we present a new algorithm that
                 combines contextual unfoldings and dynamic symbolic
                 execution to systematically test multithreaded
                 programs. The approach uses symbolic execution to limit
                 the number of input values and unfoldings to thus limit
                 the number of thread interleavings that are needed to
                 cover reachable local states of threads in the program
                 under test. We show that the use of contextual
                 unfoldings allows interleavings of threads to be
                 succinctly represented. This can in some cases lead to
                 a substantial reduction in the number of needed test
                 executions when compared to previous approaches.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
}

@Article{Kislal:2018:ECC,
  author =       "Orhan Kislal and Jagadish Kotra and Xulong Tang and
                 Mahmut Taylan Kandemir and Myoungsoo Jung",
  title =        "Enhancing computation-to-core assignment with physical
                 location information",
  journal =      j-SIGPLAN,
  volume =       "53",
  number =       "4",
  pages =        "312--327",
  month =        apr,
  year =         "2018",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3296979.3192386",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Oct 16 14:12:57 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Going beyond a certain number of cores in modern
                 architectures requires an on-chip network more scalable
                 than conventional buses. However, employing an on-chip
                 network in a manycore system (to improve scalability)
                 makes the latencies of the data accesses issued by a
                 core non-uniform. This non-uniformity can play a
                 significant role in shaping the overall application
                 performance. This work presents a novel compiler
                 strategy which involves exposing architecture
                 information to the compiler to enable an optimized
                 computation-to-core mapping. Specifically, we propose a
                 compiler-guided scheme that takes into account the
                 relative positions of (and distances between) cores,
                 last-level caches (LLCs) and memory controllers (MCs)
                 in a manycore system, and generates a mapping of
                 computations to cores with the goal of minimizing the
                 on-chip network traffic. The experimental data
                 collected using a set of 21 multi-threaded applications
                 reveal that, on an average, our approach reduces the
                 on-chip network latency in a 6$ \times $6 manycore
                 system by 38.4\% in the case of private LLCs, and
                 43.8\% in the case of shared LLCs. These improvements
                 translate to the corresponding execution time
                 improvements of 10.9\% and 12.7\% for the private LLC
                 and shared LLC based systems, respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '18 proceedings.",
}

@Article{Kondguli:2018:BUS,
  author =       "Sushant Kondguli and Michael Huang",
  title =        "{Bootstrapping}: Using {SMT} Hardware to Improve
                 Single-Thread Performance",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "205--208",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2859945",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Decoupled look-ahead (DLA) architectures have been
                 shown to be an effective way to improve single-thread
                 performance. However, a default implementation requires
                 an additional core. While an SMT flavor is possible, a
                 naive implementation is inefficient and thus slow. In
                 this paper, we propose an optimized implementation
                 called Bootstrapping that makes DLA just as effective
                 on a single (SMT) core as using two cores. While fusing
                 two cores can improve single-thread performance by
                  1.23x, Bootstrapping provides a speedup of 1.51x.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kondguli, S (Reprint Author), Univ Rochester, Dept
                 Elect \& Comp Engn, Rochester, NY 14627 USA. Kondguli,
                 Sushant; Huang, Michael, Univ Rochester, Dept Elect \&
                 Comp Engn, Rochester, NY 14627 USA.",
  author-email = "sushant.kondguli@rochester.edu
                 michael.huang@rochester.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HA2CO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [1514433, 1533842]",
  funding-text = "This work is supported in part by NSF under grants
                 1514433 and 1533842.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Decoupled look-ahead (DLA) architectures; simultaneous
                 multi-threading (SMT); single thread performance",
  number-of-cited-references = "20",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Kondguli:2018:BUS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Lee:2018:ERD,
  author =       "I-Ting Angelina Lee and Tao B. Schardl",
  title =        "Efficient Race Detection for Reducer Hyperobjects",
  journal =      j-TOPC,
  volume =       "4",
  number =       "4",
  pages =        "20:1--20:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3205914",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "A multithreaded Cilk program that is ostensibly
                 deterministic may nevertheless behave
                 nondeterministically due to programming errors in the
                  code. For a Cilk program that uses reducers---a general
                  reduction mechanism supported in various Cilk
                  dialects---such programming errors are especially
                 challenging to debug, because the errors can expose the
                 nondeterminism in how the Cilk runtime system manages
                 reducers. We identify two unique types of races that
                 arise from incorrect use of reducers in a Cilk program,
                 and we present two algorithms to catch these races. The
                 first algorithm, called the Peer-Set algorithm, detects
                 view-read races, which occur when the program attempts
                 to retrieve a value out of a reducer when the read may
                 result in a nondeterministic value, such as before all
                 previously spawned subcomputations that might update
                 the reducer have necessarily returned. The second
                 algorithm, called the SP+ algorithm, detects
                  determinacy races---instances where a write to a memory
                  location occurs logically in parallel with another
                  access to that location---even when the raced-on memory
                 locations relate to reducers. Both algorithms are
                 provably correct, asymptotically efficient, and can be
                 implemented efficiently in practice. We have
                 implemented both algorithms in our prototype race
                 detector, Rader. When running Peer-Set, Rader incurs a
                 geometric-mean multiplicative overhead of 2.56 over
                 running the benchmark without instrumentation. When
                 running SP+, Rader incurs a geometric-mean
                 multiplicative overhead of 16.94.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Liu:2018:ISI,
  author =       "Hongyu Liu and Sam Silvestro and Wei Wang and Chen
                 Tian and Tongping Liu",
  title =        "{iReplayer}: in-situ and identical record-and-replay
                 for multithreaded applications",
  journal =      j-SIGPLAN,
  volume =       "53",
  number =       "4",
  pages =        "344--358",
  month =        apr,
  year =         "2018",
  CODEN =        "SINODQ",
  DOI =          "https://doi.org/10.1145/3296979.3192380",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Oct 16 14:12:57 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib",
  abstract =     "Reproducing executions of multithreaded programs is
                 very challenging due to many intrinsic and external
                 non-deterministic factors. Existing RnR systems achieve
                 significant progress in terms of performance overhead,
                 but none targets the in-situ setting, in which replay
                 occurs within the same process as the recording
                 process. Also, most existing work cannot achieve
                 identical replay, which may prevent the reproduction of
                 some errors. This paper presents iReplayer, which aims
                 to identically replay multithreaded programs in the
                 original process (under the ``in-situ'' setting). The
                 novel in-situ and identical replay of iReplayer makes
                 it more likely to reproduce errors, and allows it to
                 directly employ debugging mechanisms (e.g. watchpoints)
                 to aid failure diagnosis. Currently, iReplayer only
                 incurs 3\% performance overhead on average, which
                 allows it to be always enabled in the production
                 environment. iReplayer enables a range of
                 possibilities, and this paper presents three examples:
                 two automatic tools for detecting buffer overflows and
                 use-after-free bugs, and one interactive debugging tool
                 that is integrated with GDB.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J706",
  remark =       "PLDI '18 proceedings.",
}

@Article{Lochbihler:2018:MTS,
  author =       "Andreas Lochbihler",
  title =        "Mechanising a Type-Safe Model of Multithreaded {Java}
                 with a Verified Compiler",
  journal =      j-J-AUTOM-REASON,
  volume =       "61",
  number =       "1--4",
  pages =        "243--332",
  month =        jun,
  year =         "2018",
  CODEN =        "JAREEW",
  DOI =          "https://doi.org/10.1007/s10817-018-9452-x",
  ISSN =         "0168-7433 (print), 1573-0670 (electronic)",
  ISSN-L =       "0168-7433",
  bibdate =      "Sat Aug 4 07:51:41 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jautomreason.bib;
                 https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer.com/article/10.1007/s10817-018-9452-x",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Automated Reasoning",
  journal-URL =  "http://link.springer.com/journal/10817",
}

@Article{Maabreh:2018:MHT,
  author =       "Majdi Maabreh and Hafez Irshid and Ajay Gupta and
                 Izzat Alasmadi",
  title =        "A multithreading and hashing technique for indexing
                 {Target--Decoy} peptides databases",
  journal =      j-CCPE,
  volume =       "30",
  number =       "9",
  pages =        "??--??",
  day =          "10",
  month =        may,
  year =         "2018",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.4371",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat Aug 4 10:03:13 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://onlinelibrary.wiley.com/doi/abs/10.1002/cpe.4371",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
}

@InProceedings{Malakhov:2018:CMT,
  author =       "Anton Malakhov and David Liu and Anton Gorshkov and
                 Terry Wilmarth",
  editor =       "Fatih Akici and David Lippa and Dillon Niederhut and M
                 Pacer",
  booktitle =    "Proceedings of the {17th Python in Science Conference,
                 Austin, TX, 9--15 July 2018}",
  title =        "Composable Multi-Threading and Multi-Processing for
                 Numeric Libraries",
  publisher =    "????",
  address =      "????",
  pages =        "15--21",
  year =         "2018",
  bibdate =      "Wed Aug 1 09:03:36 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/python.bib",
  URL =          "http://conference.scipy.org/proceedings/scipy2018/anton_malakhov.html",
  abstract =     "Python is popular among scientific communities that
                 value its simplicity and power, especially as it comes
                 along with numeric libraries such as NumPy, SciPy,
                 Dask, and Numba. As CPU core counts keep increasing,
                 these modules can make use of many cores via
                 multi-threading for efficient multi-core parallelism.
                 However, threads can interfere with each other leading
                 to overhead and inefficiency if used together in a
                 single application on machines with a large number of
                 cores. This performance loss can be prevented if all
                 multi-threaded modules are coordinated. This paper
                 continues the work started in AMala16 by introducing
                 more approaches to coordination for both
                 multi-threading and multi-processing cases. In
                 particular, we investigate the use of static settings,
                 limiting the number of simultaneously active OpenMP
                 parallel regions, and optional parallelism with Intel
                 Threading Building Blocks (Intel TBB). We will show how
                 these approaches help to unlock additional performance
                 for numeric applications on multi-core systems.",
  acknowledgement = ack-nhfb,
  keywords =     "Dask; GIL; Joblib; Multi-core; Multi-processing;
                 Multi-threading; Nested Parallelism; NumPy; OpenMP;
                 Oversubscription; Parallel Computations; Python; SciPy;
                 TBB",
}

@Article{Muller:2018:CPG,
  author =       "Stefan K. Muller and Umut A. Acar and Robert Harper",
  title =        "Competitive parallelism: getting your priorities
                 right",
  journal =      j-PACMPL,
  volume =       "2",
  number =       "ICFP",
  pages =        "95:1--95:30",
  month =        jul,
  year =         "2018",
  DOI =          "https://doi.org/10.1145/3236790",
  bibdate =      "Fri Aug 7 17:44:42 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pacmpl.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3236790",
  abstract =     "Multi-threaded programs have traditionally fallen into
                 one of two domains: cooperative and competitive. These
                 two domains have traditionally remained mostly
                 disjoint, with cooperative threading used for
                 increasing throughput in compute-intensive \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "95",
  fjournal =     "Proceedings of the ACM on Programming Languages",
  journal-URL =  "https://pacmpl.acm.org/",
}

@Article{Pham:2018:TSM,
  author =       "Binh Pham and Derek Hower and Abhishek Bhattacharjee
                 and Trey Cain",
  title =        "{TLB} Shootdown Mitigation for Low-Power Many-Core
                 Servers with {L1} Virtual Caches",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2712140",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Power efficiency has become one of the most important
                 design constraints for high-performance systems. In
                 this paper, we revisit the design of low-power
                 virtually-addressed caches. While virtually-addressed
                 caches enable significant power savings by obviating
                 the need for Translation Lookaside Buffer (TLB)
                 lookups, they suffer from several challenging design
                 issues that curtail their widespread commercial
                  adoption. We focus on one of these challenges---cache
                 flushes due to virtual page remappings. We use detailed
                 studies on an ARM many-core server to show that this
                 problem degrades performance by up to 25 percent for a
                 mix of multi-programmed and multi-threaded workloads.
                 Interestingly, we observe that many of these flushes
                 are spurious, and caused by an indiscriminate
                 invalidation broadcast on ARM architecture. In
                 response, we propose a low-overhead and readily
                 implementable hardware mechanism using bloom filters to
                 reduce spurious invalidations and mitigate their ill
                 effects.",
  acknowledgement = ack-nhfb,
  affiliation =  "Pham, B (Reprint Author), Rutgers State Univ, Dept
                 Comp Sci, Piscataway, NJ 08854 USA. Binh Pham;
                 Bhattacharjee, Abhishek, Rutgers State Univ, Dept Comp
                 Sci, Piscataway, NJ 08854 USA. Hower, Derek, Qualcomm
                 Technol Inc, Piscataway, NJ 08854 USA. Cain, Trey,
                 Qualcomm Datactr Technol Inc, Piscataway, NJ 08854
                 USA.",
  author-email = "binhpham@rutgers.edu dhower@qti.qualcomm.com
                 abhib@rutgers.edu tcain@qti.qualcomm.com",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "multicores; multiprogramming; multithreading; TLB;
                 Virtual Cache; virtual memory",
  number-of-cited-references = "21",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Pham:2018:TSM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Polap:2018:MTL,
  author =       "Dawid Polap and Marcin Wo{\'z}niak and Wei Wei and
                 Robertas Damasevicius",
  title =        "Multi-threaded learning control mechanism for neural
                 networks",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "87",
  number =       "??",
  pages =        "16--34",
  month =        oct,
  year =         "2018",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Tue Jun 26 08:47:57 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://www.sciencedirect.com/science/article/pii/S0167739X18300931",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
}

@Article{Roberts:2018:MID,
  author =       "Malcolm Roberts and John C. Bowman",
  title =        "Multithreaded implicitly dealiased convolutions",
  journal =      j-J-COMPUT-PHYS,
  volume =       "356",
  number =       "??",
  pages =        "98--114",
  day =          "1",
  month =        mar,
  year =         "2018",
  CODEN =        "JCTPAH",
  ISSN =         "0021-9991 (print), 1090-2716 (electronic)",
  ISSN-L =       "0021-9991",
  bibdate =      "Sat Jan 13 12:33:11 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jcomputphys2015.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0021999117308641",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computational Physics",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00219991",
}

@Article{Sahin:2018:CSC,
  author =       "Semih Sahin and Bugra Gedik",
  title =        "{C-Stream}: a Co-routine-Based Elastic Stream
                 Processing Engine",
  journal =      j-TOPC,
  volume =       "4",
  number =       "3",
  pages =        "15:1--15:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3184120",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Stream processing is a computational paradigm for
                 on-the-fly processing of live data. This paradigm lends
                 itself to implementations that can provide high
                 throughput and low latency by taking advantage of
                 various forms of parallelism that are naturally
                 captured by the stream processing model of computation,
                 such as pipeline, task, and data parallelism. In this
                 article, we describe the design and implementation of
                 C-Stream, which is an elastic stream processing engine.
                 C-Stream encompasses three unique properties. First, in
                 contrast to the widely adopted event-based interface
                 for developing streaming operators, C-Stream provides
                 an interface wherein each operator has its own driver
                 loop and relies on data availability application
                 programming interfaces (APIs) to decide when to perform
                 its computations. This self-control-based model
                 significantly simplifies the development of operators
                 that require multiport synchronization. Second,
                 C-Stream contains a dynamic scheduler that manages the
                 multithreaded execution of the operators. The
                 scheduler, which is customizable via plug-ins, enables
                 the execution of the operators as co-routines, using
                 any number of threads. The base scheduler implements
                 back-pressure, provides data availability APIs, and
                 manages preemption and termination handling. Last,
                 C-Stream varies the degree of parallelism to resolve
                 bottlenecks by both dynamically changing the number of
                 threads used to execute an application and adjusting
                 the number of replicas of data-parallel operators. We
                 provide an experimental evaluation of C-Stream. The
                 results show that C-Stream is scalable, highly
                 customizable, and can resolve bottlenecks by
                 dynamically adjusting the level of data parallelism
                 used.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Sangaiah:2018:SSA,
  author =       "Karthik Sangaiah and Michael Lui and Radhika Jagtap
                 and Stephan Diestelhorst and Siddharth Nilakantan and
                 Ankit More and Baris Taskin and Mark Hempstead",
  title =        "{SynchroTrace}: Synchronization-Aware
                 Architecture-Agnostic Traces for Lightweight Multicore
                 Simulation of {CMP} and {HPC} Workloads",
  journal =      j-TACO,
  volume =       "15",
  number =       "1",
  pages =        "2:1--2:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3158642",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:19:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Trace-driven simulation of chip multiprocessor (CMP)
                 systems offers many advantages over execution-driven
                 simulation, such as reducing simulation time and
                 complexity, allowing portability, and scalability.
                 However, trace-based simulation approaches have
                 difficulty capturing and accurately replaying
                 multithreaded traces due to the inherent nondeterminism
                 in the execution of multithreaded programs. In this
                 work, we present SynchroTrace, a scalable, flexible,
                 and accurate trace-based multithreaded simulation
                 methodology. By recording synchronization events
                 relevant to modern threading libraries (e.g., Pthreads
                 and OpenMP) and dependencies in the traces, independent
                 of the host architecture, the methodology is able to
                 accurately model the nondeterminism of multithreaded
                 programs for different hardware platforms and threading
                 paradigms. Through capturing high-level instruction
                 categories, the SynchroTrace average CPI trace Replay
                 timing model offers fast and accurate simulation of
                 many-core in-order CMPs. We perform two case studies to
                 validate the SynchroTrace simulation flow against the
                 gem5 full-system simulator: (1) a constraint-based
                 design space exploration with traditional CMP
                 benchmarks and (2) a thread-scalability study with
                 HPC-representative applications. The results from these
                 case studies show that (1) our trace-based approach
                 with trace filtering has a peak speedup of up to 18.7$
                 \times $ over simulation in gem5 full-system with an
                 average of 9.6$ \times $ speedup, (2) SynchroTrace
                 maintains the thread-scaling accuracy of gem5 and can
                 efficiently scale up to 64 threads, and (3)
                 SynchroTrace can trace in one platform and model any
                 platform in early stages of design.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Schmitt:2018:RHG,
  author =       "Christian Schmitt and Moritz Schmid and Sebastian
                 Kuckuk and Harald K{\"o}stler and J{\"u}rgen Teich and
                 Frank Hannig",
  title =        "Reconfigurable Hardware Generation of Multigrid
                 Solvers with Conjugate Gradient Coarse-Grid Solution",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "28",
  number =       "04",
  pages =        "??--??",
  month =        dec,
  year =         "2018",
  DOI =          "https://doi.org/10.1142/S0129626418500160",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Mon Mar 29 12:30:05 MDT 2021",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib",
  URL =          "https://www.worldscientific.com/doi/10.1142/S0129626418500160",
  abstract =     "Not only in the field of high-performance computing
                 (HPC), field programmable gate arrays (FPGAs) are a
                 soaringly popular accelerator technology. However, they
                 use a completely different programming paradigm and
                 tool set compared to central processing units (CPUs) or
                 even graphics processing units (GPUs), adding extra
                 development steps and requiring special knowledge,
                 hindering widespread use in scientific computing. To
                 bridge this programmability gap, domain-specific
                 languages (DSLs) are a popular choice to generate
                 low-level implementations from an abstract algorithm
                 description. In this work, we demonstrate our approach
                 for the generation of numerical solver implementations
                 based on the multigrid method for FPGAs from the same
                 code base that is also used to generate code for CPUs
                 using a hybrid parallelization of MPI and OpenMP. Our
                  approach yields a hardware design that can compute
                 up to 11 V-cycles per second with an input grid size of
                  4096 {\texttimes} 4096 and solution on the coarsest
                  grid using the conjugate gradient
                 (CG) method on a mid-range FPGA, beating vectorized,
                 multi-threaded execution on an Intel Xeon processor.",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Scionti:2018:EMM,
  author =       "Alberto Scionti and Somnath Mazumdar and Stephane
                 Zuckerman",
  title =        "Enabling Massive Multi-Threading with Fast Hashing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2697863",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The next generation of high-performance computers is
                 expected to execute threads in orders of magnitude
                 higher than today's systems. Improper management of
                 such huge amount of threads can create resource
                 contention, leading to overall degraded system
                 performance. By leveraging more practical approaches to
                 distribute threads on the available resources,
                 execution models and manycore chips are expected to
                 overcome limitations of current systems. Here, we
                 present DELTA --- a Data-Enabled muLti-Threaded
                 Architecture, where a producer-consumer scheme is used
                 to execute threads via complete distributed thread
                 management mechanism. We consider a manycore tiled-chip
                 architecture where Network-on-Chip (NoC) routers are
                 extended to support our execution model. The proposed
                 extension is analysed, while simulation results confirm
                 that DELTA can manage a large number of simultaneous
                 threads, relying on a simple hardware structure.",
  acknowledgement = ack-nhfb,
  affiliation =  "Scionti, A (Reprint Author), ISMB, I-10138 Turin,
                 Italy. Scionti, Alberto, ISMB, I-10138 Turin, Italy.
                 Mazumdar, Somnath, Univ Siena, Siena, SI, Italy.
                 Zuckerman, Stephane, Michigan Technol Univ, Houghton,
                 MI 49931 USA.",
  author-email = "scionti@ismb.it mazumdar@dii.unisi.it
                 szuckerm@mtu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Dataflow; hashing; network-on-chip;
                 thread-scheduling",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Scionti:2018:EMM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Tang:2018:CND,
  author =       "Xulong Tang and Mahmut Taylan Kandemir and Hui Zhao
                 and Myoungsoo Jung and Mustafa Karakoy",
  title =        "Computing with Near Data",
  journal =      j-POMACS,
  volume =       "2",
  number =       "3",
  pages =        "42:1--42:30",
  month =        dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3287321",
  ISSN =         "2476-1249",
  ISSN-L =       "2476-1249",
  bibdate =      "Mon Mar 29 10:31:29 MDT 2021",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pomacs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3287321",
  abstract =     "One cost that plays a significant role in shaping the
                 overall performance of both single-threaded and
                 multi-thread applications in modern computing systems
                 is the cost of moving data between compute elements and
                 storage elements. Traditional approaches \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "42",
  fjournal =     "Proceedings of the ACM on Measurement and Analysis of
                 Computing Systems (POMACS)",
  journal-URL =  "https://dl.acm.org/loi/pomacs",
}

@Article{Thebault:2018:AMC,
  author =       "Lo{\"\i}c Th{\'e}bault and Eric Petit",
  title =        "Asynchronous and multithreaded communications on
                 irregular applications using vectorized divide and
                 conquer approach",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "114",
  number =       "??",
  pages =        "16--27",
  month =        apr,
  year =         "2018",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Tue Feb 6 13:52:05 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0743731517303350",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Tian:2018:RSP,
  author =       "Zhenzhou Tian and Ting Liu and Qinghua Zheng and Eryue
                 Zhuang and Ming Fan and Zijiang Yang",
  title =        "Reviving Sequential Program Birthmarking for
                 Multithreaded Software Plagiarism Detection",
  journal =      j-IEEE-TRANS-SOFTW-ENG,
  volume =       "44",
  number =       "5",
  pages =        "491--511",
  month =        may,
  year =         "2018",
  CODEN =        "IESEDJ",
  DOI =          "https://doi.org/10.1109/TSE.2017.2688383",
  ISSN =         "0098-5589 (print), 1939-3520 (electronic)",
  ISSN-L =       "0098-5589",
  bibdate =      "Thu Jun 14 08:43:22 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://ieeexplore.ieee.org/document/7888597/",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Software Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32",
}

@Book{Troutwine:2018:HCR,
  author =       "Brian L. Troutwine",
  title =        "Hands-on Concurrency with {Rust}: Confidently Build
                 Memory-safe, Parallel, and Efficient Software in
                 {Rust}",
  publisher =    pub-PACKT,
  address =      pub-PACKT:adr,
  pages =        "v + 449",
  year =         "2018",
  ISBN =         "1-78839-997-8 (paperback), 1-78847-835-5",
  ISBN-13 =      "978-1-78839-997-5 (paperback), 978-1-78847-835-9",
  LCCN =         "QA76.76.A65",
  bibdate =      "Tue Dec 10 05:53:29 MST 2019",
  bibsource =    "fsz3950.oclc.org:210/WorldCat;
                 https://www.math.utah.edu/pub/tex/bib/master.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://proquest.safaribooksonline.com/?fpi=9781788399975",
  abstract =     "Get to grips with modern software demands by learning
                  the effective uses of Rust's powerful memory safety. Key
                 Features Learn and improve the sequential performance
                 characteristics of your software Understand the use of
                 operating system processes in a high-scale concurrent
                 system Learn of the various coordination methods
                 available in the Standard library. Most programming
                 languages can really complicate things, especially with
                 regard to unsafe memory access. The burden on you, the
                 programmer, lies across two domains: understanding the
                 modern machine and your language's pain-points. This
                  book will teach you how to manage program
                 performance on modern machines and build fast,
                 memory-safe, and concurrent software in Rust. It starts
                 with the fundamentals of Rust and discusses machine
                 architecture concepts. You will be taken through ways
                 to measure and improve the performance of Rust code
                 systematically and how to write collections with
                 confidence. You will learn about the Sync and Send
                 traits applied to threads, and coordinate thread
                 execution with locks, atomic primitives,
                  data-parallelism, and more. The book will show you how
                 to efficiently embed Rust in C++ code and explore the
                 functionalities of various crates for multithreaded
                 applications. It explores implementations in depth. You
                 will know how a mutex works and build several yourself.
                 You will master radically different approaches that
                 exist in the ecosystem for structuring and managing
                 high-scale systems. By the end of the book, you will
                 feel comfortable with designing safe, consistent,
                 parallel, and high-performance applications in
                  Rust. What you will learn Probe your programs for
                 performance and accuracy issues Create your own
                 threading and multi-processing environment in Rust Use
                 coarse locks from Rust's Standard library Solve common
                 synchronization problems or avoid synchronization using
                 atomic programming Build lock-free/wait-free structures
                 in Rust and understand their implementations in the
                 crates ecosystem Leverage Rust's memory model and type
                 system to build safety properties into your parallel
                 programs Understand the new features of the Rust
                 programming language to ease the writing of parallel
                 programs. Who this book is for. This book is aimed at
                 software engineers with a basic understanding of Rust
                 who want to exploit the parallel and concurrent nature
                 of modern computing environments, safely.",
  acknowledgement = ack-nhfb,
  libnote =      "Not in my library.",
  subject =      "Application software; Development; Computer
                 multitasking; Programming languages (Electronic
                 computers); Portable and handheld devices:
                 consumer/user guides; Mobile phones: consumer/user
                 guides; Parallel processing; Programming and scripting
                 languages: general; Computers; Programming; Parallel;
                 Hardware; Handheld Devices; Programming Languages; C;
                 Development; Computer multitasking; Programming
                 languages (Electronic computers)",
}

@Article{Wang:2018:TWB,
  author =       "Jui-Hsien Wang and Ante Qu and Timothy R. Langlois and
                 Doug L. James",
  title =        "Toward wave-based sound synthesis for computer
                 animation",
  journal =      j-TOG,
  volume =       "37",
  number =       "4",
  pages =        "109:1--109:??",
  month =        aug,
  year =         "2018",
  CODEN =        "ATGRDF",
  DOI =          "https://doi.org/10.1145/3197517.3201318",
  ISSN =         "0730-0301 (print), 1557-7368 (electronic)",
  ISSN-L =       "0730-0301",
  bibdate =      "Thu Nov 29 17:19:43 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tog.bib",
  abstract =     "We explore an integrated approach to sound generation
                 that supports a wide variety of physics-based
                 simulation models and computer-animated phenomena.
                 Targeting high-quality offline sound synthesis, we seek
                 to resolve animation-driven sound radiation with
                 near-field scattering and diffraction effects. The core
                 of our approach is a sharp-interface finite-difference
                 time-domain (FDTD) wavesolver, with a series of
                 supporting algorithms to handle rapidly deforming and
                 vibrating embedded interfaces arising in physics-based
                 animation sound. Once the solver rasterizes these
                 interfaces, it must evaluate acceleration boundary
                  conditions (BCs) that involve model- and
                 phenomena-specific computations. We introduce acoustic
                 shaders as a mechanism to abstract away these
                 complexities, and describe a variety of implementations
                 for computer animation: near-rigid objects with ringing
                 and acceleration noise, deformable (finite element)
                 models such as thin shells, bubble-based water, and
                 virtual characters. Since time-domain wave synthesis is
                 expensive, we only simulate pressure waves in a small
                 region about each sound source, then estimate a
                 far-field pressure signal. To further improve
                 scalability beyond multi-threading, we propose a fully
                 time-parallel sound synthesis method that is
                 demonstrated on commodity cloud computing resources. In
                 addition to presenting results for multiple animation
                 phenomena (water, rigid, shells, kinematic deformers,
                 etc.) we also propose 3D automatic dialogue replacement
                 (3DADR) for virtual characters so that pre-recorded
                 dialogue can include character movement, and near-field
                 shadowing and scattering sound effects.",
  acknowledgement = ack-nhfb,
  articleno =    "109",
  fjournal =     "ACM Transactions on Graphics",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J778",
}

@Article{Abdulla:2019:OSM,
  author =       "Parosh Aziz Abdulla and Mohamed Faouzi Atig and Bengt
                 Jonsson and Magnus L{\aa}ng and Tuan Phong Ngo and
                 Konstantinos Sagonas",
  title =        "Optimal stateless model checking for reads-from
                 equivalence under sequential consistency",
  journal =      j-PACMPL,
  volume =       "3",
  number =       "OOPSLA",
  pages =        "150:1--150:29",
  month =        oct,
  year =         "2019",
  DOI =          "https://doi.org/10.1145/3360576",
  bibdate =      "Fri Aug 7 19:22:30 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pacmpl.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3360576",
  abstract =     "We present a new approach for stateless model checking
                 (SMC) of multithreaded programs under Sequential
                 Consistency (SC) semantics. To combat state-space
                 explosion, SMC is often equipped with a partial-order
                 reduction technique, which defines an \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "150",
  fjournal =     "Proceedings of the ACM on Programming Languages",
  journal-URL =  "https://pacmpl.acm.org/",
}

@Article{Amestoy:2019:PSB,
  author =       "Patrick R. Amestoy and Alfredo Buttari and Jean-Yves
                 L'Excellent and Theo Mary",
  title =        "Performance and Scalability of the Block Low-Rank
                 Multifrontal Factorization on Multicore Architectures",
  journal =      j-TOMS,
  volume =       "45",
  number =       "1",
  pages =        "2:1--2:26",
  month =        mar,
  year =         "2019",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/3242094",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Mon May 6 18:23:42 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toms.bib",
  URL =          "https://dl.acm.org/citation.cfm?id=3242094",
  abstract =     "Matrices coming from elliptic Partial Differential
                 Equations have been shown to have a low-rank property
                 that can be efficiently exploited in multifrontal
                 solvers to provide a substantial reduction of their
                 complexity. Among the possible low-rank formats, the
                 Block Low-Rank format (BLR) is easy to use in a general
                 purpose multifrontal solver and its potential compared
                 to standard (full-rank) solvers has been demonstrated.
                 Recently, new variants have been introduced and it was
                 proved that they can further reduce the complexity but
                 their performance has never been analyzed. In this
                 article, we present a multithreaded BLR factorization
                 and analyze its efficiency and scalability in
                 shared-memory multicore environments. We identify the
                 challenges posed by the use of BLR approximations in
                 multifrontal solvers and put forward several
                 algorithmic variants of the BLR factorization that
                 overcome these challenges by improving its efficiency
                 and scalability. We illustrate the performance analysis
                 of the BLR multifrontal factorization with numerical
                 experiments on a large set of problems coming from a
                 variety of real-life applications.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Areias:2019:MDL,
  author =       "Miguel Areias and Ricardo Rocha",
  title =        "Multi-dimensional lock-free arrays for multithreaded
                 mode-directed tabling in {Prolog}",
  journal =      j-CCPE,
  volume =       "31",
  number =       "5",
  pages =        "e4491:1--e4491:??",
  day =          "10",
  month =        mar,
  year =         "2019",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.4491",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Thu Mar 28 08:07:55 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "30 March 2018",
}

@Article{Asyabi:2019:COS,
  author =       "Esmail Asyabi and Erfan Sharafzadeh and SeyedAlireza
                 SanaeeKohroudi and Mohsen Sharifi",
  title =        "{CTS}: an operating system {CPU} scheduler to mitigate
                 tail latency for latency-sensitive multi-threaded
                 applications",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "133",
  number =       "??",
  pages =        "232--243",
  month =        nov,
  year =         "2019",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Sep 13 10:25:21 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0743731518302387",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Bajczi:2019:WMP,
  author =       "Levente Bajczi and Andr{\'a}s V{\"o}r{\"o}s and Vince
                 Moln{\'a}r",
  title =        "Will My Program Break on This Faulty Processor?:
                 {Formal} Analysis of Hardware Fault Activations in
                 Concurrent Embedded Software",
  journal =      j-TECS,
  volume =       "18",
  number =       "5s",
  pages =        "89:1--89:??",
  month =        oct,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3358238",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Oct 17 18:16:44 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tecs.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3358238",
  abstract =     "Formal verification is approaching a point where it
                 will be reliably applicable to embedded software. Even
                 though formal verification can efficiently analyze
                 multi-threaded applications, multi-core processors are
                 often considered too dangerous to use in critical
                 systems, despite the many benefits they can offer. One
                 reason is the advanced memory consistency model of such
                 CPUs. Nowadays, most software verifiers assume strict
                 sequential consistency, which is also the na{\"\i}ve
                 view of programmers. Modern multi-core processors,
                 however, rarely guarantee this assumption by default.
                 In addition, complex processor architectures may easily
                 contain design faults. Thanks to the recent advances in
                 hardware verification, these faults are increasingly
                 visible and can be detected even in existing
                 processors, giving an opportunity to compensate for the
                 problem in software. In this paper, we propose a
                 generic approach to consider inconsistent behavior of
                 the hardware in the analysis of software. Our approach
                 is based on formal methods and can be used to detect
                 the activation of existing hardware faults on the
                 application level and facilitate their mitigation in
                 software. The approach relies heavily on recent results
                 of model checking and hardware verification and offers
                 new, integrative research directions. We propose a
                 partial solution based on existing model checking tools
                 to demonstrate feasibility and evaluate their
                 performance in this context.",
  acknowledgement = ack-nhfb,
  articleno =    "89",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
}

@Article{Balkind:2019:OOS,
  author =       "Jonathan Balkind and Michael McKeown and Yaosheng Fu
                 and Tri Nguyen and Yanqi Zhou and Alexey Lavrov and
                 Mohammad Shahrad and Adi Fuchs and Samuel Payne and
                 Xiaohua Liang and Matthew Matl and David Wentzlaff",
  title =        "{OpenPiton}: an open source hardware platform for your
                 research",
  journal =      j-CACM,
  volume =       "62",
  number =       "12",
  pages =        "79--87",
  month =        dec,
  year =         "2019",
  CODEN =        "CACMA2",
  DOI =          "https://doi.org/10.1145/3366343",
  ISSN =         "0001-0782 (print), 1557-7317 (electronic)",
  ISSN-L =       "0001-0782",
  bibdate =      "Mon Nov 25 09:55:53 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cacm2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/gnu.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://cacm.acm.org/magazines/2019/12/241058/fulltext",
  abstract =     "Industry is building larger, more complex, manycore
                 processors on the back of strong institutional
                 knowledge, but academic projects face difficulties in
                 replicating that scale. To alleviate these difficulties
                 and to develop and share knowledge, the community needs
                 open architecture frameworks for simulation, chip
                 design, and software exploration that support
                 extensibility, scalability, and configurability,
                 alongside an established base of verification tools and
                 supported software. In this article, we present
                 OpenPiton, an open source framework for building
                 scalable architecture research prototypes from one core
                 to 500 million cores. OpenPiton is the world's first
                 open source, general-purpose, multithreaded manycore
                 processor, and framework. OpenPiton is highly
                 configurable, providing a rich design space spanning a
                 variety of hardware parameters that researchers can
                 change. OpenPiton designs can be emulated on FPGAs,
                 where they can run full-stack multiuser Debian Linux.
                 OpenPiton is designed to scale to very large core
                 fabrics, enabling researchers to measure operating
                 system, compiler, and software scalability. The mature
                 code-base reflects the complexity of an
                 industrial-grade design and provides the necessary
                 scripts to build new chips, making OpenPiton a natural
                 choice for computer-aided design (CAD) research.
                 OpenPiton has been validated with a 25-core chip
                 prototype, named Piton, and is bolstered by a
                 validation suite that has thousands of tests, providing
                 an environment to test new hardware designs while
                 verifying the correctness of the whole system.
                 OpenPiton is being actively used in research both
                 internally to Princeton and in the wider community, as
                 well as being adopted in education, industry, and
                 government settings.",
  acknowledgement = ack-nhfb,
  fjournal =     "Communications of the ACM",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J79",
}

@Article{Bonizzoni:2019:MMB,
  author =       "Paola Bonizzoni and Gianluca Della Vedova and Yuri
                 Pirola and Marco Previtali and Raffaella Rizzi",
  title =        "Multithread Multistring {Burrows--Wheeler} Transform
                 and Longest Common Prefix Array",
  journal =      j-J-COMPUT-BIOL,
  volume =       "26",
  number =       "9",
  pages =        "948--961",
  month =        sep,
  year =         "2019",
  CODEN =        "JCOBEM",
  DOI =          "https://doi.org/10.1089/cmb.2018.0230",
  ISSN =         "1066-5277 (print), 1557-8666 (electronic)",
  ISSN-L =       "1066-5277",
  bibdate =      "Tue Oct 8 06:02:58 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jcomputbiol.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://www.liebertpub.com/doi/abs/10.1089/cmb.2018.0230;
                 https://www.liebertpub.com/doi/pdf/10.1089/cmb.2018.0230",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Computational Biology",
  journal-URL =  "https://www.liebertpub.com/loi/cmb/",
  onlinedate =   "29 May 2019",
}

@Article{Bouksiaa:2019:UDE,
  author =       "M. S. M. Bouksiaa and F. Trahay and A. Lescouet and G.
                 Voron and R. Dulong and A. Guermouche and {\'E}. Brunet
                 and G. Thomas",
  title =        "Using Differential Execution Analysis to Identify
                 Thread Interference",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "30",
  number =       "12",
  pages =        "2866--2878",
  month =        dec,
  year =         "2019",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2019.2927481",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Dec 19 09:20:35 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/portal/web/csdl/transactions/tpds",
  keywords =     "bottleneck detection; Energy storage; Generators;
                 multithreading; Performance analysis; Power system
                 stability; Real-time systems; Renewable energy sources;
                 Supply and demand",
}

@Article{Brais:2019:AAM,
  author =       "Hadi Brais and Preeti Ranjan Panda",
  title =        "{Alleria}: an Advanced Memory Access Profiling
                 Framework",
  journal =      j-TECS,
  volume =       "18",
  number =       "5s",
  pages =        "81:1--81:??",
  month =        oct,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3358193",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Oct 17 18:16:44 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tecs.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3358193",
  abstract =     "Application analysis and simulation tools are used
                 extensively by embedded system designers to improve
                 existing optimization techniques or develop new ones.
                 We propose the Alleria framework to make it easier for
                 designers to comprehensively collect critical
                 information such as virtual and physical memory
                 addresses, accessed values, and thread schedules about
                 one or more target applications. Such profilers often
                 incur substantial performance overheads that are orders
                 of magnitude larger than native execution time. We
                 discuss how that overhead can be significantly reduced
                 using a novel profiling mechanism called adaptive
                 profiling. We develop a heuristic-based adaptive
                 profiling mechanism and evaluate its performance using
                 single-threaded and multi-threaded applications. The
                 proposed technique can improve profiling throughput by
                 up to 145\% and by 37\% on an average, enabling Alleria
                 to be used to comprehensively profile applications with
                 a throughput of over 3 million instructions per
                 second.",
  acknowledgement = ack-nhfb,
  articleno =    "81",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
}

@Article{Budhkar:2019:AMD,
  author =       "Prerna Budhkar and Ildar Absalyamov and Vasileios Zois
                 and Skyler Windh and Walid A. Najjar and Vassilis J.
                 Tsotras",
  title =        "Accelerating In-Memory Database Selections Using
                 Latency Masking Hardware Threads",
  journal =      j-TACO,
  volume =       "16",
  number =       "2",
  pages =        "13:1--13:??",
  month =        may,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3310229",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jul 26 14:25:54 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Inexpensive DRAMs have created new opportunities for
                 in-memory data analytics. However, the major bottleneck
                 in such systems is high memory access latency.
                 Traditionally, this problem is solved with large cache
                 hierarchies that only benefit regular applications.
                 Alternatively, many data-intensive applications exhibit
                 irregular behavior. Hardware multithreading can better
                 cope with high latency seen in such applications. This
                 article implements a multithreaded prototype (MTP) on
                 FPGAs for the relational selection operator that
                 exhibits control flow irregularity. On a standard TPC-H
                 query evaluation, MTP achieves a bandwidth utilization
                 of 83\%, while the CPU and the GPU implementations
                 achieve 61\% and 64\%, respectively. Besides being
                 bandwidth efficient, MTP is also $ 14.2 \times $ and $
                 4.2 \times $ more power efficient than CPU and GPU,
                 respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Carroll:2019:ACM,
  author =       "Shane Carroll and Wei-ming Lin",
  title =        "Applied On-Chip Machine Learning for Dynamic Resource
                 Control in Multithreaded Processors",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "29",
  number =       "03",
  pages =        "??--??",
  month =        sep,
  year =         "2019",
  DOI =          "https://doi.org/10.1142/S0129626419500130",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Mon Mar 29 12:30:09 MDT 2021",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib",
  URL =          "https://www.worldscientific.com/doi/10.1142/S0129626419500130",
  abstract =     "In this paper, we propose a machine learning algorithm
                 to control instruction fetch bandwidth in a
                 simultaneous multithreaded CPU. In a simultaneous
                 multithreaded CPU, multiple threads occupy pools of
                 hardware resources in the same clock cycle. Under some
                 conditions, one or more threads may undergo a period of
                 inefficiency, e.g., a cache miss, thereby inefficiently
                 using shared resources and degrading the performance of
                 other threads. If these inefficiencies can be
                 identified at runtime, the offending thread can be
                 temporarily blocked from fetching new instructions into
                 the pipeline and given time to recover from its
                 inefficiency, and prevent the shared system resources
                 from being wasted on a stalled thread. In this paper,
                 we propose a machine learning approach to determine
                 when a thread should be blocked from fetching new
                 instructions. The model is trained offline and the
                 parameters embedded in a CPU, which can be queried with
                 runtime statistics to determine if a thread is running
                 inefficiently and should be temporarily blocked from
                 fetching. We propose two models: a simple linear model
                 and a higher-capacity neural network. We test each
                 model in a simulation environment and show that system
                 performance can increase by up to 19\% on average with a
                 feasible implementation of the proposed algorithm.",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Carroll:2019:RRT,
  author =       "Shane Carroll and Wei-ming Lin",
  title =        "Round Robin Thread Selection Optimization in
                 Multithreaded Processors",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "29",
  number =       "01",
  pages =        "??--??",
  month =        mar,
  year =         "2019",
  DOI =          "https://doi.org/10.1142/S0129626419500038",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Mon Mar 29 12:30:06 MDT 2021",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib",
  URL =          "https://www.worldscientific.com/doi/10.1142/S0129626419500038",
  abstract =     "We propose a variation of round-robin ordering in a
                 multi-threaded pipeline to increase system throughput
                 and resource distribution fairness. We show that using
                 round robin with a typical arbitrary ordering results
                 in inefficient use of shared resources and subsequent
                 thread starvation. To address this but still use a
                 simple round-robin approach, we optimally and
                 dynamically sort the order of the round robin
                 periodically at runtime. We show that with 4-threaded
                 workloads, throughput can be improved by over 9\% and
                 harmonic throughput by over 3\% by sorting thread order
                 at run time. We experiment with multiple stages of the
                 pipeline and show consistent results throughout several
                 experiments using the SPEC CPU 2006 benchmarks.
                 Furthermore, since the technique is still a simple
                 round robin, the increased performance requires little
                 overhead to implement.",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Fraguela:2019:EDP,
  author =       "B. B. Fraguela and D. Andrade",
  title =        "Easy Dataflow Programming in Clusters with {UPC++}
                 {DepSpawn}",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "30",
  number =       "6",
  pages =        "1267--1282",
  month =        jun,
  year =         "2019",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2018.2884716",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Fri Aug 30 06:09:58 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/portal/web/csdl/transactions/tpds",
  keywords =     "application program interfaces; arbitrarily complex
                 task-parallel codes; Arrays; C++ languages; data flow
                 analysis; dataflow; dataflow approach; distributed
                 memory; distributed memory systems; easy dataflow
                 programming; Electronics packaging; host language;
                 implied uncertainties; interoperability; Libraries;
                 message passing; multi-threading; multithreading;
                 parallel processing; parallel programming; parallel
                 programming models; partitioned global address space
                 programming model; PGAS libraries; PGAS UPC++ library;
                 programmability; Programming; Proposals; relevant
                 proposals; software libraries; Task analysis;
                 traditional message-passing paradigm; UPC++ DepSpawn",
}

@Article{Gueunet:2019:TBA,
  author =       "C. Gueunet and P. Fortin and J. Jomier and J. Tierny",
  title =        "Task-Based Augmented Contour Trees with {Fibonacci}
                 Heaps",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "30",
  number =       "8",
  pages =        "1889--1905",
  month =        aug,
  year =         "2019",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2019.2898436",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Fri Aug 30 06:09:58 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fibquart.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/portal/web/csdl/transactions/tpds",
  keywords =     "computation procedure; contour tree based
                 applications; Data analysis; data segmentation
                 applications; data structures; Data structures; data
                 visualisation; Data visualization; fast shared memory;
                 Fibonacci heaps; independent local tasks; intermediate
                 data structures; join split trees; multi-core
                 architecture; multi-threading; multicore computation;
                 OpenMP task runtime; parallel algorithm; parallel
                 algorithms; Parallel algorithms; parallel thanks;
                 Runtime; Scientific visualization; Task analysis; task
                 parallelism; task-based augmented contour trees;
                 topological data analysis; tree algorithm; trees
                 (mathematics)",
}

@Article{Herdt:2019:CSB,
  author =       "Vladimir Herdt and Hoang M. Le and Daniel Gro{\ss}e
                 and Rolf Drechsler",
  title =        "Combining sequentialization-based verification of
                 multi-threaded {C} programs with symbolic {Partial
                 Order Reduction}",
  journal =      j-INT-J-SOFTW-TOOLS-TECHNOL-TRANSFER,
  volume =       "21",
  number =       "5",
  pages =        "545--565",
  month =        oct,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1007/s10009-019-00507-5",
  ISSN =         "1433-2779 (print), 1433-2787 (electronic)",
  ISSN-L =       "1433-2779",
  bibdate =      "Fri Oct 11 15:05:00 MDT 2019",
  bibsource =    "http://link.springer.com/journal/10009/21/5;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sttt.bib",
  URL =          "https://link.springer.com/article/10.1007/s10009-019-00507-5",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal on Software Tools for Technology
                 Transfer (STTT)",
  journal-URL =  "http://link.springer.com/journal/10009",
}

@Article{Iliakis:2019:LIG,
  author =       "Konstantinos Iliakis and Sotirios Xydis and Dimitrios
                 Soudris",
  title =        "{LOOG}: Improving {GPU} Efficiency With Light-Weight
                 Out-Of-Order Execution",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "166--169",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2951161",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "GPUs are one of the most prevalent platforms for
                 accelerating general-purpose workloads due to their
                 intuitive programming model, computing capacity, and
                 cost-effectiveness. GPUs rely on massive
                 multi-threading and fast context switching to overlap
                 computations with memory operations. Among the diverse
                 GPU workloads, there exists a class of kernels that
                 fail to maintain a sufficient number of active warps to
                 hide the latency of memory operations, and thus suffer
                 from frequent stalling. We observe that these kernels
                 will benefit from increased levels of Instruction-Level
                 Parallelism and we propose a novel architecture with
                 lightweight Out-Of-Order execution capability. To
                 minimize hardware overheads, we carefully design our
                 extension to highly re-use the existing
                 micro-architectural structures. We show that the
                 proposed architecture outperforms traditional platforms
                 by 15 to 46 percent on average for low occupancy
                 kernels, with an area overhead of 0.74 to 3.94 percent.
                 Finally, we prove the potential of our proposal as a
                 GPU u-arch alternative, by providing a 5 percent
                 speedup over a wide collection of 63 general-purpose
                 kernels with as little as 0.74 percent area overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Copper; GPGPU; Graphics processing units; Kernel;
                 micro-architecture; Out of order; Out-of-Order
                 execution; Radio access technologies; Radio frequency;
                 Registers",
}

@Article{Jia:2019:UPD,
  author =       "Z. Jia and W. Gao and Y. Shi and S. A. McKee and Z. Ji
                 and J. Zhan and L. Wang and L. Zhang",
  title =        "Understanding Processors Design Decisions for Data
                 Analytics in Homogeneous Data Centers",
  journal =      j-IEEE-TRANS-BIG-DATA,
  volume =       "5",
  number =       "1",
  pages =        "81--94",
  month =        mar,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/TBDATA.2017.2758792",
  ISSN =         "2332-7790",
  ISSN-L =       "2332-7790",
  bibdate =      "Fri Aug 2 11:24:47 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetransbigdata.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Big Data",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=6687317",
  keywords =     "Big Data; big data; brawny multicore processors;
                 Clocks; computational performance; computer centres;
                 Data analysis; data analysis; Data analytics; data
                 analytics workloads; data center systems; energy
                 conservation; energy efficiency; energy-efficiency;
                 homogeneous data centers; many-core processors;
                 multi-threading; Multicore processing; multiprocessing
                 systems; performance; performance-cost efficiency;
                 Pipelines; power aware computing; processor design
                 decisions; processor evaluation; Program processors;
                 simultaneous multithreading",
}

@Book{Klabnik:2019:RPL,
  author =       "Steve Klabnik and Carol Nichols",
  title =        "The {Rust} programming language",
  publisher =    pub-NO-STARCH,
  address =      pub-NO-STARCH:adr,
  edition =      "Second",
  pages =        "xxix + 526",
  year =         "2019",
  ISBN =         "1-09-812253-4, 1-71850-044-0 (paperback)",
  ISBN-13 =      "978-1-09-812253-9, 978-1-71850-044-0 (paperback)",
  LCCN =         "QA76.73.R87",
  bibdate =      "Fri Nov 8 05:59:02 MST 2019",
  bibsource =    "fsz3950.oclc.org:210/WorldCat;
                 https://www.math.utah.edu/pub/tex/bib/master.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://proquest.safaribooksonline.com/?fpi=9781098122539;
                 https://nostarch.com/download/samples/RustProgrammingLanguage2018_Sample_ToC.pdf;
                 https://nostarch.com/Rust2018",
  abstract =     "\booktitle{The Rust Programming Language} is the
                 official book on Rust: an open source systems
                 programming language that helps you write faster, more
                 reliable software. Rust offers control over low-level
                 details (such as memory usage) in combination with
                 high-level ergonomics, eliminating the hassle
                 traditionally associated with low-level languages. The
                 authors of \booktitle{The Rust Programming Language},
                 members of the Rust Core Team, share their knowledge
                 and experience to show you how to take full advantage
                 of Rust's features --- from installation to creating
                 robust and scalable programs. You'll begin with basics
                 like creating functions, choosing data types, and
                 binding variables and then move on to more advanced
                 concepts, such as: * Ownership and borrowing,
                 lifetimes, and traits * Using Rust's memory safety
                 guarantees to build fast, safe programs; * Testing,
                 error handling, and effective refactoring; * Generics,
                 smart pointers, multithreading, trait objects, and
                 advanced pattern matching; * Using Cargo, Rust's
                 built-in package manager, to build, test, and document
                 your code and manage dependencies; * How best to use
                 Rust's advanced compiler with compiler-led programming
                 techniques You'll find plenty of code examples
                 throughout the book, as well as three chapters
                 dedicated to building complete projects to test your
                 learning: a number guessing game, a Rust implementation
                 of a command line tool, and a multithreaded server. New
                 to this edition: An extended section on Rust macros, an
                 expanded chapter on modules, and appendixes on Rust
                 development tools and editions.",
  acknowledgement = ack-nhfb,
  libnote =      "Not in my library.",
  subject =      "Rust (Computer program language); Computer
                 programming; Computer programming.; Rust (Computer
                 program language)",
  tableofcontents = "1: Getting started \\
                 2: Programming a guessing game \\
                 3: Common programming concepts \\
                 4: Understanding ownership \\
                 5: Using structs to structure related data \\
                 6: Enums and pattern matching \\
                 7: Managing growing projects with packages, crates, and
                 modules \\
                 8: Common collections \\
                 9: Error handling \\
                 10: Generic types, traits, and lifetimes \\
                 11: Writing automated tests \\
                 12: An I/O project: building a command line program \\
                 13: Functional language features: iterators and
                 closures \\
                 14: More about Cargo and Crates.io \\
                 15: Smart pointers \\
                 16: Fearless concurrency \\
                 17: Object-oriented programming features of Rust \\
                 18: Patterns and matching \\
                 19: Advanced features \\
                 20: Final project: building a multithreaded web server
                 \\
                 Appendix A: Keywords \\
                 Appendix B: Operators and Symbols \\
                 Appendix C: Derivable Traits \\
                 Appendix D: Useful Development Tools \\
                 Appendix E: Editions \\
                 Index",
}

@TechReport{Laguna:2019:GPD,
  author =       "Ignacio Laguna and Paul C. Wood and Ranvijay Singh and
                 Saurabh Bagchi",
  title =        "{GPUMixer}: Performance-Driven Floating-Point Tuning
                 for {GPU} Scientific Applications",
  type =         "Report",
  institution =  "Lawrence Livermore National Laboratory",
  address =      "Livermore CA 94550, USA",
  year =         "2019",
  bibdate =      "Tue Aug 06 05:54:23 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "http://lagunaresearch.org/docs/isc-2019.pdf;
                 https://www.hpcwire.com/2019/08/05/llnl-purdue-researchers-harness-gpu-mixed-precision-for-accuracy-performance-tradeoff/",
  abstract =     "We present GPUMixer, a tool to perform mixed-precision
                 floating-point tuning on scientific GPU applications.
                 While precision tuning techniques are available, they
                 are designed for serial programs and are
                 accuracy-driven, i.e., they consider configurations
                 that satisfy accuracy constraints, but these
                 configurations may degrade performance. GPUMixer, in
                 contrast, presents a performance-driven approach for
                 tuning. We introduce a novel static analysis that finds
                 Fast Imprecise Sets (FISets), sets of operations on low
                 precision that minimize type conversions, which often
                 yield performance speedups. To estimate the relative
                 error introduced by GPU mixed-precision, we propose
                 shadow computations analysis for GPUs, the first of
                 this class for multi-threaded applications. GPUMixer
                 obtains performance improvements of up to 46.4\% of the
                 ideal speedup in comparison to only 20.7\% found by
                 state-of-the-art methods.",
  acknowledgement = ack-nhfb,
  remark =       "Best paper award at the 33rd ISC High Performance
                 conference held June 16--20, 2019.",
}

@Article{Li:2019:HSG,
  author =       "Yuxiang Li and Yinliang Zhao and Liyu Sun and Mengjuan
                 Shen",
  title =        "A hybrid sample generation approach in speculative
                 multithreading",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "75",
  number =       "8",
  pages =        "4193--4225",
  month =        aug,
  year =         "2019",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-017-2118-3",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Thu Oct 10 15:31:22 MDT 2019",
  bibsource =    "http://link.springer.com/journal/11227/75/8;
                 https://www.math.utah.edu/pub/tex/bib/jsuper.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Li:2019:SRM,
  author =       "Y. Li and K. Nomura and J. A. Insley and V. Morozov
                 and K. Kumaran and N. A. Romero and W. A. Goddard and
                 R. K. Kalia and A. Nakano and P. Vashishta",
  title =        "Scalable Reactive Molecular Dynamics Simulations for
                 Computational Synthesis",
  journal =      j-COMPUT-SCI-ENG,
  volume =       "21",
  number =       "5",
  pages =        "64--75",
  month =        sep,
  year =         "2019",
  CODEN =        "CSENFA",
  DOI =          "https://doi.org/10.1109/MCSE.2018.110150043",
  ISSN =         "1521-9615 (print), 1558-366X (electronic)",
  ISSN-L =       "1521-9615",
  bibdate =      "Mon Aug 19 06:40:58 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/computscieng.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing in Science and Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992",
  keywords =     "Computational modeling; Computer science; computer
                 system implementation mathematics of computing;
                 computing methodologies; data; general; large and
                 medium ( mainframe ) computers; Materials science and
                 technology; Mathematical model; modeling and
                 prediction; Multithreading; numerical analysis;
                 Numerical models; operating systems; parallel
                 algorithms; performance; Predictive models; simulation
                 theory; simulation, modeling, and visualization;
                 software; software engineering; super (very large)
                 computers; system applications and experience; theory
                 of computation; types of simulation",
}

@Article{Li:2019:TBH,
  author =       "Bing Li and Mengjie Mao and Xiaoxiao Liu and Tao Liu
                 and Zihao Liu and Wujie Wen and Yiran Chen and Hai
                 (Helen) Li",
  title =        "Thread Batching for High-performance Energy-efficient
                 {GPU} Memory Design",
  journal =      j-JETC,
  volume =       "15",
  number =       "4",
  pages =        "39:1--39:??",
  month =        dec,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3330152",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Dec 17 07:50:24 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3330152",
  abstract =     "Massive multi-threading in GPU imposes tremendous
                 pressure on memory subsystems. Due to rapid growth in
                 thread-level parallelism of GPU and slowly improved
                 peak memory bandwidth, memory becomes a bottleneck of
                 GPU's performance and energy efficiency. In this
                 article, we propose an integrated architectural scheme
                 to optimize the memory accesses and therefore boost the
                 performance and energy efficiency of GPU. First, we
                 propose a thread batch enabled memory partitioning
                 (TEMP) to improve GPU memory access parallelism. In
                 particular, TEMP groups multiple thread blocks that
                 share the same set of pages into a thread batch and
                 applies a page coloring mechanism to bound each stream
                 multiprocessor (SM) to the dedicated memory banks.
                 After that, TEMP dispatches the thread batch to an SM
                 to ensure high-parallel memory-access streaming from
                 the different thread blocks. Second, a thread
                 batch-aware scheduling (TBAS) scheme is introduced to
                 improve the GPU memory access locality and to reduce
                 the contention on memory controllers and
                 interconnection networks. Experimental results show
                 that the integration of TEMP and TBAS can achieve up to
                 10.3\% performance improvement and 11.3\% DRAM energy
                 reduction across diverse GPU applications. We also
                 evaluate the performance interference of the mixed
                 CPU+GPU workloads when they are run on a heterogeneous
                 system that employs our proposed schemes. Our results
                 show that a simple solution can effectively ensure the
                 efficient execution of both GPU and CPU applications.",
  acknowledgement = ack-nhfb,
  articleno =    "39",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Mironov:2019:MPE,
  author =       "Vladimir Mironov and Yuri Alexeev and Dmitri G.
                 Fedorov",
  title =        "Multithreaded parallelization of the energy and
                 analytic gradient in the fragment molecular orbital
                 method",
  journal =      j-IJQC,
  volume =       "119",
  number =       "12",
  pages =        "e25937:1--e25937:??",
  day =          "15",
  month =        jun,
  year =         "2019",
  CODEN =        "IJQCB2",
  DOI =          "https://doi.org/10.1002/qua.25937",
  ISSN =         "0020-7608 (print), 1097-461X (electronic)",
  ISSN-L =       "0020-7608",
  bibdate =      "Wed Oct 9 06:14:07 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ijqc2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "International Journal of Quantum Chemistry",
  journal-URL =  "http://www.interscience.wiley.com/jpages/0020-7608/",
  onlinedate =   "26 April 2019",
}

@Article{Oz:2019:SMA,
  author =       "Isil Oz and Sanem Arslan",
  title =        "A Survey on Multithreading Alternatives for Soft Error
                 Fault Tolerance",
  journal =      j-COMP-SURV,
  volume =       "52",
  number =       "2",
  pages =        "27:1--27:??",
  month =        may,
  year =         "2019",
  CODEN =        "CMSVAN",
  DOI =          "https://doi.org/10.1145/3302255",
  ISSN =         "0360-0300 (print), 1557-7341 (electronic)",
  ISSN-L =       "0360-0300",
  bibdate =      "Sat Aug 31 09:04:37 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/compsurv.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3302255",
  abstract =     "Smaller transistor sizes and reduction in voltage
                 levels in modern microprocessors induce higher soft
                 error rates. This trend makes reliability a primary
                 design constraint for computer systems. Redundant
                 multithreading (RMT) makes use of parallelism in modern
                 systems by employing thread-level time redundancy for
                 fault detection and recovery. RMT can detect faults by
                 running identical copies of the program as separate
                 threads in parallel execution units with identical
                 inputs and comparing their outputs. In this article, we
                 present a survey of RMT implementations at different
                 architectural levels with several design
                 considerations. We explain the implementations in
                 seminal papers and their extensions and discuss the
                 design choices employed by the techniques. We review
                 both hardware and software approaches by presenting the
                 main characteristics and analyze the studies with
                 different design choices regarding their strengths and
                 weaknesses. We also present a classification to help
                 potential users find a suitable method for their
                 requirement and to guide researchers planning to work
                 on this area by providing insights into the future
                 trend.",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Computing Surveys",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J204",
}

@Article{Roth:2019:AOC,
  author =       "{\'A}goston R{\'o}th",
  title =        "Algorithm 992: An {OpenGL}- and {C++}-based Function
                 Library for Curve and Surface Modeling in a Large Class
                 of Extended {Chebyshev} Spaces",
  journal =      j-TOMS,
  volume =       "45",
  number =       "1",
  pages =        "13:1--13:32",
  month =        mar,
  year =         "2019",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/3284979",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Mon May 6 18:23:42 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/toms.bib",
  URL =          "https://dl.acm.org/citation.cfm?id=3284979",
  abstract =     "We propose a platform-independent multi-threaded
                 function library that provides data structures to
                 generate, differentiate, and render both the ordinary
                 basis and the normalized B-basis of a user-specified
                 extended Chebyshev (EC) space that comprises the
                 constants and can be identified with the solution space
                 of a constant-coefficient homogeneous linear
                 differential equation defined on a sufficiently small
                 interval. Using the obtained normalized B-bases, our
                 library can also generate, (partially) differentiate,
                 modify, and visualize a large family of so-called
                 B-curves and tensor product B-surfaces. Moreover, the
                 library also implements methods that can be used to
                 perform dimension elevation, to subdivide B-curves and
                 B-surfaces by means of de Casteljau-like B-algorithms,
                 and to generate basis transformations for the
                 B-representation of arbitrary integral curves and
                 surfaces that are described in traditional parametric
                 form by means of the ordinary bases of the underlying
                 EC spaces. Independently of the algebraic, exponential,
                 trigonometric, or mixed type of the applied EC space,
                 the proposed library is numerically stable and
                 efficient up to a reasonable dimension number and may
                 be useful for academics and engineers in the fields of
                 Approximation Theory, Computer Aided Geometric Design,
                 Computer Graphics, and Isogeometric and Numerical
                 Analysis.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}

@Article{Sabarimuthu:2019:ADC,
  author =       "J. M. Sabarimuthu and T. G. Venkatesh",
  title =        "Analytical Derivation of Concurrent Reuse Distance
                 Profile for Multi-Threaded Application Running on Chip
                 Multi-Processor",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "30",
  number =       "8",
  pages =        "1704--1721",
  month =        aug,
  year =         "2019",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2019.2896633",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Fri Aug 30 06:09:58 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/portal/web/csdl/transactions/tpds",
  keywords =     "analytical model; analytical model based reuse
                 distance prediction; Analytical models; cache memory
                 design space; cache performance; cache storage;
                 coherent reuse distance profile; compiler optimization;
                 Complexity theory; concurrent reuse distance;
                 concurrent reuse distance profile; Histograms;
                 Instruction sets; locality analysis; Markov chain;
                 Markov processes; Measurement; microprocessor chips;
                 multi-core processors; multi-threaded applications;
                 multi-threading; multicore simulator Sniper;
                 multiprocessing systems; multithreaded application;
                 optimisation; Performance analysis; performance
                 analysis; probability; probability theory; Reuse
                 distance profile; shared memory environment;
                 simulation; standalone reuse distance profile; thread
                 sharing",
}

@Book{Sengupta:2019:JHP,
  author =       "Avik Sengupta",
  title =        "{Julia} high performance: optimizations, distributed
                  computing, multithreading, and {GPU} programming with
                  {Julia 1.0} and beyond",
  publisher =    pub-PACKT,
  address =      pub-PACKT:adr,
  edition =      "Second",
  pages =        "218",
  year =         "2019",
  ISBN =         "1-78829-230-8, 1-78829-811-X",
  ISBN-13 =      "978-1-78829-230-6, 978-1-78829-811-7",
  LCCN =         "????",
  bibdate =      "Thu Apr 8 16:49:31 MDT 2021",
  bibsource =    "fsz3950.oclc.org:210/WorldCat;
                 https://www.math.utah.edu/pub/tex/bib/julia.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://portal.igpublish.com/iglibrary/search/PACKT0005341.html",
  abstract =     "Julia is a high-level, high-performance dynamic
                 programming language for numerical computing. This book
                 will help you understand the performance
                 characteristics of your Julia programs and achieve
                 near-C levels of performance in Julia.",
  acknowledgement = ack-nhfb,
  subject =      "Julia (Computer program language); Application
                  software; Development",
  tableofcontents = "Foreword \\
                 Contributors \\
                 Table of Contents \\
                 Preface \\
                 1: Julia is Fast \\
                 Julia \\
                 fast and dynamic \\
                 Designed for speed \\
                 JIT and LLVM \\
                 Types, type inference, and code specialization \\
                 How fast can Julia be? \\
                 Summary \\
                 2: Analyzing Performance \\
                 Timing Julia functions \\
                 The @time macro \\
                 Other time macros \\
                 The Julia profiler \\
                 Using the profiler \\
                 ProfileView \\
                 Using Juno for profiling \\
                 Using TimerOutputs \\
                 Analyzing memory allocation \\
                 Using the memory allocation tracker \\
                 Statistically accurate benchmarking \\
                 Using \pkg{BenchmarkTools.jl} \\
                 Summary \\
                 3: Types, Type Inference, and Stability \\
                 The Julia type system \\
                 Using types \\
                 Multiple dispatch \\
                 Abstract types \\
                 Julia's type hierarchy \\
                 Composite and immutable types \\
                 Type parameters \\
                 Type inference \\
                 Type-stability \\
                 Definitions \\
                 Fixing type instability \\
                 The performance pitfalls \\
                 Identifying type stability \\
                 Loop variables \\
                 Kernel methods and function barriers \\
                 Types in storage locations \\
                 Arrays \\
                 Composite types \\
                 Parametric composite types \\
                 Summary \\
                 4: Making Fast Function Calls \\
                 Using globals \\
                 The trouble with globals \\
                 Fixing performance issues with globals \\
                 Inlining \\
                 Default inlining \\
                 Controlling inlining \\
                 Disabling inlining \\
                 Constant propagation \\
                 Using macros for performance \\
                 The Julia compilation process \\
                 Using macros \\
                 Evaluating a polynomial \\
                 Horner's method \\
                 The Horner macro \\
                 Generated functions \\
                 Using generated functions \\
                 Using generated functions for performance \\
                 Using keyword arguments \\
                 Summary \\
                 5: Fast Numbers \\
                 Numbers in Julia, their layout, and storage \\
                 Integers \\
                 Integer overflow \\
                 BigInt \\
                 The floating point \\
                 Floating point accuracy \\
                 Unsigned integers \\
                 Trading performance for accuracy \\
                 The @fastmath macro \\
                 The K-B-N summation \\
                 Subnormal numbers \\
                 Subnormal numbers to zero \\
                 Summary \\
                 6: Using Arrays \\
                 Array internals in Julia \\
                 Array representation and storage \\
                 Column-wise storage \\
                 Adjoints \\
                 Array initialization \\
                 Bounds checking \\
                 Removing the cost of bounds checking \\
                 Configuring bound checks at startup \\
                 Allocations and in-place operations \\
                 Preallocating function output \\
                 sizehint! \\
                 Mutating functions \\
                 Broadcasting \\
                 Array views \\
                 SIMD parallelization (AVX2, AVX512) \\
                 SIMD.jl \\
                 Specialized array types \\
                 Static arrays \\
                 Structs of arrays \\
                  Yeppp! \\
                  Writing generic library functions with arrays \\
                 Summary \\
                 7: Accelerating Code with the GPU \\
                 Technical requirements \\
                 Getting started with GPUs \\
                 CUDA and Julia \\
                 CuArrays \\
                 Monte Carlo simulation on the GPU \\
                 Writing your own kernels \\
                 Measuring GPU performance \\
                 Performance tips \\
                 Scalar iteration \\
                 Combining kernels \\
                 Processing more data \\
                 Deep learning on the GPU \\
                 ArrayFire \\
                 Summary \\
                 8: Concurrent Programming with Tasks \\
                 Tasks \\
                 Using tasks \\
                 The task life cycle \\
                 task\_local\_storage \\
                 Communicating between tasks \\
                 Task iteration \\
                 High-performance I/O",
}

@Article{Shea:2019:HSD,
  author =       "Colin Shea and Tinoosh Mohsenin",
  title =        "Heterogeneous Scheduling of Deep Neural Networks for
                 Low-power Real-time Designs",
  journal =      j-JETC,
  volume =       "15",
  number =       "4",
  pages =        "36:1--36:??",
  month =        dec,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3358699",
  ISSN =         "1550-4832",
  bibdate =      "Tue Dec 17 07:50:24 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3358699",
  abstract =     "Deep neural networks have become the readiest answer
                 to a range of application challenges including image
                 recognition, stock analysis, natural language
                 processing, and biomedical applications such as seizure
                 detection. All while outperforming prior leading
                 solutions that relied heavily on hand-engineered
                 techniques. However, deployment of these neural
                 networks often requires high-computational and
                 memory-intensive solutions. These requirements make it
                 challenging to deploy Deep Neural Networks (DNNs) in
                 embedded, real-time low-power applications where
                 classic architectures, GPUs and CPUs, still impose
                 significant power burden. Systems-on-Chip (SoC) with
                 Field-programmable Gate Arrays (FPGAs) can be used to
                 improve performance and allow more fine-grain control
                 of resources than CPUs or GPUs, but it is difficult to
                 find the optimal balance between hardware and software
                 to improve DNN efficiency. In the current research
                 literature there have been few proposed solutions to
                 address optimizing hardware and software deployments of
                 DNNs in embedded low-power systems. To address the
                 computation resource restriction and low-power needs
                 for deploying these networks, we describe and implement
                 a domain-specific metric model for optimizing task
                 deployment on differing platforms, hardware and
                 software. Next, we propose a DNN hardware accelerator
                 called Scalable Low-power Accelerator for real-time
                 deep neural Networks (SCALENet) that includes
                 multithreaded software workers. Finally, we propose a
                 heterogeneous aware scheduler that uses the
                 DNN-specific metric models and the SCALENet accelerator
                 to allocate a task to a resource based on solving a
                 numerical cost for a series of domain objectives. To
                 demonstrate the applicability of our contribution, we
                 deploy nine modern deep network architectures, each
                 containing a different number of parameters within the
                 context of two different neural network applications:
                 image processing and biomedical seizure detection.
                 Utilizing the metric modeling techniques integrated
                 into the heterogeneous aware scheduler and the SCALENet
                 accelerator, we demonstrate the ability to meet
                 computational requirements, adapt to multiple
                 architectures, and lower power by providing an
                 optimized task to resource allocation. Our
                 heterogeneous aware scheduler improves power saving by
                 decreasing power consumption by 10\% of the total
                 system power, does not affect the accuracy of the
                 networks, and still meets the real-time deadlines. We
                 demonstrate the ability to achieve parity with or
                 exceed the energy efficiency of NVIDIA GPUs when
                 evaluated against Jetson TK1 with embedded GPU SoC and
                 with a 4$ \times $ power savings in a power envelope of
                 2.0W. When compared to existing FPGA-based
                 accelerators, SCALENet's accelerator and heterogeneous
                 aware scheduler achieves a 4$ \times $ improvement in
                 energy efficiency.",
  acknowledgement = ack-nhfb,
  articleno =    "36",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Shomron:2019:SSS,
  author =       "G. Shomron and T. Horowitz and U. Weiser",
  title =        "{SMT-SA}: Simultaneous Multithreading in Systolic
                 Arrays",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "99--102",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2924007",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Oct 1 10:18:16 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Systolic arrays (SAs) are highly parallel pipelined
                 structures capable of executing various tasks such as
                 matrix multiplication and convolution. They comprise a
                 grid of usually homogeneous processing units (PUs) that
                 are responsible for the multiply-accumulate (MAC)
                 operations in the case of matrix multiplication. It is
                 not rare for a PU input to be zero-valued, in which
                 case the PU becomes idle and the array becomes
                 underutilized. In this paper we consider a solution to
                 employ the underutilized PUs via simultaneous
                 multithreading (SMT). We explore the design space of a
                 SMT-SA variant and evaluate its performance, area
                 efficiency, and energy consumption. In addition, we
                 suggest a tiling method to reduce area overheads. Our
                 evaluation shows that a 4-thread FP16-based SMT-SA
                 achieves speedups of up to $ 3.6 \times $ as compared
                 to conventional SA, with $ 1.7 \times $ area overhead
                 and negligible energy overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "4-thread FP16-based SMT-SA; area efficiency;
                 Convolution; Correlation; Deep learning; Energy
                 consumption; energy consumption; homogeneous processing
                 units; Instruction sets; matrix multiplication;
                 multi-threading; multiply-accumulate operations;
                 Multithreading; multithreading; parallel pipelined
                 structures; PU input; simultaneous multithreading;
                 SMT-SA variant; Systolic arrays; systolic arrays; Task
                 analysis",
}

@Article{Silva:2019:RFG,
  author =       "Lucas Bragan{\c{c}}a {Da Silva} and Ricardo Ferreira
                 and Michael Canesche and Marcelo M. Menezes and Maria
                 D. Vieira and Jeronimo Penha and Peter Jamieson and
                 Jos{\'e} Augusto M. Nacif",
  title =        "{READY}: a Fine-Grained Multithreading Overlay
                 Framework for Modern {CPU--FPGA} Dataflow
                 Applications",
  journal =      j-TECS,
  volume =       "18",
  number =       "5s",
  pages =        "56:1--56:??",
  month =        oct,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3358187",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Oct 17 18:16:44 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tecs.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3358187",
  abstract =     "In this work, we propose a framework called
                 REconfigurable Accelerator DeploY (READY), the first
                 framework to support polynomial runtime mapping of
                 dataflow applications in high-performance CPU-FPGA
                 platforms. READY introduces an efficient mapping with
                 fine-grained multithreading onto an overlay
                 architecture that hides the latency of a global
                 interconnection network. In addition to our overlay
                 architecture, we show how this system helps solve some
                 of the challenges for FPGA cloud computing adoption in
                 high-performance computing. The framework encapsulates
                 dataflow descriptions by using a target independent,
                 high-level API, and a dataflow model that allows for
                 explicit spatial and temporal parallelism. READY
                 directly maps the dataflow kernels onto the
                 accelerator. Our tool is flexible and extensible and
                 provides the infrastructure to explore different
                 accelerator designs. We validate READY on the Intel
                 Harp platform, and our experimental results show an
                 average 2x execution runtime improvement when compared
                 to an 8-thread multi-core processor.",
  acknowledgement = ack-nhfb,
  articleno =    "56",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
}

@Article{Spoto:2019:SII,
  author =       "Fausto Spoto and Elisa Burato and Michael D. Ernst and
                 Pietro Ferrara and Alberto Lovato and Damiano Macedonio
                 and Ciprian Spiridon",
  title =        "Static Identification of Injection Attacks in {Java}",
  journal =      j-TOPLAS,
  volume =       "41",
  number =       "3",
  pages =        "18:1--18:??",
  month =        jul,
  year =         "2019",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/3332371",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Sat Nov 23 07:18:02 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toplas.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3332371",
  abstract =     "The most dangerous security-related software errors,
                 according to the OWASP Top Ten 2017 list, affect web
                 applications. They are potential injection attacks that
                 exploit user-provided data to execute undesired
                 operations: database access and updates ( SQL injection
                 ); generation of malicious web pages ( cross-site
                 scripting injection ); redirection to user-specified
                 web pages ( redirect injection ); execution of OS
                 commands and arbitrary scripts ( command injection );
                 loading of user-specified, possibly heavy or dangerous
                 classes at run time ( reflection injection ); access to
                 arbitrary files on the file system ( path-traversal );
                 and storing user-provided data into heap regions
                 normally assumed to be shielded from the outside world
                 ( trust boundary violation ). All these attacks exploit
                 the same weakness: unconstrained propagation of data
                 from sources that the user of a web application
                 controls into sinks whose activation might trigger
                 dangerous operations. Although web applications are
                 written in a variety of languages, Java remains a
                 frequent choice, in particular for banking
                 applications, where security has tangible relevance.
                 This article defines a unified, sound protection
                 mechanism against such attacks, based on the
                 identification of all possible explicit flows of
                 tainted data in Java code. Such flows can be
                 arbitrarily complex, passing through dynamically
                 allocated data structures in the heap. The analysis is
                 based on abstract interpretation and is
                 interprocedural, flow-sensitive, and context-sensitive.
                 Its notion of taint applies to reference
                 (non-primitive) types dynamically allocated in the heap
                 and is object-sensitive and field-sensitive. The
                 analysis works by translating the program into Boolean
                 formulas that model all possible data flows. Its
                 implementation, within the Julia analyzer for Java and
                 Android, found injection security vulnerabilities in
                 the Internet banking service and in the customer
                 relationship management of large Italian banks, as well
                 as in a set of open-source third-party applications. It
                 found the command injection, which is at the origin of
                 the 2017 Equifax data breach, one of the worst data
                 breaches ever. For objective, repeatable results, this
                 article also evaluates the implementation on two
                 open-source security benchmarks: the Juliet Suite and
                 the OWASP Benchmark for the automatic comparison of
                 static analyzers for cybersecurity. We compared this
                 technique against more than 10 other static analyzers,
                 both free and commercial. The result of these
                 experiments is that ours is the only analysis for
                 injection that is sound (up to well-stated limitations
                 such as multithreading and native code) and works on
                 industrial code, and it is also much more precise than
                 other tools.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J783",
}

@Article{Storey:2019:SDP,
  author =       "Kyle Storey and Eric Mercer and Pavel Parizek",
  title =        "A Sound Dynamic Partial Order Reduction Engine for
                 {Java Pathfinder}",
  journal =      j-SIGSOFT,
  volume =       "44",
  number =       "4",
  pages =        "15--15",
  month =        dec,
  year =         "2019",
  CODEN =        "SFENDP",
  DOI =          "https://doi.org/10.1145/3364452.3364457",
  ISSN =         "0163-5948 (print), 1943-5843 (electronic)",
  ISSN-L =       "0163-5948",
  bibdate =      "Wed Mar 24 14:07:40 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigsoft2010.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3364452.3364457",
  abstract =     "When model checking a multi-threaded program, it is
                 often necessary to enumerate the possible ordering of
                 concurrent events to evaluate the behavior of the
                 program. However, enumerating every possible order of
                 events quickly leads to state-space \ldots{}",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGSOFT Software Engineering Notes",
  journal-URL =  "https://dl.acm.org/loi/sigsoft",
}

@Article{Su:2019:SSC,
  author =       "Xing Su and Xiangke Liao and Hao Jiang and Canqun Yang
                 and Jingling Xue",
  title =        "{SCP}: Shared Cache Partitioning for High-Performance
                 {GEMM}",
  journal =      j-TACO,
  volume =       "15",
  number =       "4",
  pages =        "43:1--43:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3274654",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Tue Jan 8 17:20:00 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3274654",
  abstract =     "GEneral Matrix Multiply (GEMM) is the most fundamental
                 computational kernel routine in the BLAS library. To
                 achieve high performance, in-memory data must be
                 prefetched into fast on-chip caches before they are
                 used. Two techniques, software prefetching and data
                 packing, have been used to effectively exploit the
                 capability of on-chip least recent used (LRU) caches,
                 which are popular in traditional high-performance
                 processors used in high-end servers and supercomputers.
                 However, the market has recently witnessed a new
                 diversity in processor design, resulting in
                 high-performance processors equipped with shared caches
                 with non-LRU replacement policies. This poses a
                 challenge to the development of high-performance GEMM
                 in a multithreaded context. As several threads try to
                 load data into a shared cache simultaneously,
                 interthread cache conflicts will increase
                 significantly. We present a Shared Cache Partitioning
                 (SCP) method to eliminate interthread cache conflicts
                 in the GEMM routines, by partitioning a shared cache
                 into physically disjoint sets and assigning different
                 sets to different threads. We have implemented SCP in
                 the OpenBLAS library and evaluated it on Phytium 2000+,
                 a 64-core AArch64 processor with private LRU L1 caches
                 and shared pseudo-random L2 caches (per four-core
                 cluster). Our evaluation shows that SCP has effectively
                 reduced the conflict misses in both L1 and L2 caches in
                 a highly optimized GEMM implementation, resulting in an
                 improvement of its performance by 2.75\% to 6.91\%.",
  acknowledgement = ack-nhfb,
  articleno =    "43",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Utterback:2019:POR,
  author =       "Robert Utterback and Kunal Agrawal and I-Ting Angelina
                 Lee and Milind Kulkarni",
  title =        "Processor-Oblivious Record and Replay",
  journal =      j-TOPC,
  volume =       "6",
  number =       "4",
  pages =        "20:1--20:??",
  month =        dec,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3365659",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Dec 27 16:13:12 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3365659",
  abstract =     "Record-and-replay systems are useful tools for
                 debugging non-deterministic parallel programs by first
                 recording an execution and then replaying that
                 execution to produce the same access pattern. Existing
                 record-and-replay systems generally target thread-based
                 execution models, and record the behaviors and
                 interleavings of individual threads. Dynamic
                 multithreaded languages and libraries, such as the Cilk
                 family, OpenMP, TBB, and the like, do not have a notion
                 of threads. Instead, these languages provide a
                 processor-oblivious model of programming, where
                 programs expose task parallelism using high-level
                 constructs such as spawn/sync without regard to the
                 number of threads/cores available to run the program.
                 Thread-based record-and-replay would violate the
                 processor-oblivious nature of these programs, as they
                 incorporate the number of threads into the recorded
                 information, constraining the replayed execution to the
                 same number of threads. In this article, we present a
                 processor-oblivious record-and-replay scheme for
                 dynamic multithreaded languages where record and replay
                 can use different number of processors and both are
                 scheduled using work stealing. We provide theoretical
                 guarantees for our record and replay scheme --- namely that
                 record is optimal for programs with one lock and replay
                 is near-optimal for all cases. In addition, we
                 implemented this scheme in the Cilk Plus runtime system
                 and our evaluation indicates that
                 processor-obliviousness does not cause substantial
                 overheads.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Venkataramani:2019:SMM,
  author =       "Vanchinathan Venkataramani and Mun Choon Chan and
                 Tulika Mitra",
  title =        "Scratchpad-Memory Management for Multi-Threaded
                 Applications on Many-Core Architectures",
  journal =      j-TECS,
  volume =       "18",
  number =       "1",
  pages =        "10:1--10:??",
  month =        feb,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3301308",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Oct 17 18:16:42 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tecs.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3301308",
  abstract =     "Contemporary many-core architectures, such as Adapteva
                 Epiphany and Sunway TaihuLight, employ per-core
                 software-controlled Scratchpad Memory (SPM) rather than
                 caches for better performance-per-watt and
                 predictability. In these architectures, a core is
                 allowed to access its own SPM as well as remote SPMs
                 through the Network-On-Chip (NoC). However, the
                 compiler/programmer is required to explicitly manage
                 the movement of data between SPMs and off-chip memory.
                 Utilizing SPMs for multi-threaded applications is even
                 more challenging, as the shared variables across the
                 threads need to be placed appropriately. Accessing
                 variables from remote SPMs with higher access latency
                 further complicates this problem as certain links in
                 the NoC may be heavily contended by multiple threads.
                 Therefore, certain variables may need to be replicated
                 in multiple SPMs to reduce the contention delay and/or
                 the overall access time. We present Coordinated Data
                 Management (CDM), a compile-time framework that
                 automatically identifies shared/private variables and
                 places them with replication (if necessary) to suitable
                 on-chip or off-chip memory, taking NoC contention into
                 consideration. We develop both an exact Integer Linear
                 Programming (ILP) formulation as well as an iterative,
                 scalable algorithm for placing the data variables in
                 multi-threaded applications on many-core SPMs.
                 Experimental evaluation on the Parallella hardware
                 platform confirms that our allocation strategy reduces
                 the overall execution time and energy consumption by $
                 1.84 \times $ and $ 1.83 \times $, respectively, when
                 compared to the existing approaches.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
}

@Article{Wang:2019:MEM,
  author =       "L. Wang and M. Jahre and A. Adileh and Z. Wang and L.
                 Eeckhout",
  title =        "Modeling Emerging Memory-Divergent {GPU}
                 Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "95--98",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2923618",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Oct 1 10:18:16 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Analytical performance models yield valuable
                 architectural insight without incurring the excessive
                 runtime overheads of simulation. In this work, we study
                 contemporary GPU applications and find that the key
                 performance-related behavior of such applications is
                 distinct from traditional GPU applications. The key
                 issue is that these GPU applications are
                 memory-intensive and have poor spatial locality, which
                 implies that the loads of different threads commonly
                 access different cache blocks. Such memory-divergent
                 applications quickly exhaust the number of misses the
                 L1 cache can process concurrently, and thereby cripple
                 the GPU's ability to use Memory-Level Parallelism (MLP)
                 and Thread-Level Parallelism (TLP) to hide memory
                 latencies. Our Memory Divergence Model (MDM) is able to
                 accurately represent this behavior and thereby reduces
                 average performance prediction error by $ 14 \times $
                 compared to the state-of-the-art GPUMech approach
                 across our memory-divergent applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Analytical models; analytical performance models;
                 Analytical performance prediction; average performance
                 prediction error; cache blocks; cache storage;
                 Computational modeling; contemporary GPU applications;
                 GPU; graphics processing units; Graphics processing
                 units; Instruction sets; key performance-related
                 behavior; L1 cache; Mathematical model; memory
                 architecture; memory divergence model; memory
                 latencies; memory-divergent applications;
                 memory-divergent GPU applications; memory-intensive;
                 memory-level parallelism; multi-threading;
                 multiprocessing systems; Predictive models; Random
                 access memory; thread-level parallelism; traditional
                 GPU applications; valuable architectural insight",
}

@Article{Wang:2019:SSS,
  author =       "Wenlu Wang and Ji Zhang and Min-Te Sun and Wei-Shinn
                 Ku",
  title =        "A scalable spatial skyline evaluation system utilizing
                 parallel independent region groups",
  journal =      j-VLDB-J,
  volume =       "28",
  number =       "1",
  pages =        "73--98",
  month =        feb,
  year =         "2019",
  CODEN =        "VLDBFR",
  DOI =          "https://doi.org/10.1007/s00778-018-0519-4",
  ISSN =         "1066-8888 (print), 0949-877X (electronic)",
  ISSN-L =       "1066-8888",
  bibdate =      "Tue Feb 5 08:07:20 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/vldbj.bib",
  abstract =     "This research presents two parallel solutions to
                 efficiently address spatial skyline queries. First, we
                 propose a novel concept called independent regions for
                 parallelizing the process of spatial skyline
                 evaluation. Spatial skyline candidates in an
                 independent region do not depend on any data point in
                 other independent regions. Then, we propose a GPU-based
                 solution. We use multi-level independent region
                 group-based parallel filter to support efficient
                 multi-threading spatial skyline non-candidate
                 elimination. Beyond that, we propose comparable region
                 to accelerate non-candidate elimination in each
                 independent region. Secondly, we propose a
                 MapReduce-based solution. We generate the convex hull
                 of query points in the first MapReduce phase. In the
                 second phase, we calculate independent regions based on
                 the input data points and the convex hull of the query
                 points. With the independent regions, spatial skylines
                 are evaluated in parallel in the third phase, in which
                 data points are partitioned by their associated
                 independent regions in map functions, and spatial
                 skyline candidates are calculated by reduce functions.
                 The results of the spatial skyline queries are the
                 union of outputs from the reduce functions. Our
                 experimental results show that GPU multi-threading
                 scheme is very efficient on small-scale input datasets.
                 On the contrary, MapReduce scheme performs very well on
                 large-scale input datasets.",
  acknowledgement = ack-nhfb,
  fjournal =     "VLDB Journal: Very Large Data Bases",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J869",
}

@Article{Watt:2019:WW,
  author =       "Conrad Watt and Andreas Rossberg and Jean
                 Pichon-Pharabod",
  title =        "Weakening {WebAssembly}",
  journal =      j-PACMPL,
  volume =       "3",
  number =       "OOPSLA",
  pages =        "133:1--133:28",
  month =        oct,
  year =         "2019",
  DOI =          "https://doi.org/10.1145/3360559",
  bibdate =      "Fri Aug 7 19:22:30 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pacmpl.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3360559",
  abstract =     "WebAssembly (Wasm) is a safe, portable virtual
                 instruction set that can be hosted in a wide range of
                 environments, such as a Web browser. It is a low-level
                 language whose instructions are intended to compile
                 directly to bare hardware. While the initial version of
                 Wasm focussed on single-threaded computation, a recent
                 proposal extends it with low-level support for multiple
                 threads and atomic instructions for synchronised access
                 to shared memory. To support the correct compilation of
                 concurrent programs, it is necessary to give a suitable
                 specification of its memory model.\par

                 Wasm's language definition is based on a fully
                 formalised specification that carefully avoids
                 undefined behaviour. We present a substantial extension
                 to this semantics, incorporating a relaxed memory
                 model, along with a few proposed extensions. Wasm's
                 memory model is unique in that its linear address space
                 can be dynamically grown during execution, while all
                 accesses are bounds-checked. This leads to the novel
                 problem of specifying how observations about the size
                 of the memory can propagate between threads. We argue
                 that, considering desirable compilation schemes, we
                 cannot give a sequentially consistent semantics to
                 memory growth.\par

                 We show that our model provides sequential consistency
                 for data-race-free executions (SC-DRF). However,
                 because Wasm is to run on the Web, we must also
                 consider interoperability of its model with that of
                 JavaScript. We show, by counter-example, that
                 JavaScript's memory model is not SC-DRF, in contrast to
                 what is claimed in its specification. We propose two
                 axiomatic conditions that should be added to the
                 JavaScript model to correct this difference.\par

                 We also describe a prototype SMT-based litmus tool
                 which acts as an oracle for our axiomatic model,
                 visualising its behaviours, including memory
                 resizing.",
  acknowledgement = ack-nhfb,
  articleno =    "133",
  fjournal =     "Proceedings of the ACM on Programming Languages",
  journal-URL =  "https://pacmpl.acm.org/",
}

@Article{Wu:2019:HUI,
  author =       "Jimmy Ming-Tai Wu and Jerry Chun-Wei Lin and Ashish
                 Tamrakar",
  title =        "High-Utility Itemset Mining with Effective Pruning
                 Strategies",
  journal =      j-TKDD,
  volume =       "13",
  number =       "6",
  pages =        "58:1--58:??",
  month =        dec,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3363571",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Wed Dec 18 14:31:03 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3363571",
  abstract =     "High-utility itemset mining is a popular data mining
                 problem that considers utility factors, such as
                 quantity and unit profit of items besides frequency
                 measure from the transactional database. It helps to
                 find the most valuable and profitable products/items
                 that are difficult to track by using only the frequent
                 itemsets. An item might have a high-profit value which
                 is rare in the transactional database and has a
                 tremendous importance. While there are many existing
                 algorithms to find high-utility itemsets (HUIs) that
                 generate comparatively large candidate sets, our main
                 focus is on significantly reducing the computation time
                 with the introduction of new pruning strategies. The
                 designed pruning strategies help to reduce the
                 visitation of unnecessary nodes in the search space,
                 which reduces the time required by the algorithm. In
                 this article, two new stricter upper bounds are
                 designed to reduce the computation time by refraining
                 from visiting unnecessary nodes of an itemset. Thus,
                 the search space of the potential HUIs can be greatly
                 reduced, and the mining procedure of the execution time
                 can be improved. The proposed strategies can also
                 significantly minimize the transaction database
                 generated on each node. Experimental results showed
                 that the designed algorithm with two pruning strategies
                 outperform the state-of-the-art algorithms for mining
                 the required HUIs in terms of runtime and number of
                 revised candidates. The memory usage of the designed
                 algorithm also outperforms the state-of-the-art
                 approach. Moreover, a multi-thread concept is also
                 discussed to further handle the problem of big
                 datasets.",
  acknowledgement = ack-nhfb,
  articleno =    "58",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Yeh:2019:PGR,
  author =       "Tsung Tai Yeh and Amit Sabne and Putt Sakdhnagool and
                 Rudolf Eigenmann and Timothy G. Rogers",
  title =        "{Pagoda}: a {GPU} Runtime System for Narrow Tasks",
  journal =      j-TOPC,
  volume =       "6",
  number =       "4",
  pages =        "21:1--21:??",
  month =        nov,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3365657",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Massively multithreaded GPUs achieve high throughput
                 by running thousands of threads in parallel. To fully
                 utilize their hardware, contemporary workloads
                 spawn work to the GPU in bulk by launching large tasks,
                 where each task is a kernel that contains thousands of
                 threads that occupy the entire GPU. GPUs face severe
                 underutilization and their performance benefits vanish
                 if the tasks are narrow, i.e., they contain less than
                 512 threads. Latency-sensitive applications in network,
                 signal, and image processing that generate a large
                 number of tasks with relatively small inputs are
                 examples of such limited parallelism. This article
                 presents Pagoda, a runtime system that virtualizes GPU
                 resources, using an OS-like daemon kernel called
                 MasterKernel. Tasks are spawned from the CPU onto
                 Pagoda as they become available, and are scheduled by
                 the MasterKernel at the warp granularity. This level of
                 control enables the GPU to keep scheduling and
                 executing tasks as long as free warps are found,
                 dramatically reducing underutilization. Experimental
                 results on real hardware demonstrate that Pagoda
                 achieves a geometric mean speedup of 5.52X over
                 PThreads running on a 20-core CPU, 1.76X over
                 CUDA-HyperQ, and 1.44X over GeMTC, the state-of-the-art
                 runtime GPU task scheduling system.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Zhong:2019:SHS,
  author =       "Guanwen Zhong and Akshat Dubey and Cheng Tan and
                 Tulika Mitra",
  title =        "{Synergy}: an {HW\slash SW} Framework for High
                 Throughput {CNNs} on Embedded Heterogeneous {SoC}",
  journal =      j-TECS,
  volume =       "18",
  number =       "2",
  pages =        "13:1--13:??",
  month =        apr,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3301278",
  ISSN =         "1539-9087 (print), 1558-3465 (electronic)",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Oct 17 18:16:43 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tecs.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3301278",
  abstract =     "Convolutional Neural Networks (CNN) have been widely
                 deployed in diverse application domains. There has been
                 significant progress in accelerating both their
                 training and inference using high-performance GPUs,
                 FPGAs, and custom ASICs for datacenter-scale
                 environments. The recent proliferation of mobile and
                 Internet of Things (IoT) devices have necessitated
                 real-time, energy-efficient deep neural network
                 inference on embedded-class, resource-constrained
                 platforms. In this context, we present Synergy, an
                 automated, hardware-software co-designed, pipelined,
                 high-throughput CNN inference framework on embedded
                 heterogeneous system-on-chip (SoC) architectures
                 (Xilinx Zynq). Synergy leverages, through
                 multi-threading, all the available on-chip resources,
                 which includes the dual-core ARM processor along with
                 the FPGA and the NEON Single-Instruction Multiple-Data
                 (SIMD) engines as accelerators. Moreover, Synergy
                 provides a unified abstraction of the heterogeneous
                 accelerators (FPGA and NEON) and can adapt to different
                 network configurations at runtime without changing the
                 underlying hardware accelerator architecture by
                 balancing workload across accelerators through
                 work-stealing. Synergy achieves 7.3X speedup, averaged
                 across seven CNN models, over a well-optimized
                 software-only solution. Synergy demonstrates
                 substantially better throughput and energy-efficiency
                 compared to the contemporary CNN implementations on the
                 same SoC architecture.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J840",
}

@Article{Zois:2019:EMM,
  author =       "Vasileios Zois and Vassilis J. Tsotras and Walid A.
                 Najjar",
  title =        "Efficient main-memory top-$k$ selection for multicore
                 architectures",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "2",
  pages =        "114--127",
  month =        oct,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3364324.3364327",
  ISSN =         "2150-8097",
  bibdate =      "Wed Dec 11 07:51:12 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Efficient Top-$k$ query evaluation relies on practices
                 that utilize auxiliary data structures to enable early
                 termination. Such techniques were designed to trade-off
                 complex work in the buffer pool against costly access
                 to disk-resident data. Parallel in-memory Top-$k$
                 selection with support for early termination presents a
                 novel challenge because computation shifts higher up in
                 the memory hierarchy. In this environment, data scan
                 methods using SIMD instructions and multithreading
                 perform well despite requiring evaluation of the
                 complete dataset. Early termination schemes that favor
                 simplicity require random access to resolve score
                 ambiguity while those optimized for sequential access
                 incur too many object evaluations. In this work, we
                 introduce the concept of rank uncertainty, a measure of
                 work efficiency that enables classifying existing
                 solutions according to their potential for efficient
                 parallel in-memory Top-$k$ selection. We identify data
                 reordering and layering strategies as those having the
                 highest potential and provide practical guidelines on
                 how to adapt them for parallel in-memory execution
                 (creating the VTA and SLA approaches). In addition, we
                 show that the number of object evaluations can be
                 further decreased by combining data reordering with
                 angle space partitioning (introducing PTA). Our
                 extensive experimental evaluation on varying query
                 parameters using both synthetic and real data, showcase
                 that PTA exhibits between 2 and 4 orders of magnitude
                 better query latency, and throughput when compared to
                 prior work and our optimized algorithmic variants (i.e.
                 VTA, SLA).",
  acknowledgement = ack-nhfb,
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Algosaibi:2020:PBT,
  author =       "Abdulelah Algosaibi and Khaled Ragab and Saleh
                 Albahli",
  title =        "Parallel-Based Techniques for Managing and Analyzing
                 the Performance on Semantic Graph",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "30",
  number =       "02",
  pages =        "??--??",
  month =        jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1142/S0129626420500073",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Mon Mar 29 12:30:13 MDT 2021",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib",
  URL =          "https://www.worldscientific.com/doi/10.1142/S0129626420500073",
  abstract =     "In recent years, data are generated rapidly that
                 advanced the evolving of the linked data. Modern data
                 are globally distributed over the semantically linked
                 graphs. The nature of the distributed data over the
                 semantic graph raised new demands on further
                 investigation on improving performance on the semantic
                 graphs. In this work, we analyzed the time latency as
                 an important factor to be further investigated and
                 improved. We evaluated the parallel computing on these
                 distributed data in order to better utilize the
                 parallelism approaches. A federation framework based on
                 a multi-threaded environment supporting federated
                 SPARQL query was introduced. In our experiments, we
                 show the achievability and effectiveness of our model
                 on a set of real-world queries through real-world Linked
                 Open Data cloud. Significant performance improvement
                 has been noticed. Further, we highlight short-comings that
                 could open an avenue in the research of federated
                 queries. Keywords: Semantic web; distributed query
                 processing; query federation; linked data; join
                 methods.",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Bagherzadeh:2020:ACB,
  author =       "Mehdi Bagherzadeh and Nicholas Fireman and Anas
                 Shawesh and Raffi Khatchadourian",
  title =        "Actor concurrency bugs: a comprehensive study on
                 symptoms, root causes, {API} usages, and differences",
  journal =      j-PACMPL,
  volume =       "4",
  number =       "OOPSLA",
  pages =        "214:1--214:32",
  month =        nov,
  year =         "2020",
  DOI =          "https://doi.org/10.1145/3428282",
  bibdate =      "Tue Mar 30 08:10:50 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pacmpl.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3428282",
  abstract =     "Actor concurrency is becoming increasingly important
                 in the development of real-world software systems.
                 Although actor concurrency may be less susceptible to
                 some multithreaded concurrency bugs, such as low-level
                 data races and deadlocks, it comes with \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "214",
  fjournal =     "Proceedings of the ACM on Programming Languages",
  journal-URL =  "https://pacmpl.acm.org/",
}

@InProceedings{Barros:2020:ALS,
  author =       "D. A. Barros and C. Bentes",
  booktitle =    "{2020 IEEE 32nd International Symposium on Computer
                 Architecture and High Performance Computing
                 (SBAC-PAD)}",
  title =        "Analyzing the Loop Scheduling Mechanisms on {Julia}
                 Multithreading",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "257--264",
  year =         "2020",
  DOI =          "https://doi.org/10.1109/SBAC-PAD49847.2020.00043",
  bibdate =      "Thu Apr 8 07:17:08 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/julia.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Julia programming language",
}

@Article{Castello:2020:ATL,
  author =       "A. Castell{\'o} and R. M. Gual and S. Seo and P.
                 Balaji and E. S. Quintana-Ort{\'\i} and A. J.
                 Pe{\~n}a",
  title =        "Analysis of Threading Libraries for High Performance
                 Computing",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "69",
  number =       "9",
  pages =        "1279--1292",
  month =        sep,
  year =         "2020",
  CODEN =        "ITCOB4",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Wed Aug 12 14:58:16 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Criswell:2020:SPC,
  author =       "K. Criswell and T. Adegbija",
  title =        "A Survey of Phase Classification Techniques for
                 Characterizing Variable Application Behavior",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "31",
  number =       "1",
  pages =        "224--236",
  month =        jan,
  year =         "2020",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2019.2929781",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Dec 19 09:20:35 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/portal/web/csdl/transactions/tpds",
  keywords =     "adaptable computing; Big Data; big data; Clocks;
                 Computational modeling; dynamic optimization; edge
                 computing; emerging applications; Hardware; Multicore
                 processing; multithreaded applications; Optimization;
                 Phase classification; Runtime; variable program
                 behavior; workload characterization",
}

@Article{Cugu:2020:PMS,
  author =       "Ilke {\c{C}}ugu and Murat Manguoglu",
  title =        "A parallel multithreaded sparse triangular linear
                 system solver",
  journal =      j-COMPUT-MATH-APPL,
  volume =       "80",
  number =       "2",
  pages =        "371--385",
  month =        jul,
  year =         "2020",
  CODEN =        "CMAPDK",
  DOI =          "https://doi.org/10.1016/j.camwa.2019.09.012",
  ISSN =         "0898-1221 (print), 1873-7668 (electronic)",
  ISSN-L =       "0898-1221",
  bibdate =      "Wed Jul 8 08:11:16 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/computmathappl2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0898122119304602",
  acknowledgement = ack-nhfb,
  fjournal =     "Computers and Mathematics with Applications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/08981221",
}

@Article{Dosanjh:2020:TQM,
  author =       "Matthew G. F. Dosanjh and Ryan E. Grant and Whit
                 Schonbein and Patrick G. Bridges",
  title =        "Tail queues: a multi-threaded matching architecture",
  journal =      j-CCPE,
  volume =       "32",
  number =       "3",
  pages =        "e5158:1--e5158:??",
  day =          "10",
  month =        feb,
  year =         "2020",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.5158",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Wed Mar 31 07:52:13 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Concurr. Comput.",
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "06 February 2019",
}

@Article{Feliu:2020:TII,
  author =       "J. Feliu and J. Sahuquillo and S. Petit and L.
                 Eeckhout",
  title =        "Thread Isolation to Improve Symbiotic Scheduling on
                 {SMT} Multicore Processors",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "31",
  number =       "2",
  pages =        "359--373",
  month =        feb,
  year =         "2020",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2019.2934955",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Wed Jan 22 06:09:50 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "http://www.computer.org/portal/web/csdl/transactions/tpds",
  keywords =     "Degradation; Message systems; Program processors;
                 Resource management; Schedules; Simultaneous
                 multithreading (SMT); Symbiosis; symbiotic job
                 scheduling; thread isolation; Throughput",
}

@Article{Fezzardi:2020:ABD,
  author =       "Pietro Fezzardi and Fabrizio Ferrandi",
  title =        "Automated Bug Detection for High-level Synthesis of
                 Multi-threaded Irregular Applications",
  journal =      j-TOPC,
  volume =       "7",
  number =       "4",
  pages =        "27:1--27:26",
  month =        dec,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3418086",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sun Mar 28 08:05:40 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3418086",
  abstract =     "Field Programmable Gate Arrays (FPGAs) are becoming an
                 appealing technology in datacenters and High
                 Performance Computing. High-Level Synthesis (HLS) of
                 multi-threaded parallel programs is increasingly used
                 to extract parallelism. Despite great leaps \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Ghorbani:2020:RDT,
  author =       "Mehrdad Ghorbani and Seyed Morteza Babamir",
  title =        "Runtime deadlock tracking and prevention of concurrent
                 multithreaded programs: a learning-based approach",
  journal =      j-CCPE,
  volume =       "32",
  number =       "10",
  pages =        "e5324:1--e5324:??",
  day =          "25",
  month =        may,
  year =         "2020",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.5324",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Wed Mar 31 07:52:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Concurr. Comput.",
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "09 May 2019",
}

@Article{Hickey:2020:HC,
  author =       "Rich Hickey",
  title =        "A history of {Clojure}",
  journal =      j-PACMPL,
  volume =       "4",
  number =       "HOPL",
  pages =        "71:1--71:46",
  month =        jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1145/3386321",
  bibdate =      "Fri Aug 7 17:39:13 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pacmpl.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3386321",
  abstract =     "Clojure was designed to be a general-purpose,
                 practical functional language, suitable for use by
                 professionals wherever its host language, e.g., Java,
                 would be. Initially designed in 2005 and released in
                 2007, Clojure is a dialect of Lisp, but is not a direct
                 descendant of any prior Lisp. It complements
                 programming with pure functions of immutable data with
                 concurrency-safe state management constructs that
                 support writing correct multithreaded programs without
                 the complexity of mutex locks.\par

                 Clojure is intentionally hosted, in that it compiles to
                 and runs on the runtime of another language, such as
                 the JVM. This is more than an implementation strategy;
                 numerous features ensure that programs written in
                 Clojure can leverage and interoperate with the
                 libraries of the host language directly and
                 efficiently.\par

                 In spite of combining two (at the time) rather
                 unpopular ideas, functional programming and Lisp,
                 Clojure has since seen adoption in industries as
                 diverse as finance, climate science, retail, databases,
                 analytics, publishing, healthcare, advertising and
                 genomics, and by consultancies and startups worldwide,
                 much to the career-altering surprise of its
                 author.\par

                 Most of the ideas in Clojure were not novel, but their
                 combination puts Clojure in a unique spot in language
                 design (functional, hosted, Lisp). This paper recounts
                 the motivation behind the initial development of
                 Clojure and the rationale for various design decisions
                 and language constructs. It then covers its evolution
                 subsequent to release and adoption.",
  acknowledgement = ack-nhfb,
  articleno =    "71",
  fjournal =     "Proceedings of the ACM on Programming Languages",
  journal-URL =  "https://pacmpl.acm.org/",
}

@Article{Im:2020:DWF,
  author =       "Sungjin Im and Benjamin Moseley and Kamesh Munagala
                 and Kirk Pruhs",
  title =        "Dynamic Weighted Fairness with Minimal Disruptions",
  journal =      j-POMACS,
  volume =       "4",
  number =       "1",
  pages =        "19:1--19:18",
  month =        may,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3379485",
  ISSN =         "2476-1249",
  ISSN-L =       "2476-1249",
  bibdate =      "Mon Mar 29 10:31:33 MDT 2021",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pomacs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3379485",
  abstract =     "In this paper, we consider the following dynamic fair
                 allocation problem: Given a sequence of job arrivals
                 and departures, the goal is to maintain an
                 approximately fair allocation of the resource against a
                 target fair allocation policy, while minimizing the
                 total number of disruptions, which is the number of
                 times the allocation of any job is changed. We consider
                 a rich class of fair allocation policies that
                 significantly generalize those considered in previous
                 work. We first consider the models where jobs only
                 arrive, or jobs only depart. We present tight upper and
                 lower bounds for the number of disruptions required to
                 maintain a constant approximate fair allocation every
                 time step. In particular, for the canonical case where
                 jobs have weights and the resource allocation is
                 proportional to the job's weight, we show that
                 maintaining a constant approximate fair allocation
                 requires $ \Theta (\log^* n) $ disruptions per job,
                 almost matching the bounds in prior work for the unit
                 weight case. For the more general setting where the
                 allocation policy only decreases the allocation to a
                 job when new jobs arrive, we show that maintaining a
                 constant approximate fair allocation requires $ \Theta
                 (\log n) $ disruptions per job. We then consider the
                 model where jobs can both arrive and depart. We first
                 show strong lower bounds on the number of disruptions
                 required to maintain constant approximate fairness for
                 arbitrary instances. In contrast we then show that
                 there is an algorithm that can maintain constant
                 approximate fairness with $ O(1) $ expected disruptions
                 per job if the weights of the jobs are independent of
                 the jobs arrival and departure order. We finally show
                 how our results can be extended to the setting with
                 multiple resources.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "Proceedings of the ACM on Measurement and Analysis of
                 Computing Systems (POMACS)",
  journal-URL =  "https://dl.acm.org/loi/pomacs",
}

@Article{Langr:2020:RII,
  author =       "Daniel Langr and Marin Ko{\v{c}}i{\v{c}}ka",
  title =        "Reducing the Impact of Intensive Dynamic Memory
                 Allocations in Parallel Multi-Threaded Programs",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "31",
  number =       "5",
  pages =        "1152--1164",
  month =        may,
  year =         "2020",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2019.2960514",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Feb 20 10:08:58 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71",
  keywords =     "Dynamic memory allocation; memory pooling;
                 multi-threading; parallel program; scalable heap
                 implementation; shared memory; small buffer
                 optimization",
}

@Article{Li:2020:MMT,
  author =       "Tao Li and Xiankai Zhang and Feng Luo and Fang-Xiang
                 Wu and Jianxin Wang",
  title =        "{MultiMotifMaker}: a Multi-Thread Tool for Identifying
                 {DNA} Methylation Motifs from {Pacbio} Reads",
  journal =      j-TCBB,
  volume =       "17",
  number =       "1",
  pages =        "220--225",
  month =        jan,
  year =         "2020",
  CODEN =        "ITCBCY",
  DOI =          "https://doi.org/10.1109/TCBB.2018.2861399",
  ISSN =         "1545-5963 (print), 1557-9964 (electronic)",
  ISSN-L =       "1545-5963",
  bibdate =      "Wed Jun 10 07:29:48 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tcbb.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1109/TCBB.2018.2861399",
  abstract =     "The methylation of DNA is an important mechanism to
                 control biological processes. Recently, the Pacbio SMRT
                 technology provides a new way to identify base
                 methylation in the genome. MotifMaker is a tool
                 developed by Pacbio for discovering DNA methylation
                 \ldots{}",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE/ACM Transactions on Computational Biology and
                 Bioinformatics",
  journal-URL =  "https://dl.acm.org/loi/tcbb",
}

@Article{Puche:2020:ECF,
  author =       "Jos{\'e} Puche and Salvador Petit and Mar{\'\i}a E.
                 G{\'o}mez and Julio Sahuquillo",
  title =        "An efficient cache flat storage organization for
                 multithreaded workloads for low power processors",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "110",
  number =       "??",
  pages =        "1037--1054",
  month =        sep,
  year =         "2020",
  CODEN =        "FGSEVI",
  DOI =          "https://doi.org/10.1016/j.future.2019.11.024",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Fri Jun 19 07:44:19 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167739X1930384X",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
}

@Article{Tino:2020:SXE,
  author =       "Anita Tino and Caroline Collange and Andr{\'e}
                 Seznec",
  title =        "{SIMT-X}: Extending Single-Instruction Multi-Threading
                 to Out-of-Order Cores",
  journal =      j-TACO,
  volume =       "17",
  number =       "2",
  pages =        "15:1--15:23",
  month =        jun,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3392032",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jun 27 12:06:50 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3392032",
  abstract =     "This work introduces Single Instruction Multi-Thread
                 Express (SIMT-X), a general-purpose Central Processing
                 Unit (CPU) microarchitecture that enables Graphics
                 Processing Units (GPUs)-style SIMT execution across
                 multiple threads of the same program for high
                 throughput, while retaining the latency benefits of
                 out-of-order execution, and the programming convenience
                 of homogeneous multi-thread processors. SIMT-X
                 leverages the existing Single Instruction Multiple Data
                 (SIMD) back-end to provide CPU/GPU-like processing on a
                 single core with minimal overhead. We demonstrate that
                 although SIMT-X invokes a restricted form of
                 Out-of-Order (OoO), the microarchitecture successfully
                 captures a majority of the benefits of aggressive OoO
                 execution using at most two concurrent register
                 mappings per architectural register, while addressing
                 issues of partial dependencies and supporting a
                 general-purpose Instruction Set Architecture (ISA).",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Wenjie:2020:APW,
  author =       "Tang Wenjie and Yao Yiping and Li Tianlin and Song
                 Xiao and Zhu Feng",
  title =        "An Adaptive Persistence and Work-stealing Combined
                 Algorithm for Load Balancing on Parallel Discrete Event
                 Simulation",
  journal =      j-TOMACS,
  volume =       "30",
  number =       "2",
  pages =        "12:1--12:26",
  month =        apr,
  year =         "2020",
  CODEN =        "ATMCEZ",
  DOI =          "https://doi.org/10.1145/3364218",
  ISSN =         "1049-3301 (print), 1558-1195 (electronic)",
  ISSN-L =       "1049-3301",
  bibdate =      "Tue Apr 21 08:08:16 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomacs.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3364218",
  abstract =     "Load imbalance has always been a crucial challenge in
                 Parallel Discrete Event Simulation (PDES). In the past
                 few years, we have witnessed an increased interest in
                 using multithreading PDES on multi/many-core platforms.
                 In multithreading PDES, migrating \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Modeling and Computer Simulation",
  journal-URL =  "https://dl.acm.org/loi/tomacs",
}

@Misc{Yee:2020:CMT,
  author =       "Alexander J. Yee",
  title =        "{{\tt y-cruncher}}: a multi-threaded pi-program",
  howpublished = "Web site",
  day =          "30",
  month =        mar,
  year =         "2020",
  bibdate =      "Tue Apr 21 16:09:31 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pi.bib",
  URL =          "http://www.numberworld.org/y-cruncher/",
  abstract =     "How fast can your computer compute Pi?\par

                 y-cruncher is a program that can compute Pi and other
                 constants to trillions of digits.\par

                 It is the first of its kind that is multi-threaded and
                 scalable to multi-core systems. Ever since its launch
                 in 2009, it has become a common benchmarking and
                 stress-testing application for overclockers and
                 hardware enthusiasts.\par

                 y-cruncher has been used to set several world records
                 for the most digits of Pi ever computed:\par

                 50 trillion digits - January 2020 (Timothy
                 Mullican)\par

                 31.4 trillion digits - January 2019 (Emma Haruka
                 Iwao)\par

                 22.4 trillion digits - November 2016 (Peter
                 Trueb)\par

                 13.3 trillion digits - October 2014 (Sandon Van Ness
                 ``houkouonchi'')\par

                 12.1 trillion digits - December 2013 (Shigeru
                 Kondo)\par

                 10 trillion digits - October 2011 (Shigeru Kondo)\par

                 5 trillion digits - August 2010 (Shigeru Kondo)",
  acknowledgement = ack-nhfb,
}

@Article{Yin:2020:SCA,
  author =       "L. Yin and W. Dong and W. Liu and J. Wang",
  title =        "On Scheduling Constraint Abstraction for
                 Multi-Threaded Program Verification",
  journal =      j-IEEE-TRANS-SOFTW-ENG,
  volume =       "46",
  number =       "5",
  pages =        "549--565",
  year =         "2020",
  CODEN =        "IESEDJ",
  ISSN =         "0098-5589 (print), 1939-3520 (electronic)",
  ISSN-L =       "0098-5589",
  bibdate =      "Thu Sep 17 07:36:32 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Software Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32",
}

@Article{Akbari:2021:EMT,
  author =       "Amir Akbari and Dennis Giannacopoulos",
  title =        "An efficient multi-threaded {Newton--Raphson}
                 algorithm for strong coupling modeling of multi-physics
                 problems",
  journal =      j-COMP-PHYS-COMM,
  volume =       "258",
  number =       "??",
  pages =        "Article 107563",
  month =        jan,
  year =         "2021",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2020.107563",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Sat Mar 13 08:21:40 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/compphyscomm2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465520302708",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Arman:2021:OHP,
  author =       "Arif Arman and Dmitri Loguinov",
  title =        "{Origami}: a high-performance mergesort framework",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "2",
  pages =        "259--271",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3489496.3489507",
  ISSN =         "2150-8097",
  bibdate =      "Sat Feb 5 06:26:54 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3489496.3489507",
  abstract =     "Mergesort is a popular algorithm for sorting
                 real-world workloads as it is immune to data skewness,
                 suitable for parallelization using vectorized
                 intrinsics, and relatively simple to multi-thread. In
                 this paper, we introduce Origami, an in-memory merge-.
                 \ldots{}",
  acknowledgement = ack-nhfb,
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Arslan:2021:ESR,
  author =       "Sanem Arslan and Osman Unsal",
  title =        "Efficient selective replication of critical code
                 regions for {SDC} mitigation leveraging redundant
                 multithreading",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "77",
  number =       "12",
  pages =        "14130--14160",
  month =        dec,
  year =         "2021",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-021-03804-6",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Mon Feb 28 16:44:31 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jsuper2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://link.springer.com/article/10.1007/s11227-021-03804-6",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Supercomputing",
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Baumann:2021:CBV,
  author =       "Pascal Baumann and Rupak Majumdar and Ramanathan S.
                 Thinniyam and Georg Zetzsche",
  title =        "Context-bounded verification of liveness properties
                 for multithreaded shared-memory programs",
  journal =      j-PACMPL,
  volume =       "5",
  number =       "POPL",
  pages =        "44:1--44:31",
  month =        jan,
  year =         "2021",
  DOI =          "https://doi.org/10.1145/3434325",
  bibdate =      "Tue Mar 30 08:10:58 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pacmpl.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3434325",
  abstract =     "We study context-bounded verification of liveness
                 properties of multi-threaded, shared-memory programs,
                 where each thread can spawn additional threads. Our
                 main result shows that context-bounded fair termination
                 is decidable for the model; context-. \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "44",
  fjournal =     "Proceedings of the ACM on Programming Languages",
  journal-URL =  "https://pacmpl.acm.org/",
}

@Article{Carroll:2021:ELT,
  author =       "Shane Carroll and Wei-ming Lin",
  title =        "Exploiting Long-Term Temporal Cache Access Patterns
                 for {LRU} Insertion Prioritization",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "31",
  number =       "02",
  pages =        "??--??",
  month =        jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1142/S0129626421500109",
  ISSN =         "0129-6264 (print), 1793-642X (electronic)",
  ISSN-L =       "0129-6264",
  bibdate =      "Thu Feb 17 06:50:36 MST 2022",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib",
  URL =          "https://www.worldscientific.com/doi/10.1142/S0129626421500109",
  abstract =     "In a CPU cache utilizing least recently used (LRU)
                 replacement, cache sets manage a buffer which orders
                 all cache lines in the set from LRU to most recently
                 used (MRU). When a cache line is brought into cache, it
                 is placed at the MRU and the LRU line is evicted. When
                 re-accessed, a line is promoted to the MRU position.
                 LRU replacement provides a simple heuristic to predict
                 the optimal cache line to evict. However, LRU utilizes
                 only simple, short-term access patterns. In this paper,
                 we propose a method that uses a buffer called the
                 history queue to record longer-term access-eviction
                 patterns than the LRU buffer can capture. Using this
                 information, we make a simple modification to LRU
                 insertion policy such that recently-recalled blocks
                 have priority over others. As lines are evicted, their
                 addresses are recorded in a FIFO history queue.
                 Incoming lines that have recently been evicted and now
                 recalled (those in the history queue at recall time)
                 remain in the MRU for an extended period of time as
                 non-recalled lines entering the cache thereafter are
                 placed below the MRU. We show that the proposed LRU
                 insertion prioritization increases performance in
                 single-threaded and multi-threaded workloads in
                 simulations with simple adjustments to baseline LRU.",
  acknowledgement = ack-nhfb,
  articleno =    "2150010",
  fjournal =     "Parallel Processing Letters",
  journal-URL =  "http://www.worldscientific.com/loi/ppl",
}

@Article{Cheikh:2021:KDV,
  author =       "A. Cheikh and S. Sordillo and A. Mastrandrea and F.
                 Menichelli and G. Scotti and M. Olivieri",
  title =        "{Klessydra-T}: Designing Vector Coprocessors for
                 Multithreaded Edge-Computing Cores",
  journal =      j-IEEE-MICRO,
  volume =       "41",
  number =       "2",
  pages =        "64--71",
  month =        mar # "\slash " # apr,
  year =         "2021",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2021.3050962",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Thu Apr 1 10:32:23 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hot-chips.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

@Article{Conoci:2021:PCP,
  author =       "Stefano Conoci and Pierangelo {Di Sanzo} and
                 Alessandro Pellegrini and Bruno Ciciani and Francesco
                 Quaglia",
  title =        "On power capping and performance optimization of
                 multithreaded applications",
  journal =      j-CCPE,
  volume =       "33",
  number =       "13",
  pages =        "e6205:1--e6205:??",
  day =          "10",
  month =        jul,
  year =         "2021",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.6205",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Feb 22 09:49:54 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ccpe2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Concurrency Computat., Pract. Exper.",
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "27 January 2021",
}

@Article{Kozicky:2021:JDT,
  author =       "Claudio Kozick{\'y} and Ivan {\v{S}}ime{\v{c}}ek",
  title =        "Joint direct and transposed sparse matrix-vector
                 multiplication for multithreaded {CPUs}",
  journal =      j-CCPE,
  volume =       "33",
  number =       "13",
  pages =        "e6236:1--e6236:??",
  day =          "10",
  month =        jul,
  year =         "2021",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.6236",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Feb 22 09:49:54 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ccpe2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Concurrency Computat., Pract. Exper.",
  fjournal =     "Concurrency and Computation: Practice and Experience",
  journal-URL =  "http://www.interscience.wiley.com/jpages/1532-0626",
  onlinedate =   "22 February 2021",
}

@Article{Li:2021:MEC,
  author =       "Botao Li and Synge Todo and A. C. Maggs and Werner
                 Krauth",
  title =        "Multithreaded event-chain {Monte Carlo} with local
                 times",
  journal =      j-COMP-PHYS-COMM,
  volume =       "261",
  number =       "??",
  pages =        "Article 107702",
  month =        apr,
  year =         "2021",
  CODEN =        "CPHCBZ",
  DOI =          "https://doi.org/10.1016/j.cpc.2020.107702",
  ISSN =         "0010-4655 (print), 1879-2944 (electronic)",
  ISSN-L =       "0010-4655",
  bibdate =      "Sat Mar 13 08:21:42 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/compphyscomm2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0010465520303453",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer Physics Communications",
  journal-URL =  "http://www.sciencedirect.com/science/journal/00104655",
}

@Article{Ma:2021:RTB,
  author =       "Xiaoxue Ma and Shangru Wu and Ernest Pobee and Xiupei
                 Mei and Hao Zhang and Bo Jiang and Wing-Kwong Chan",
  title =        "{RegionTrack}: a Trace-Based Sound and Complete
                 Checker to Debug Transactional Atomicity Violations and
                 Non-Serializable Traces",
  journal =      j-TOSEM,
  volume =       "30",
  number =       "1",
  pages =        "7:1--7:49",
  month =        jan,
  year =         "2021",
  CODEN =        "ATSMER",
  DOI =          "https://doi.org/10.1145/3412377",
  ISSN =         "1049-331X (print), 1557-7392 (electronic)",
  ISSN-L =       "1049-331X",
  bibdate =      "Fri Jan 22 07:02:14 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tosem.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3412377",
  abstract =     "Atomicity is a correctness criterion to reason about
                 isolated code regions in a multithreaded program when
                 they are executed concurrently. However, dynamic
                 instances of these code regions, called transactions,
                 may fail to behave atomically, resulting in \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Software Engineering and
                 Methodology",
  journal-URL =  "https://dl.acm.org/loi/tosem",
}

@Article{Mattson:2021:PPM,
  author =       "Timothy G. Mattson and Todd A. Anderson and Giorgis
                 Georgakoudis",
  title =        "\pkg{PyOMP}: Multithreaded Parallel Programming in
                 {Python}",
  journal =      j-COMPUT-SCI-ENG,
  volume =       "23",
  number =       "6",
  pages =        "77--80",
  month =        nov # "\slash " # dec,
  year =         "2021",
  CODEN =        "CSENFA",
  DOI =          "https://doi.org/10.1109/MCSE.2021.3128806",
  ISSN =         "1521-9615 (print), 1558-366X (electronic)",
  ISSN-L =       "1521-9615",
  bibdate =      "Mon Jan 31 16:30:09 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/computscieng.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/python.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Computing in Science and Engineering",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992",
}

@Article{Metzger:2021:DHT,
  author =       "Paul Metzger and Volker Seeker and Christian Fensch
                 and Murray Cole",
  title =        "Device Hopping: Transparent Mid-Kernel Runtime
                 Switching for Heterogeneous Systems",
  journal =      j-TACO,
  volume =       "18",
  number =       "4",
  pages =        "57:1--57:25",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3471909",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 4 07:14:07 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/taco.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3471909",
  abstract =     "Existing OS techniques for homogeneous many-core
                 systems make it simple for single and multithreaded
                 applications to migrate between cores. Heterogeneous
                 systems do not benefit so fully from this flexibility,
                 and applications that cannot migrate in mid-.
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "57",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "https://dl.acm.org/loi/taco",
}

@Article{Nagler:2021:CSR,
  author =       "Thomas Nagler",
  title =        "Code Snippet: {R}-Friendly Multi-Threading in {C++}",
  journal =      j-J-STAT-SOFT,
  volume =       "97",
  number =       "??",
  pages =        "??--??",
  month =        "????",
  year =         "2021",
  CODEN =        "JSSOBK",
  DOI =          "https://doi.org/10.18637/jss.v97.c01",
  ISSN =         "1548-7660",
  ISSN-L =       "1548-7660",
  bibdate =      "Wed May 19 07:43:42 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jstatsoft.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://www.jstatsoft.org/index.php/jss/article/view/v097c01;
                 https://www.jstatsoft.org/index.php/jss/article/view/v097c01/v97c01.pdf",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://www.jstatsoft.org/",
}

@Article{Romanous:2021:ELL,
  author =       "Bashar Romanous and Skyler Windh and Vassilis
                 Tsotras",
  title =        "Efficient local locking for massively multithreaded
                 in-memory hash-based operators",
  journal =      j-VLDB-J,
  volume =       "30",
  number =       "3",
  pages =        "333--359",
  month =        may,
  year =         "2021",
  CODEN =        "VLDBFR",
  DOI =          "https://doi.org/10.1007/s00778-020-00642-5",
  ISSN =         "1066-8888 (print), 0949-877X (electronic)",
  ISSN-L =       "1066-8888",
  bibdate =      "Sat Apr 9 10:33:58 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/vldbj.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://link.springer.com/article/10.1007/s00778-020-00642-5",
  acknowledgement = ack-nhfb,
  ajournal =     "VLDB J.",
  fjournal =     "VLDB Journal: Very Large Data Bases",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J869",
}

@Article{Sonenberg:2021:PAW,
  author =       "Nikki Sonenberg and Grzegorz Kielanski and Benny {Van
                 Houdt}",
  title =        "Performance Analysis of Work Stealing in Large-scale
                 Multithreaded Computing",
  journal =      j-TOMPECS,
  volume =       "6",
  number =       "2",
  pages =        "6:1--6:28",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3470887",
  ISSN =         "2376-3639 (print), 2376-3647 (electronic)",
  ISSN-L =       "2376-3639",
  bibdate =      "Wed Mar 2 06:32:09 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tompecs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3470887",
  abstract =     "Randomized work stealing is used in distributed
                 systems to increase performance and improve resource
                 utilization. In this article, we consider randomized
                 work stealing in a large system of homogeneous
                 processors where parent jobs spawn child jobs that can
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Modeling and Performance
                 Evaluation of Computing Systems (TOMPECS)",
  journal-URL =  "https://dl.acm.org/loi/tompecs",
}

@Article{Steele:2021:PLB,
  author =       "Guy L. {Steele Jr.} and Sebastiano Vigna",
  title =        "\pkg{LXM}: better splittable pseudorandom number
                 generators (and almost as fast)",
  journal =      j-PACMPL,
  volume =       "5",
  number =       "OOPSLA",
  pages =        "148:1--148:31",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3485525",
  ISSN =         "2475-1421 (electronic)",
  ISSN-L =       "2475-1421",
  bibdate =      "Wed Mar 2 07:00:43 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/pacmpl.bib;
                 https://www.math.utah.edu/pub/tex/bib/prng.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3485525",
  abstract =     "In 2014, Steele, Lea, and Flood presented SplitMix, an
                 object-oriented pseudorandom number generator (prng)
                 that is quite fast (9 64-bit arithmetic/logical
                 operations per 64 bits generated) and also splittable.
                 A conventional prng object provides a generate method
                 that returns one pseudorandom value and updates the
                 state of the prng; a splittable prng object also has a
                 second operation, split, that replaces the original
                 prng object with two (seemingly) independent prng
                 objects, by creating and returning a new such object
                 and updating the state of the original object.
                 Splittable prng objects make it easy to organize the
                 use of pseudorandom numbers in multithreaded programs
                 structured using fork-join parallelism. This overall
                 strategy still appears to be sound, but the specific
                 arithmetic calculation used for generate in the
                 SplitMix algorithm has some detectable weaknesses, and
                 the period of any one generator is limited to
                  {$2^{64}$}.\par

                 Here we present the LXM family of prng algorithms. The
                 idea is an old one: combine the outputs of two
                 independent prng algorithms, then (optionally) feed the
                 result to a mixing function. An LXM algorithm uses a
                  linear congruential subgenerator and an {$F_2$}-linear
                  subgenerator; the examples studied in this paper use a
                  linear congruential generator (LCG) of period {$2^{16}$},
                  {$2^{32}$}, {$2^{64}$}, or {$2^{128}$} with one of the
                  multipliers recommended by L'Ecuyer or by Steele and
                  Vigna, and an {$F_2$}-linear
                 xor-based generator (XBG) of the xoshiro family or
                 xoroshiro family as described by Blackman and Vigna.
                 For mixing functions we study the MurmurHash3 finalizer
                 function; variants by David Stafford, Doug Lea, and
                 degski; and the null (identity) mixing
                 function.\par

                 Like SplitMix, LXM provides both a generate operation
                 and a split operation. Also like SplitMix, LXM requires
                 no locking or other synchronization (other than the
                 usual memory fence after instance initialization), and
                 is suitable for use with simd instruction sets because
                 it has no branches or loops.\par

                 We analyze the period and equidistribution properties
                 of LXM generators, and present the results of thorough
                 testing of specific members of this family, using the
                 TestU01 and PractRand test suites, not only on single
                 instances of the algorithm but also for collections of
                 instances, used in parallel, ranging in size from 2 to
                  {$2^{24}$}. Single instances of LXM that include a strong
                 mixing function appear to have no major weaknesses, and
                 LXM is significantly more robust than SplitMix against
                 accidental correlation in a multithreaded setting. We
                 believe that LXM, like SplitMix, is suitable for
                 ``everyday'' scientific and machine-learning
                 applications (but not cryptographic applications),
                 especially when concurrent threads or distributed
                 processes are involved.",
  acknowledgement = ack-nhfb,
  articleno =    "148",
  fjournal =     "Proceedings of the ACM on Programming Languages
                 (PACMPL)",
  journal-URL =  "https://dl.acm.org/loi/pacmpl",
}

@Article{Tang:2021:MMR,
  author =       "Xulong Tang and Mahmut Taylan Kandemir and Mustafa
                 Karakoy",
  title =        "Mix and Match: Reorganizing Tasks for Enhancing Data
                 Locality",
  journal =      j-POMACS,
  volume =       "5",
  number =       "2",
  pages =        "20:1--20:24",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3460087",
  ISSN =         "2476-1249",
  ISSN-L =       "2476-1249",
  bibdate =      "Wed Mar 2 06:36:38 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/pomacs.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3460087",
  abstract =     "Application programs that exhibit strong locality of
                 reference lead to minimized cache misses and better
                 performance in different architectures. However, to
                 maximize the performance of multithreaded applications
                 running on emerging manycore systems, \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "Proceedings of the ACM on Measurement and Analysis of
                 Computing Systems (POMACS)",
  journal-URL =  "https://dl.acm.org/loi/pomacs",
}

@Article{Tao:2021:CDS,
  author =       "Xiaohan Tao and Jianmin Pang and Yu Zhu",
  title =        "Compiler-directed scratchpad memory data transfer
                 optimization for multithreaded applications on a
                 heterogeneous many-core architecture",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "77",
  number =       "12",
  pages =        "14502--14524",
  month =        dec,
  year =         "2021",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-021-03853-x",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Mon Feb 28 16:44:31 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jsuper2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://link.springer.com/article/10.1007/s11227-021-03853-x",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Supercomputing",
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Anju:2022:MID,
  author =       "M. A. Anju and Rupesh Nasre",
  title =        "Multi-Interval {DomLock}: Toward Improving Concurrency
                 in Hierarchies",
  journal =      j-TOPC,
  volume =       "9",
  number =       "3",
  pages =        "12:1--12:27",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3543543",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Sep 20 09:34:53 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3543543",
  abstract =     "Locking has been a predominant technique depended upon
                 for achieving thread synchronization and ensuring
                 correctness in multi-threaded applications. It has been
                 established that the concurrent applications working
                 with hierarchical data witness \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Cheng:2022:EMA,
  author =       "Jianyi Cheng and Shane T. Fleming and Yu Ting Chen and
                 Jason Anderson and John Wickerson and George A.
                 Constantinides",
  title =        "Efficient Memory Arbitration in High-Level Synthesis
                 From Multi-Threaded Code",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "71",
  number =       "4",
  pages =        "933--946",
  month =        apr,
  year =         "2022",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2021.3066466",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Thu Mar 17 06:38:17 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Feliu:2022:VVM,
  author =       "Josu{\'e} Feliu and Ajeya Naithani and Julio
                 Sahuquillo and Salvador Petit and Moinuddin Qureshi and
                 Lieven Eeckhout",
  title =        "{VMT}: Virtualized Multi-Threading for Accelerating
                 Graph Workloads on Commodity Processors",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "71",
  number =       "6",
  pages =        "1386--1398",
  month =        jun,
  year =         "2022",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2021.3086069",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Wed May 25 09:41:19 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Inverso:2022:BVM,
  author =       "Omar Inverso and Ermenegildo Tomasco and Bernd Fischer
                 and Salvatore {La Torre} and Gennaro Parlato",
  title =        "Bounded Verification of Multi-threaded Programs via
                 Lazy Sequentialization",
  journal =      j-TOPLAS,
  volume =       "44",
  number =       "1",
  pages =        "1:1--1:50",
  month =        mar,
  year =         "2022",
  CODEN =        "ATPSDT",
  DOI =          "https://doi.org/10.1145/3478536",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Fri Jan 14 06:53:13 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/toplas.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3478536",
  abstract =     "Bounded verification techniques such as bounded model
                 checking (BMC) have successfully been used for many
                 practical program analysis problems, but concurrency
                 still poses a challenge. Here, we describe a new
                 approach to BMC of sequentially consistent \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Programming Languages and
                 Systems",
  journal-URL =  "https://dl.acm.org/loi/toplas",
}

@Article{Kelefouras:2022:WSM,
  author =       "Vasilios Kelefouras and Karim Djemame",
  title =        "Workflow simulation and multi-threading aware task
                 scheduling for heterogeneous computing",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "168",
  number =       "??",
  pages =        "17--32",
  month =        oct,
  year =         "2022",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1016/j.jpdc.2022.05.011",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Sat Jul 16 10:35:47 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jpardistcomp2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0743731522001265",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Korndorfer:2022:LDL,
  author =       "Jonas H. M{\"u}ller Kornd{\"o}rfer and Ahmed Eleliemy
                 and Ali Mohammed and Florina M. Ciorba",
  title =        "{LB4OMP}: a Dynamic Load Balancing Library for
                 Multithreaded Applications",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "33",
  number =       "4",
  pages =        "830--841",
  month =        apr,
  year =         "2022",
  CODEN =        "ITDSEO",
  DOI =          "https://doi.org/10.1109/TPDS.2021.3107775",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Nov 11 08:39:34 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71",
}

@Article{Minutoli:2022:PSH,
  author =       "Marco Minutoli and Vito Giovanni Castellana and Nicola
                 Saporetti and Stefano Devecchi and Marco Lattuada and
                 Pietro Fezzardi and Antonino Tumeo and Fabrizio
                 Ferrandi",
  title =        "\pkg{Svelto}: High-Level Synthesis of Multi-Threaded
                 Accelerators for Graph Analytics",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "71",
  number =       "3",
  pages =        "520--533",
  month =        mar,
  year =         "2022",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2021.3057860",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Thu Feb 17 08:09:56 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Myllykoski:2022:ATB,
  author =       "Mirko Myllykoski",
  title =        "{Algorithm 1019}: a Task-based Multi-shift {$ Q R $
                 \slash $ Q Z $} Algorithm with Aggressive Early
                 Deflation",
  journal =      j-TOMS,
  volume =       "48",
  number =       "1",
  pages =        "11:1--11:36",
  month =        mar,
  year =         "2022",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/3495005",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Thu Feb 17 08:00:57 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/toms.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3495005",
  abstract =     "The $ Q R $ algorithm is one of the three phases in
                 the process of computing the eigenvalues and the
                 eigenvectors of a dense nonsymmetric matrix. This paper
                 describes a task-based $ Q R $ algorithm for reducing
                 an upper Hessenberg matrix to real Schur form. The
                 task-based algorithm also supports generalized
                 eigenvalue problems ($ Q Z $ algorithm) but this paper
                 concentrates on the standard case. The task-based
                 algorithm adopts previous algorithmic improvements,
                 such as tightly-coupled multi-shifts and Aggressive
                 Early Deflation (AED), and also incorporates several
                 new ideas that significantly improve the performance.
                 This includes, but is not limited to, the elimination
                 of several synchronization points, the dynamic merging
                 of previously separate computational steps, the
                 shortening and the prioritization of the critical path,
                 and experimental GPU support. The task-based
                 implementation is demonstrated to be multiple times
                 faster than multi-threaded LAPACK and ScaLAPACK in both
                 single-node and multi-node configurations on two
                 different machines based on Intel and AMD CPUs. The
                 implementation is built on top of the StarPU runtime
                 system and is part of the open-source StarNEig
                 library.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "https://dl.acm.org/loi/toms",
}

@Article{Perrin:2022:EWF,
  author =       "Matthieu Perrin and Achour Most{\'e}faoui and Ludmila
                 Courtillat-Piazza",
  title =        "Extending the wait-free hierarchy to multi-threaded
                 systems",
  journal =      j-DISTRIB-COMPUT,
  volume =       "35",
  number =       "4",
  pages =        "375--398",
  month =        aug,
  year =         "2022",
  CODEN =        "DICOEB",
  DOI =          "https://doi.org/10.1007/s00446-022-00425-x",
  ISSN =         "0178-2770 (print), 1432-0452 (electronic)",
  ISSN-L =       "0178-2770",
  bibdate =      "Mon Aug 1 08:49:35 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/distribcomput.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://link.springer.com/article/10.1007/s00446-022-00425-x",
  acknowledgement = ack-nhfb,
  ajournal =     "Distrib. comput.",
  fjournal =     "Distributed Computing",
  journal-URL =  "https://link.springer.com/journal/446",
}

@Article{Pons:2022:EHT,
  author =       "Luc{\'{\i}}a Pons and Josu{\'e} Feliu and Jos{\'e}
                 Puche and Chaoyi Huang and Salvador Petit and Julio
                 Pons and Mar{\'{\i}}a E. G{\'o}mez and Julio
                 Sahuquillo",
  title =        "Effect of Hyper-Threading in Latency-Critical
                 Multithreaded Cloud Applications and Utilization
                 Analysis of the Major System Resources",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "131",
  number =       "??",
  pages =        "194--208",
  month =        jun,
  year =         "2022",
  CODEN =        "FGSEVI",
  DOI =          "https://doi.org/10.1016/j.future.2022.01.025",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Wed Mar 9 17:27:32 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/futgencompsys2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0167739X22000334",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  journal-URL =  "http://www.sciencedirect.com/science/journal/0167739X",
}

@Article{Raad:2022:EIX,
  author =       "Azalea Raad and Luc Maranget and Viktor Vafeiadis",
  title =        "Extending {Intel-x86} consistency and persistency:
                 formalising the semantics of {Intel-x86} memory types
                 and non-temporal stores",
  journal =      j-PACMPL,
  volume =       "6",
  number =       "POPL",
  pages =        "22:1--22:31",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3498683",
  ISSN =         "2475-1421 (electronic)",
  ISSN-L =       "2475-1421",
  bibdate =      "Thu May 26 06:32:48 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/pacmpl.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3498683",
  abstract =     "Existing semantic formalisations of the Intel-x86
                 architecture cover only a small fragment of its
                 available features that are relevant for the
                 consistency semantics of multi-threaded programs as
                 well as the persistency semantics of programs
                 interfacing \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "Proceedings of the ACM on Programming Languages
                 (PACMPL)",
  journal-URL =  "https://dl.acm.org/loi/pacmpl",
}

@Article{Ritchie:2022:DPF,
  author =       "Robert Ritchie and Khodakhast Bibak",
  title =        "\pkg{DOTMIX-Pro}: faster and more efficient variants
                 of {DOTMIX} for dynamic-multithreading platforms",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "78",
  number =       "1",
  pages =        "945--961",
  month =        jan,
  year =         "2022",
  CODEN =        "JOSUED",
  DOI =          "https://doi.org/10.1007/s11227-021-03904-3",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Mon Feb 28 16:44:33 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jsuper2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://link.springer.com/article/10.1007/s11227-021-03904-3",
  acknowledgement = ack-nhfb,
  ajournal =     "J. Supercomputing",
  fjournal =     "The Journal of Supercomputing",
  journal-URL =  "http://link.springer.com/journal/11227",
}

@Article{Rodriguez:2022:EHB,
  author =       "Alfonso Rodr{\'\i}guez and Andr{\'e}s Otero and Marco
                 Platzner and Eduardo de la Torre",
  title =        "Exploiting Hardware-Based Data-Parallel and
                 Multithreading Models for Smart Edge Computing in
                 Reconfigurable {FPGAs}",
  journal =      j-IEEE-TRANS-COMPUT,
  volume =       "71",
  number =       "11",
  pages =        "2903--2914",
  month =        nov,
  year =         "2022",
  CODEN =        "ITCOB4",
  DOI =          "https://doi.org/10.1109/TC.2021.3107196",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  bibdate =      "Thu Oct 27 15:52:25 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Computers",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
}

@Article{Schwab:2022:SSV,
  author =       "Michail Schwab and David Saffo and Nicholas Bond and
                 Shash Sinha and Cody Dunne and Jeff Huang and James
                 Tompkin and Michelle A. Borkin",
  title =        "Scalable Scalable Vector Graphics: Automatic
                 Translation of Interactive {SVGs} to a Multithread
                 {VDOM} for Fast Rendering",
  journal =      j-IEEE-TRANS-VIS-COMPUT-GRAPH,
  volume =       "28",
  number =       "9",
  pages =        "3219--3234",
  month =        sep,
  year =         "2022",
  CODEN =        "ITVGEA",
  DOI =          "https://doi.org/10.1109/TVCG.2021.3059294",
  ISSN =         "1077-2626",
  ISSN-L =       "1077-2626",
  bibdate =      "Thu Aug 4 06:28:31 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeetransviscomputgraph2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Visualization and Computer
                 Graphics",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2945",
}

@Article{Trotter:2022:MTO,
  author =       "James D. Trotter and Xing Cai and Simon W. Funke",
  title =        "On Memory Traffic and Optimisations for Low-order
                 Finite Element Assembly Algorithms on Multi-core
                 {CPUs}",
  journal =      j-TOMS,
  volume =       "48",
  number =       "2",
  pages =        "19:1--19:31",
  month =        jun,
  year =         "2022",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/3503925",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Wed Jul 20 07:04:17 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/toms.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3503925",
  abstract =     "Motivated by the wish to understand the achievable
                 performance of finite element assembly on unstructured
                 computational meshes, we dissect the standard cellwise
                 assembly algorithm into four kernels, two of which are
                 dominated by irregular memory traffic. Several
                 optimisation schemes are studied together with
                 associated lower and upper bounds on the estimated
                 memory traffic volume. Apart from properly reordering
                 the mesh entities, the two most significant
                 optimisations include adopting a lookup table in adding
                 element matrices or vectors to their global
                 counterparts, and using a row-wise assembly algorithm
                 for multi-threaded parallelisation. Rigorous
                 benchmarking shows that, due to the various
                 optimisations, the actual volumes of memory traffic are
                 in many cases very close to the estimated lower bounds.
                 These results confirm the effectiveness of the
                 optimisations, while also providing a recipe for
                 developing efficient software for finite element
                 assembly.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "https://dl.acm.org/loi/toms",
}

@Article{Wang:2022:ASM,
  author =       "Zhe Wang and Chen Xu and Kunal Agrawal and Jing Li",
  title =        "Adaptive scheduling of multiprogrammed
                 dynamic-multithreading applications",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "162",
  number =       "??",
  pages =        "76--88",
  month =        apr,
  year =         "2022",
  CODEN =        "JPDCER",
  DOI =          "https://doi.org/10.1016/j.jpdc.2022.01.009",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Feb 10 06:39:27 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sciencedirect.com/science/article/pii/S0743731522000144",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
  journal-URL =  "http://www.sciencedirect.com/science/journal/07437315",
}

@Article{Zou:2022:BSP,
  author =       "Changwei Zou and Xudong Wang and Yaoqing Gao and
                 Jingling Xue",
  title =        "Buddy Stacks: Protecting Return Addresses with
                 Efficient Thread-Local Storage and Runtime
                 Re-Randomization",
  journal =      j-TOSEM,
  volume =       "31",
  number =       "2",
  pages =        "35e:1--35e:37",
  month =        apr,
  year =         "2022",
  CODEN =        "ATSMER",
  DOI =          "https://doi.org/10.1145/3494516",
  ISSN =         "1049-331X (print), 1557-7392 (electronic)",
  ISSN-L =       "1049-331X",
  bibdate =      "Tue May 24 07:09:20 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tosem.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3494516",
  abstract =     "Shadow stacks play an important role in protecting
                 return addresses to mitigate ROP attacks. Parallel
                 shadow stacks, which shadow the call stack of each
                 thread at the same constant offset for all threads, are
                 known not to support multi-threading well. On
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "35e",
  fjournal =     "ACM Transactions on Software Engineering and
                 Methodology",
  journal-URL =  "https://dl.acm.org/loi/tosem",
}

%%% ====================================================================
%%% Cross-referenced entries must come last:
@Proceedings{IEEE:1989:WOS,
  editor =       "{IEEE}",
  booktitle =    "Workstation Operating Systems: Proceedings of the
                 Second Workshop on Workstation Operating Systems
                 (WWOS-II), Pacific Grove, CA, USA, September 27--29,
                 1989",
  title =        "Workstation Operating Systems: Proceedings of the
                 Second Workshop on Workstation Operating Systems
                 ({WWOS}-{II}), Pacific Grove, {CA}, {USA}, September
                 27--29, 1989",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xi + 134",
  year =         "1989",
  bibdate =      "Sat Sep 28 20:21:01 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/mach.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "IEEE catalog number 89TH0281-6.",
  acknowledgement = ack-nhfb,
  classification = "B0100 (General electrical engineering topics);
                 B6210L (Computer communications); C5430
                 (Microcomputers); C5620 (Computer networks and
                 techniques); C5630 (Networking equipment); C6120 (File
                 organisation); C6150J (Operating systems); C6155
                 (Computer communications software)",
  confsponsor =  "IEEE",
  keywords =     "AIX3; At-most-once message; Coda file system; Echo
                 distributed file system; Fault-tolerant multiprocessor
                 workstations; File implementation; File-server
                 statelessness; Global communication interface; Guide
                 operating system; Large-scale applications; Mach;
                 Multimedia applications; Object-oriented environments;
                 Open operating system; Parallel algorithms; PLURIX;
                 PROST; Prototype information environment; Raven
                 project; Replicated servers; Shared memory; Sprite;
                 Synchronized clocks; Ubik database; Very large
                 distributed systems; Virtual memory; Virtual systems;
                 Workstation networks; Workstation-network communication
                 interface; X-kernel",
  thesaurus =    "Computer communications software; Computer networks;
                 File organisation; File servers; Operating systems
                 [computers]; Workstations",
}

@Proceedings{USENIX:1989:PWU,
  editor =       "{USENIX Association}",
  booktitle =    "Proceedings of the Winter 1989 {USENIX} Conference:
                 January 30--February 3, 1989, San Diego, California,
                 {USA}",
  title =        "Proceedings of the Winter 1989 {USENIX} Conference:
                 January 30--February 3, 1989, San Diego, California,
                 {USA}",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "x + 471",
  year =         "1989",
  bibdate =      "Sun Feb 18 07:46:09 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "UNIX (Computer operating system) --- Congresses.",
}

@Proceedings{Anonymous:1990:PWU,
  editor =       "Anonymous",
  booktitle =    "Proceedings of the Winter 1990 USENIX Conference,
                 Washington, DC, USA, January 22--26, 1990",
  title =        "Proceedings of the Winter 1990 {USENIX} Conference,
                 Washington, {DC}, {USA}, January 22--26, 1990",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "xvi + 374",
  year =         "1990",
  bibdate =      "Sat Sep 28 20:03:34 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{IEEE:1990:PSN,
  editor =       "{IEEE}",
  booktitle =    "Proceedings, Supercomputing '90: November 12--16,
                 1990, New York Hilton at Rockefeller Center, New York,
                 New York",
  title =        "Proceedings, Supercomputing '90: November 12--16,
                 1990, New York Hilton at Rockefeller Center, New York,
                 New York",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xxv + 982",
  year =         "1990",
  ISBN =         "0-8186-2056-0 (paperback: IEEE Computer Society),
                 0-89791-412-0 (paperback: ACM)",
  ISBN-13 =      "978-0-8186-2056-0 (paperback: IEEE Computer Society),
                 978-0-89791-412-3 (paperback: ACM)",
  LCCN =         "QA 76.88 S87 1990",
  bibdate =      "Wed Aug 28 06:48:31 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 University of California MELVYL catalog.",
  note =         "ACM order number 415903. IEEE Computer Society Press
                 order number 2056. IEEE catalog number 90CH2916-5.",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessor systems and techniques); C5470
                 (Performance evaluation and testing); C6110 (Systems
                 analysis and programming); C7000 (Computer
                 applications)",
  keywords =     "biological applications; computer applications;
                 computer chess; innovative architectures; linear
                 algebra algorithms; memory; networking computing;
                 parallel languages; parallel processing; particle
                 transport; partitioning; performance evaluation;
                 performance visualizations; pipeline processing;
                 program analysis; program restructuring; scheduling;
                 supercomputers --- congresses; vector algorithms",
}

@Proceedings{Anonymous:1991:PIS,
  editor =       "Anonymous",
  booktitle =    "{Proceedings of the International Symposium on
                 Supercomputing: Fukuoka, Japan, November 6--8, 1991}",
  title =        "{Proceedings of the International Symposium on
                 Supercomputing: Fukuoka, Japan, November 6--8, 1991}",
  publisher =    "Kyushu University Press",
  address =      "Fukuoka, Japan",
  pages =        "iv + 261",
  year =         "1991",
  ISBN =         "4-87378-284-8",
  ISBN-13 =      "978-4-87378-284-3",
  LCCN =         "QA76.88.I1991",
  bibdate =      "Fri Aug 30 08:01:51 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Supercomputers --- Congresses",
}

@Proceedings{USENIX:1991:PUM,
  editor =       "{USENIX}",
  booktitle =    "Proceedings of the {USENIX} Mach Symposium: November
                 20--22, 1991, Monterey, California, USA",
  title =        "Proceedings of the {USENIX} Mach Symposium: November
                 20--22, 1991, Monterey, California, {USA}",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "262",
  year =         "1991",
  LCCN =         "QAX 27",
  bibdate =      "Sun Feb 18 07:46:09 MST 1996",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Memory management (Computer science) --- Congresses;
                 Operating systems (Computers) --- Congresses; UNIX
                 (Computer file) --- Congresses",
}

@Proceedings{USENIX:1991:PWU,
  editor =       "{USENIX}",
  key =          "USENIX-WINTER'91",
  booktitle =    "Proceedings of the Winter 1991 {USENIX} Conference:
                 January 21--January 25, 1991, Dallas, {TX}, {USA}",
  title =        "Proceedings of the Winter 1991 {USENIX} Conference:
                 January 21--January 25, 1991, Dallas, {TX}, {USA}",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "ix + 363",
  year =         "1991",
  LCCN =         "QA 76.76 O63 U84 1992",
  bibdate =      "Mon Jul 18 12:14:50 1994",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Computer networks --- Congresses; Operating systems
                 (Computers) --- Congresses; Programming (Electronic
                 computers) --- Congresses; UNIX (Computer file) ---
                 Congresses",
}

@Proceedings{Watt:1991:IPI,
  editor =       "Stephen M. Watt",
  booktitle =    "ISSAC '91: proceedings of the 1991 International
                 Symposium on Symbolic and Algebraic Computation, July
                 15--17, 1991, Bonn, Germany",
  title =        "{ISSAC} '91: proceedings of the 1991 International
                 Symposium on Symbolic and Algebraic Computation, July
                 15--17, 1991, Bonn, Germany",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xiii + 468",
  year =         "1991",
  ISBN =         "0-89791-437-6",
  ISBN-13 =      "978-0-89791-437-6",
  LCCN =         "QA 76.95 I59 1991",
  bibdate =      "Thu Sep 26 06:00:06 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/bibnet/authors/d/dirac-p-a-m.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The following topics were dealt with: algorithms for
                 symbolic mathematical computation; languages, systems
                 and packages; computational geometry, group theory and
                 number theory; automatic theorem proving and
                 programming; interface of symbolics, numerics and
                 graphics; applications in mathematics, science and
                 engineering; and symbolic and algebraic computation in
                 education.",
  acknowledgement = ack-nhfb,
  classification = "C1160 (Combinatorial mathematics); C4130
                 (Interpolation and function approximation); C4210
                 (Formal logic); C4240 (Programming and algorithm
                 theory); C7310 (Mathematics)",
  confdate =     "15--17 July 1991",
  conflocation = "Bonn, Germany",
  confsponsor =  "ACM",
  keywords =     "algebra --- data processing --- congresses; Algebraic
                 computation; Algorithms; Automatic theorem proving;
                 Computational geometry; Education; Engineering;
                 Graphics; Group theory; Languages; Mathematics;
                 mathematics --- data processing --- congresses; Number
                 theory; Programming; Science; Symbolic mathematical
                 computation; Symbolics",
  pubcountry =   "USA",
  thesaurus =    "Computational complexity; Formal languages;
                 Interpolation; Number theory; Polynomials; Symbol
                 manipulation",
}

@Proceedings{ACM:1992:CPI,
  editor =       "{ACM}",
  booktitle =    "Conference proceedings / 1992 International Conference
                 on Supercomputing, July 19--23, 1992, Washington, DC",
  title =        "Conference proceedings / 1992 International Conference
                 on Supercomputing, July 19--23, 1992, Washington,
                 {DC}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "x + 485",
  year =         "1992",
  ISBN =         "0-89791-485-6 (paperback), 0-89791-486-4",
  ISBN-13 =      "978-0-89791-485-7 (paperback), 978-0-89791-486-4",
  LCCN =         "QA 76.88 I57 1992",
  bibdate =      "Wed Aug 28 06:48:31 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 University of California MELVYL catalog.",
  note =         "Sponsored by ACM SIGARCH.",
  acknowledgement = ack-nhfb,
  keywords =     "supercomputers --- congresses",
}

@Proceedings{IEEE:1992:PSM,
  editor =       "{IEEE Computer Society. Technical Committee on
                 Computer Architecture}",
  booktitle =    "Proceedings, Supercomputing '92: Minneapolis,
                 Minnesota, November 16--20, 1992",
  title =        "Proceedings, Supercomputing '92: Minneapolis,
                 Minnesota, November 16--20, 1992",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xxiv + 848",
  year =         "1992",
  ISBN =         "0-8186-2632-1 (case), 0-8186-2630-5 (paper),
                 0-8186-2631-3 (microfiche), 0-89791-537-2 (ACM Library
                 series)",
  ISBN-13 =      "978-0-8186-2632-6 (case), 978-0-8186-2630-2 (paper),
                 978-0-8186-2631-9 (microfiche), 978-0-89791-537-3 (ACM
                 Library series)",
  LCCN =         "QA76.5 .S894 1992",
  bibdate =      "Wed Aug 28 06:48:31 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 University of California MELVYL catalog.",
  note =         "Cover title: Supercomputing '91. ACM order number
                 415922. IEEE Computer Society Press order number 2630.
                 IEEE catalog number 92CH3216-9.",
  acknowledgement = ack-nhfb,
  keywords =     "artificial intelligence; biosciences; cache;
                 compiling; distributed computing; fluids; industrial
                 modeling; instruction-level optimization;
                 interconnections; massively parallel systems;
                 multiprocessing programs; multiprocessing systems;
                 numerical applications; parallel algorithms; parallel
                 programming; parallelizing transformations; particles;
                 performance evaluation; performance methodology;
                 register efficiency; scheduling; sparse matrix
                 algorithms; supercomputers --- congresses; symbolic
                 algorithms; waves",
}

@Proceedings{USENIX:1992:PSU,
  editor =       "{USENIX}",
  booktitle =    "Proceedings of the Summer 1992 {USENIX} Conference:
                 June 8--12, 1992, San Antonio, Texas, USA",
  title =        "Proceedings of the Summer 1992 {USENIX} Conference:
                 June 8--12, 1992, San Antonio, Texas, {USA}",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "vii + 253",
  month =        "Summer",
  year =         "1992",
  ISBN =         "1-880446-44-8",
  ISBN-13 =      "978-1-880446-44-7",
  LCCN =         "QA 76.76 O63 U83 1992",
  bibdate =      "Wed Aug 13 10:48:45 MDT 1997",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  annote =       "Spine title: San Antonio conference proceedings.",
  keywords =     "UNIX (Computer operating system) --- Congresses",
  location =     "San Antonio, TX",
}

@Proceedings{USENIX:1992:SED,
  editor =       "{USENIX}",
  booktitle =    "Symposium on Experiences with Distributed and
                 Multiprocessor Systems (SEDMS III), March 26--27, 1992.
                 Newport Beach, CA",
  title =        "Symposium on Experiences with Distributed and
                 Multiprocessor Systems ({SEDMS III}), March 26--27,
                 1992. Newport Beach, {CA}",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "326",
  day =          "26--27",
  month =        mar,
  year =         "1992",
  ISBN =         "1-880446-41-3",
  ISBN-13 =      "978-1-880446-41-6",
  LCCN =         "QA76.9.D3 S954 1992",
  bibdate =      "Wed Oct 16 13:53:39 2002",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  location =     "Newport Beach, CA",
}

@Proceedings{ACM:1993:CRT,
  editor =       "{ACM}",
  key =          "ACM SIGPLAN POPL '93",
  booktitle =    "Conference record of the Twentieth Annual {ACM}
                 {SIGPLAN-SIGACT} Symposium on Principles of Programming
                 Languages: papers presented at the symposium,
                 {Charleston, South Carolina}, {January} 10--13, 1993",
  title =        "Conference record of the Twentieth Annual {ACM}
                 {SIGPLAN-SIGACT} Symposium on Principles of Programming
                 Languages: papers presented at the symposium,
                 {Charleston, South Carolina}, {January} 10--13, 1993",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "viii + 510",
  year =         "1993",
  ISBN =         "0-89791-560-7 (soft cover), 0-89791-561-5 (series hard
                 cover)",
  ISBN-13 =      "978-0-89791-560-1 (soft cover), 978-0-89791-561-8
                 (series hard cover)",
  LCCN =         "QA76.7 .A15 1993",
  bibdate =      "Mon May 03 18:38:48 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "ACM order number 549930.",
  URL =          "http://www.acm.org/pubs/contents/proceedings/plan/158511/index.html",
  acknowledgement = ack-nhfb,
  classification = "C4210 (Formal logic); C4240 (Programming and
                 algorithm theory); C6110 (Systems analysis and
                 programming); C6140D (High level languages); C6150C
                 (Compilers, interpreters and other processors); C6170
                 (Expert systems)",
  confdate =     "10-13 Jan. 1993",
  conflocation = "Charleston, SC, USA",
  confsponsor =  "ACM",
  keywords =     "Compilers; Computational complexity; electronic
                 digital computers --- programming --- congresses;
                 Functional programming; Lambda calculus; Lazy
                 evaluation; Logic programming; Object-oriented
                 languages; Parallel computing; Parametricity;
                 Polymorphism; Program testing/debugging; Programming
                 language principles; programming languages (electronic
                 computers) --- congresses; Register allocation; Typed
                 languages",
  thesaurus =    "Computational complexity; High level languages; Lambda
                 calculus; Program compilers; Programming; Programming
                 theory; Storage allocation",
}

@Proceedings{ACM:1993:PTF,
  editor =       "{ACM}",
  booktitle =    "{Proceedings of the twenty-fifth annual ACM Symposium
                 on the Theory of Computing, San Diego, California, May
                 16--18, 1993}",
  title =        "{Proceedings of the twenty-fifth annual ACM Symposium
                 on the Theory of Computing, San Diego, California, May
                 16--18, 1993}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "ix + 812",
  year =         "1993",
  ISBN =         "0-89791-591-7",
  ISBN-13 =      "978-0-89791-591-5",
  LCCN =         "QA 76.6 A13 1993",
  bibdate =      "Thu Dec 3 07:11:18 MST 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "ACM order no. 508930.",
  acknowledgement = ack-nhfb,
  keywords =     "computational complexity --- congresses",
}

@Proceedings{ACM:1993:TCS,
  editor =       "ACM",
  booktitle =    "TRI-Ada '93: Conference --- September 1993, Seattle,
                 WA",
  title =        "{TRI}-Ada '93: Conference --- September 1993, Seattle,
                 {WA}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "vii + 482",
  year =         "1993",
  ISBN =         "0-89791-621-2",
  ISBN-13 =      "978-0-89791-621-9",
  LCCN =         "????",
  bibdate =      "Thu Sep 04 12:56:10 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "ACM Order No. 825930.",
  series =       "TRIADA -proceedings- 1993",
  acknowledgement = ack-nhfb,
  sponsor =      "Association for Computing Machinery; SIGAda.",
}

@Proceedings{IEEE:1993:PSP,
  editor =       "{IEEE}",
  key =          "Supercomputing'93",
  booktitle =    "Proceedings, Supercomputing '93: Portland, Oregon,
                 November 15--19, 1993",
  title =        "Proceedings, Supercomputing '93: Portland, Oregon,
                 November 15--19, 1993",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xxii + 935",
  year =         "1993",
  ISBN =         "0-8186-4340-4 (paperback), 0-8186-4341-2 (microfiche),
                 0-8186-4342-0 (hardback), 0-8186-4346-3 (CD-ROM)",
  ISBN-13 =      "978-0-8186-4340-8 (paperback), 978-0-8186-4341-5
                 (microfiche), 978-0-8186-4342-2 (hardback),
                 978-0-8186-4346-0 (CD-ROM)",
  ISSN =         "1063-9535",
  LCCN =         "QA76.5 .S96 1993",
  bibdate =      "Mon Jan 15 11:06:21 1996",
  bibsource =    "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "631.1; 722.1; 722.3; 722.4; 723.2; 921.6",
  keywords =     "Algorithms; Cache coherence; Clustered workstations;
                 Computer graphics; Computer networks; Computer
                 programming languages; Data parallel compilers; Data
                 partitioning; Distributed computer systems; Eigenvalues
                 and eigenfunctions; Finite element method; Flow
                 visualization; Fluid mechanics; Linear algebra; Mass
                 storage; Massively parallel processors; Natural
                 sciences computing; Parallel languages; Parallel
                 processing systems; Parallel rendering; Program
                 compilers; Quantum theory; Scheduling; Sparse matrices;
                 Supercomputers",
  sponsor =      "Institute of Electrical and Electronics Engineers;
                 Computer Society. Association for Computing Machinery;
                 SIGARCH.",
}

@Proceedings{USENIX:1993:PUMb,
  editor =       "{USENIX}",
  booktitle =    "Proceedings of the {USENIX} Mobile and
                 Location-Independent Computing Symposium: August 2--3,
                 1993, Cambridge, Massachusetts, USA",
  title =        "Proceedings of the {USENIX} Mobile and
                 Location-Independent Computing Symposium: August 2--3,
                 1993, Cambridge, Massachusetts, {USA}",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "138",
  year =         "1993",
  ISBN =         "1-880446-51-0",
  ISBN-13 =      "978-1-880446-51-5",
  LCCN =         "QA 76.76 O63 U86 1993",
  bibdate =      "Tue Oct 22 08:33:21 2002",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.usenix.org/publications/library/proceedings/mobile93/",
  acknowledgement = ack-nhfb,
  annote =       "Spine title: Mobile and Location-Independent Computing
                 Symposium, Summer 1993.",
  keywords =     "Computer networks --- Congresses; Portable computers
                 --- Communication systems --- Congresses; UNIX
                 (Computer file) --- Congresses",
}

@Proceedings{USENIX:1993:PWU,
  editor =       "{USENIX}",
  booktitle =    "Proceedings of the Winter 1993 {USENIX} Conference:
                 January 25--29, 1993, San Diego, California, {USA}",
  title =        "Proceedings of the Winter 1993 {USENIX} Conference:
                 January 25--29, 1993, San Diego, California, {USA}",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "x + 530",
  year =         "1993",
  ISBN =         "1-880446-48-0",
  ISBN-13 =      "978-1-880446-48-5",
  LCCN =         "QA 76.76 O63 U84 1993",
  bibdate =      "Sun Feb 18 07:46:09 MST 1996",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.usenix.org/publications/library/proceedings/sd93/",
  acknowledgement = ack-nhfb,
  annote =       "Spine title: USENIX San Diego conference proceedings,
                 winter 1993. Running title: 1993 winter USENIX, January
                 25--29, 1993, San Diego, CA.",
  keywords =     "Computer networks --- Congresses; Operating systems
                 (Computers) --- Congresses; Programming (Electronic
                 computers) --- Congresses; UNIX (Computer file) ---
                 Congresses",
}

@Proceedings{ACM:1994:ASC,
  editor =       "{ACM}",
  booktitle =    "{ACM SIGPLAN '94 Conference on Programming Language
                 Design and Implementation (PLDI). Orlando, FL, USA,
                 20--24 June, 1994}",
  title =        "{ACM SIGPLAN '94 Conference on Programming Language
                 Design and Implementation (PLDI). Orlando, FL, USA,
                 20--24 June, 1994}",
  volume =       "29(6)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "360",
  month =        jun,
  year =         "1994",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Apr 24 18:36:02 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       j-SIGPLAN,
  acknowledgement = ack-nhfb,
  classification = "C4240 (Programming and algorithm theory); C6110
                 (Systems analysis and programming); C6140D (High level
                 languages); C6150C (Compilers, interpreters and other
                 processors); C6150G (Diagnostic, testing, debugging and
                 evaluating systems)",
  conftitle =    "ACM SIGPLAN '94 Conference on Programming Language
                 Design and Implementation (PLDI)",
  keywords =     "address calculation; array access errors;
                 backtracking; cache performance; CLP; code replication;
                 compilation techniques; continuation passing; garbage
                 collected programs; high level languages; jump
                 debugging; jump statements; lazy functional state
                 threads; link-time optimisation; memory access
                 coalescing; optimal tracing; optimisation; partial dead
                 code elimination; pointer-based data structures;
                 Presburger Formulas; program analysis tools; program
                 compilers; program debugging; program optimisation;
                 program structure tree; programming; programming
                 language design; programming theory; programming theory
                 program debugging; Prolog; register allocation; slicing
                 programs; Standard ML; type analysis; zero-cost range
                 splitting",
  sponsororg =   "ACM",
  treatment =    "P Practical; T Theoretical or Mathematical",
}

@Proceedings{ACM:1994:CRP,
  editor =       "{ACM}",
  booktitle =    "Conference record of {POPL} '94, 21st {ACM
                 SIGPLAN-SIGACT} Symposium on Principles of Programming
                 Languages: papers presented at the Symposium: Portland,
                 Oregon, January 17--21, 1994",
  title =        "Conference record of {POPL} '94, 21st {ACM
                 SIGPLAN-SIGACT} Symposium on Principles of Programming
                 Languages: papers presented at the Symposium: Portland,
                 Oregon, January 17--21, 1994",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "viii + 492",
  year =         "1994",
  ISBN =         "0-89791-636-0",
  ISBN-13 =      "978-0-89791-636-3",
  LCCN =         "QA76.7 .A15 1994",
  bibdate =      "Sat Sep 7 07:51:54 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/contents/proceedings/plan/174675/index.html",
  abstract =     "The following topics were dealt with: programming
                 language principles; OOP; type theory; program
                 correctness; lambda calculus; garbage collection; logic
                 programming; scheduling; data flow graphs; functional
                 programming; and continuation passing.",
  acknowledgement = ack-nhfb,
  classification = "C4210 (Formal logic); C4240 (Programming and
                 algorithm theory); C6110J (Object-oriented
                 programming); C6120 (File organisation); C6140D (High
                 level languages); C6150C (Compilers, interpreters and
                 other processors)",
  confdate =     "17--21 Jan. 1994",
  conflocation = "Portland, OR, USA",
  confsponsor =  "ACM",
  keywords =     "Continuation passing; Data flow graphs; Functional
                 programming; Garbage collection; Lambda calculus; Logic
                 programming; OOP; Program correctness; Programming
                 language principles; Scheduling; Type theory",
  thesaurus =    "High level languages; Lambda calculus; Object-oriented
                 programming; Program compilers; Program verification;
                 Storage management; Type theory",
}

@Proceedings{ACM:1994:IPI,
  editor =       "{ACM}",
  booktitle =    "{ISSAC '94: Proceedings of the 1994 International
                 Symposium on Symbolic and Algebraic Computation: July
                 20--22, 1994, Oxford, England, United Kingdom}",
  title =        "{ISSAC '94: Proceedings of the 1994 International
                 Symposium on Symbolic and Algebraic Computation: July
                 20--22, 1994, Oxford, England, United Kingdom}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "ix + 359",
  year =         "1994",
  ISBN =         "0-89791-638-7",
  ISBN-13 =      "978-0-89791-638-7",
  LCCN =         "QA76.95.I59 1994",
  bibdate =      "Thu Sep 26 05:45:15 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  confdate =     "20--22 July 1994",
  conflocation = "Oxford, UK",
  confsponsor =  "ACM",
  pubcountry =   "USA",
}

@Proceedings{ACM:1994:SIC,
  editor =       "{ACM}",
  booktitle =    "{Sixth International Conference on Architectural
                 Support for Programming Languages and Operating Systems
                 (ASPLOS-VI). San Jose, CA, USA, 4--7 October, 1994}",
  title =        "{Sixth International Conference on Architectural
                 Support for Programming Languages and Operating Systems
                 (ASPLOS-VI). San Jose, CA, USA, 4--7 October, 1994}",
  volume =       "29(11)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "328",
  month =        nov,
  year =         "1994",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Apr 24 18:36:02 MDT 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       j-SIGPLAN,
  acknowledgement = ack-nhfb,
  classification = "C5220 (Computer architecture); C6140 (Programming
                 languages); C6150J (Operating systems)",
  conflocation = "San Jose, CA, USA",
  conftitle =    "Sixth International Conference on Architectural
                 Support for Programming Languages and Operating Systems
                 (ASPLOS-VI)",
  keywords =     "architectural support; code transformation; computer
                 architecture; instrumentation; measurement; memory
                 access; multithreading; operating systems; operating
                 systems (computers); parallel machines; programming
                 languages; shares memory multiprocessors; uniprocessor
                 performance",
  sponsororg =   "ACM; IEEE Comput. Soc",
}

@Proceedings{Anonymous:1994:ICS,
  editor =       "Anonymous",
  booktitle =    "1994 International Computer Symposium Conference
                 Proceedings",
  title =        "1994 International Computer Symposium Conference
                 Proceedings",
  publisher =    "Nat. Chiao Tung Univ",
  address =      "Hsinchu, Taiwan",
  pages =        "xvi + 1310",
  year =         "1994",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "2 vol.",
  acknowledgement = ack-nhfb,
  confdate =     "12--15 Dec. 1994",
  conflocation = "Hsinchu, Taiwan",
  confsponsor =  "Ministr. Educ.; Comput. Soc",
  pubcountry =   "Taiwan",
}

@Proceedings{Anonymous:1994:PIW,
  editor =       "Anonymous",
  booktitle =    "Proceedings of the 2nd International World Wide Web
                 conference, Mosaic and the Web, October 1994,
                 Ramada-Congress Hotel, 520 South Michigan Avenue,
                 Chicago, IL",
  title =        "Proceedings of the 2nd International World Wide Web
                 conference, Mosaic and the Web, October 1994,
                 Ramada-Congress Hotel, 520 South Michigan Avenue,
                 Chicago, {IL}",
  volume =       "18(6)",
  publisher =    pub-LEARNED-INF,
  address =      pub-LEARNED-INF:adr,
  pages =        "????",
  year =         "1994",
  CODEN =        "ONCDEW",
  ISSN =         "0309-314X",
  bibdate =      "Sun Oct 22 08:43:14 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       j-ONLINE-CDROM-REV,
  URL =          "http://www.ncsa.uiuc.edu/SDG/IT94/Proceedings/WWW2_Proceedings.html",
  acknowledgement = ack-nhfb,
}

@Proceedings{Anonymous:1994:USC,
  editor =       "Anonymous",
  booktitle =    "USENIX Summer conference: --- June 1994, Boston, MA",
  title =        "{USENIX} Summer conference: --- June 1994, Boston,
                 {MA}",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "316",
  year =         "1994",
  ISBN =         "1-880446-62-6",
  ISBN-13 =      "978-1-880446-62-1",
  LCCN =         "QA 76.76 O63 U83 1994",
  bibdate =      "Sat May 25 07:59:58 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "USENIX Conference Proceedings 1994",
  acknowledgement = ack-nhfb,
}

@Proceedings{Goldwasser:1994:PAS,
  editor =       "Shafi Goldwasser",
  booktitle =    "Proceedings: 35th Annual Symposium on Foundations of
                 Computer Science, November 20--22, 1994, Santa Fe, New
                 Mexico",
  title =        "Proceedings: 35th Annual Symposium on Foundations of
                 Computer Science, November 20--22, 1994, Santa Fe, New
                 Mexico",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xiii + 837",
  year =         "1994",
  CODEN =        "ASFPDV",
  ISBN =         "0-8186-6582-3",
  ISBN-13 =      "978-0-8186-6582-0",
  ISSN =         "0272-5428",
  LCCN =         "QA 76 S979 1994",
  bibdate =      "Thu Dec 3 07:11:18 MST 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "IEEE catalog number 94CH35717. IEEE Computer Society
                 Press Order Number 6580-02.",
  acknowledgement = ack-nhfb,
  keywords =     "electronic data processing --- congresses",
}

@Proceedings{Hong:1994:FIS,
  editor =       "Hoon Hong",
  booktitle =    "{First International Symposium on Parallel Symbolic
                 Computation, PASCO '94, Hagenberg\slash Linz, Austria,
                 September 26--28, 1994}",
  title =        "{First International Symposium on Parallel Symbolic
                 Computation, PASCO '94, Hagenberg\slash Linz, Austria,
                 September 26--28, 1994}",
  volume =       "5",
  publisher =    pub-WORLD-SCI,
  address =      pub-WORLD-SCI:adr,
  pages =        "xiii + 431",
  year =         "1994",
  ISBN =         "981-02-2040-5",
  ISBN-13 =      "978-981-02-2040-2",
  LCCN =         "QA76.642.I58 1994",
  bibdate =      "Thu Mar 12 07:55:38 MST 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/issac.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "Lecture notes series in computing",
  acknowledgement = ack-nhfb,
  alttitle =     "Parallel symbolic computation",
  keywords =     "Parallel programming (Computer science) ---
                 Congresses.",
}

@Proceedings{IEEE:1994:PIW,
  editor =       "{IEEE}",
  booktitle =    "Proceedings 11th IEEE Workshop on Real-Time Operating
                 Systems and Software. RTOSS '94, Seattle, WA, USA,
                 18--19 May 1994",
  title =        "Proceedings 11th {IEEE} Workshop on Real-Time
                 Operating Systems and Software. {RTOSS} '94, Seattle,
                 {WA}, {USA}, 18--19 May 1994",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "viii + 117",
  year =         "1994",
  ISBN =         "0-8186-5710-3",
  ISBN-13 =      "978-0-8186-5710-8",
  LCCN =         "QA76.54.I173 1994",
  bibdate =      "Sat Sep 28 18:52:45 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/mach.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "IEEE catalog number 94TH0639-5.",
  acknowledgement = ack-nhfb,
  confsponsor =  "IEEE",
}

@Proceedings{IEEE:1994:PSH,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings of the Scalable High-Performance
                 Computing Conference, May 23--25, 1994, Knoxville,
                 Tennessee}",
  title =        "{Proceedings of the Scalable High-Performance
                 Computing Conference, May 23--25, 1994, Knoxville,
                 Tennessee}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xviii + 852",
  year =         "1994",
  ISBN =         "0-8186-5680-8, 0-8186-5681-6",
  ISBN-13 =      "978-0-8186-5680-4, 978-0-8186-5681-1",
  LCCN =         "QA76.5 .S244 1994",
  bibdate =      "Mon Aug 26 10:38:41 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "IEEE catalog number 94TH0637-9.",
  acknowledgement = ack-nhfb,
  sponsor =      "IEEE Computer Society; Technical Committee on
                 Supercomputing Applications.",
}

@Proceedings{IEEE:1994:PSW,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings, Supercomputing '94: Washington, DC,
                 November 14--18, 1994}",
  title =        "{Proceedings, Supercomputing '94: Washington, DC,
                 November 14--18, 1994}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xvii + 823",
  year =         "1994",
  ISBN =         "0-8186-6605-6 (paper), 0-8186-6606-4 (microfiche),
                 0-8186-6607-2 (case)",
  ISBN-13 =      "978-0-8186-6605-6 (paper), 978-0-8186-6606-3
                 (microfiche), 978-0-8186-6607-0 (case)",
  ISSN =         "1063-9535",
  LCCN =         "QA76.5 .S894 1994",
  bibdate =      "Fri Aug 30 08:01:51 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "IEEE catalog number 94CH34819.",
  series =       "Supercomputing",
  acknowledgement = ack-nhfb,
  keywords =     "Supercomputers --- Congresses",
  sponsor =      "IEEE.",
}

@Proceedings{IEEE:1994:ROS,
  editor =       "{IEEE}",
  booktitle =    "Real-time operating systems and software: RTOSS '94:
                 11th Workshop --- May 1994, Seattle, WA",
  title =        "Real-time operating systems and software: {RTOSS} '94:
                 11th Workshop --- May 1994, Seattle, {WA}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "viii + 117",
  year =         "1994",
  ISBN =         "0-8186-5710-3",
  ISBN-13 =      "978-0-8186-5710-8",
  LCCN =         "QA76.54.I173 1994",
  bibdate =      "Sat May 25 07:59:58 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "IEEE Workshop on Real Time Operating Systems and
                 Software 1994; 11th",
  acknowledgement = ack-nhfb,
  sponsor =      "IEEE; Computer Society; Technical Committee on
                 Real-Time Systems.",
  remark =       "Possible duplicate of entry IEEE:1994:PIW, which
                 records the same workshop, ISBN, LCCN, and
                 pagination.",
}

@Proceedings{ACM:1995:CPI,
  editor =       "{ACM}",
  booktitle =    "Conference proceedings of the 1995 International
                 Conference on Supercomputing, Barcelona, Spain, July
                 3--7, 1995",
  title =        "Conference proceedings of the 1995 International
                 Conference on Supercomputing, Barcelona, Spain, July
                 3--7, 1995",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xii + 448",
  year =         "1995",
  ISBN =         "0-89791-728-6",
  ISBN-13 =      "978-0-89791-728-5",
  LCCN =         "QA 76.88 I57 1995",
  bibdate =      "Mon Dec 23 18:50:57 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "Conference Proceedings of the International Conference
                 on Supercomputing",
  acknowledgement = ack-nhfb,
  sponsor =      "Association for Computing Machinery. Special Interest
                 Group on Computer Architecture.",
}

@Proceedings{ACM:1995:CRP,
  editor =       "{ACM}",
  booktitle =    "Conference record of {POPL} '95, 22nd {ACM}
                 {SIGPLAN-SIGACT} Symposium on Principles of Programming
                 Languages: papers presented at the Symposium: San
                 Francisco, California, January 22--25, 1995",
  title =        "Conference record of {POPL} '95, 22nd {ACM}
                 {SIGPLAN-SIGACT} Symposium on Principles of Programming
                 Languages: papers presented at the Symposium: San
                 Francisco, California, January 22--25, 1995",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "vii + 408",
  year =         "1995",
  ISBN =         "0-89791-692-1",
  ISBN-13 =      "978-0-89791-692-9",
  LCCN =         "QA 76.7 A11 1995",
  bibdate =      "Mon May 3 17:47:49 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "ACM order number: 549950.",
  URL =          "http://www.acm.org/pubs/contents/proceedings/plan/199448/index.html",
  acknowledgement = ack-nhfb,
  alttitle =     "Proceedings, 22nd ACM SIGPLAN-SIGACT Symposium on
                 Principles of Programming Languages POPL '95",
  annote =       "Sponsored by the Association for Computing Machinery,
                 Special Interest Group on Algorithms and Computation
                 Theory (SIGACT), Special Interest Group on Programming
                 Languages (SIGPLAN).",
  keywords =     "Programming languages (Electronic computers) --
                 Congresses.",
}

@Proceedings{Ferreira:1995:PAI,
  editor =       "Afonso Ferreira and Jose Rolim",
  booktitle =    "{Parallel algorithms for irregularly structured
                 problems: second international workshop, IRREGULAR 95,
                 Lyon, France, September, 4--6, 1995: proceedings}",
  title =        "{Parallel algorithms for irregularly structured
                 problems: second international workshop, IRREGULAR 95,
                 Lyon, France, September, 4--6, 1995: proceedings}",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "x + 409",
  year =         "1995",
  ISBN =         "3-540-60321-2",
  ISBN-13 =      "978-3-540-60321-4",
  LCCN =         "QA76.642.I59 1995",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib",
  acknowledgement = ack-nhfb,
  confsponsor =  "IFIP",
  pubcountry =   "Germany",
}

@Proceedings{IEEE:1995:PCL,
  editor =       "{IEEE Computer Society. Technical Committee on
                 Computer Communications}",
  booktitle =    "Proceedings: 20th Conference on Local Computer
                 Networks, October 16--19, 1995, Minneapolis,
                 Minnesota",
  title =        "Proceedings: 20th Conference on Local Computer
                 Networks, October 16--19, 1995, Minneapolis,
                 Minnesota",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xii + 496",
  year =         "1995",
  ISBN =         "0-8186-7163-7 (microfiche), 0-8186-7162-9",
  ISBN-13 =      "978-0-8186-7163-0 (microfiche), 978-0-8186-7162-3",
  LCCN =         "TK5105.7 .C66 1995 Bar",
  bibdate =      "Mon Sep 27 06:55:07 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "IEEE Computer Society Press order number PR07162. IEEE
                 catalog number 95TB100005.",
  acknowledgement = ack-nhfb,
  keywords =     "local area networks (computer networks) --
                 congresses",
}

@Proceedings{ACM:1996:FCP,
  editor =       "{ACM}",
  booktitle =    "FCRC '96: Conference proceedings of the 1996
                 International Conference on Supercomputing:
                 Philadelphia, Pennsylvania, {USA}, May 25--28, 1996",
  title =        "{FCRC} '96: Conference proceedings of the 1996
                 International Conference on Supercomputing:
                 Philadelphia, Pennsylvania, {USA}, May 25--28, 1996",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xii + 406",
  year =         "1996",
  ISBN =         "0-89791-803-7",
  ISBN-13 =      "978-0-89791-803-9",
  LCCN =         "QA76.5 I61 1996",
  bibdate =      "Wed Mar 18 12:33:29 MST 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "ACM order number 415961.",
  acknowledgement = ack-nhfb,
  keywords =     "Supercomputers --- Congresses.",
}

@Proceedings{IEEE:1996:PSM,
  editor =       "{IEEE}",
  booktitle =    "Proceedings. Second MPI Developer's Conference: Notre
                 Dame, IN, USA, 1--2 July 1996",
  title =        "Proceedings. Second {MPI} Developer's Conference:
                 Notre Dame, {IN}, {USA}, 1--2 July 1996",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "ix + 207",
  year =         "1996",
  ISBN =         "0-8186-7533-0",
  ISBN-13 =      "978-0-8186-7533-1",
  LCCN =         "QA76.642 .M67 1996",
  bibdate =      "Tue May 12 08:56:04 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
}

@Proceedings{LakshmanYN:1996:IPI,
  editor =       "{Lakshman Y. N.}",
  booktitle =    "{ISSAC '96: Proceedings of the 1996 International
                 Symposium on Symbolic and Algebraic Computation, July
                 24--26, 1996, Zurich, Switzerland}",
  title =        "{ISSAC '96: Proceedings of the 1996 International
                 Symposium on Symbolic and Algebraic Computation, July
                 24--26, 1996, Zurich, Switzerland}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xvii + 313",
  year =         "1996",
  ISBN =         "0-89791-796-0",
  ISBN-13 =      "978-0-89791-796-4",
  LCCN =         "QA 76.95 I59 1996",
  bibdate =      "Thu Mar 12 08:00:14 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/issac.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  sponsor =      "ACM; Special Interest Group in Symbolic and Algebraic
                 Manipulation (SIGSAM). ACM; Special Interest Group on
                 Numerical Mathematics (SIGNUM).",
}

@Proceedings{Szymanski:1996:LCR,
  editor =       "Boleslaw K. Szymanski and Balaram Sinharoy",
  booktitle =    "Languages, Compilers and Run-Time Systems for Scalable
                 Computers, Troy, NY, USA, May 22--24, 1995",
  title =        "Languages, Compilers and Run-Time Systems for Scalable
                 Computers, Troy, {NY}, {USA}, May 22--24, 1995",
  publisher =    pub-KLUWER,
  address =      pub-KLUWER:adr,
  pages =        "xiv + 335",
  year =         "1996",
  ISBN =         "0-7923-9635-9",
  ISBN-13 =      "978-0-7923-9635-2",
  LCCN =         "QA76.58.L37 1996",
  bibdate =      "Sat Sep 28 18:12:58 MDT 1996",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/mach.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{USENIX:1996:ATT,
  editor =       "{USENIX} Association",
  booktitle =    "4th Annual Tcl/Tk Workshop '96, July 10--13, 1996.
                 Monterey, CA",
  title =        "4th Annual Tcl/Tk Workshop '96, July 10--13, 1996.
                 Monterey, {CA}",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "????",
  day =          "10--13",
  month =        jul,
  year =         "1996",
  ISBN =         "1-880446-78-2",
  ISBN-13 =      "978-1-880446-78-2",
  LCCN =         "QA76.73.T44 T44 1996",
  bibdate =      "Fri Oct 18 07:24:24 MDT 1996",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  location =     "Monterey, CA",
}

@Proceedings{USENIX:1996:PFA,
  editor =       "{USENIX}",
  booktitle =    "Proceedings of the fourth annual Tcl\slash Tk
                 Workshop, July 10--13, 1996, Monterey, California",
  title =        "Proceedings of the fourth annual Tcl\slash Tk
                 Workshop, July 10--13, 1996, Monterey, California",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "235",
  year =         "1996",
  ISBN =         "1-880446-78-2",
  ISBN-13 =      "978-1-880446-78-2",
  LCCN =         "QA 76.73 T44 T35 1996",
  bibdate =      "Mon May 11 11:50:25 1998",
  bibsource =    "ftp://ftp.uu.net/library/bibliography;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.usenix.org/publications/library/proceedings/tcl96/",
  acknowledgement = ack-nhfb,
  location =     "Monterey, CA",
  remark =       "Possible duplicate of entry USENIX:1996:ATT, which
                 records the same workshop and ISBN.",
}

@Proceedings{IEEE:1997:APD,
  editor =       "{IEEE}",
  booktitle =    "Advances in parallel and distributed computing: March
                 19--21, 1997, Shanghai, China: proceedings",
  title =        "Advances in parallel and distributed computing: March
                 19--21, 1997, Shanghai, China: proceedings",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xii + 426",
  year =         "1997",
  ISBN =         "0-8186-7876-3 (paperback and case), 0-8186-7878-X
                 (microfiche)",
  ISBN-13 =      "978-0-8186-7876-9 (paperback and case),
                 978-0-8186-7878-3 (microfiche)",
  LCCN =         "QA76.58 .A4 1997",
  bibdate =      "Wed Apr 16 07:34:31 MDT 1997",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "electronic data processing -- distributed processing
                 -- congresses; parallel processing (electronic
                 computers) -- congresses",
}

@Proceedings{ACM:1998:AWJ,
  editor =       "{ACM}",
  booktitle =    "ACM 1998 Workshop on Java for High-Performance Network
                 Computing",
  title =        "{ACM} 1998 Workshop on Java for High-Performance
                 Network Computing",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "????",
  year =         "1998",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Thu Apr 27 10:40:59 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Possibly unpublished, except electronically.",
  URL =          "http://www.cs.ucsb.edu/conferences/java98/program.html",
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:1998:CRP,
  editor =       "{ACM}",
  booktitle =    "Conference record of POPL '98: the 25th ACM
                 SIGPLAN-SIGACT Symposium on Principles of Programming
                 Languages: papers presented at the Symposium, San
                 Diego, California, 19--21 January 1998",
  title =        "Conference record of {POPL} '98: the 25th {ACM}
                 {SIGPLAN-SIGACT} Symposium on Principles of Programming
                 Languages: papers presented at the Symposium, San
                 Diego, California, 19--21 January 1998",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "viii + 408",
  year =         "1998",
  ISBN =         "0-89791-979-3",
  ISBN-13 =      "978-0-89791-979-1",
  LCCN =         "QA76.7 .A15 1998",
  bibdate =      "Mon May 3 17:47:49 MDT 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "ACM order number: 549981.",
  URL =          "http://www.acm.org/pubs/contents/proceedings/plan/268946/index.html",
  acknowledgement = ack-nhfb,
  alttitle =     "POPL '98 ACM SIGPLAN-SIGACT Symposium on Principles of
                 Programming Languages Principles of programming
                 languages Proceedings 25th ACM SIGPLAN-SIGACT Symposium
                 on Principles of Programming Languages",
  keywords =     "Electronic digital computers -- Programming --
                 Congresses.; Programming languages (Electronic
                 computers) -- Congresses.",
}

@Proceedings{ACM:1998:PAI,
  editor =       "{ACM}",
  booktitle =    "{Proceedings: the 25th Annual International Symposium
                 on Computer Architecture, June 27--July 1, 1998,
                 Barcelona, Spain}",
  title =        "{Proceedings: the 25th Annual International Symposium
                 on Computer Architecture, June 27--July 1, 1998,
                 Barcelona, Spain}",
  volume =       "26(3)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xiii + 394",
  year =         "1998",
  ISBN =         "0-8186-8491-7, 0-8186-8492-5, 0-8186-8493-3",
  ISBN-13 =      "978-0-8186-8491-3, 978-0-8186-8492-0,
                 978-0-8186-8493-7",
  LCCN =         "QA76.9.A73 S97 1998",
  bibdate =      "Fri May 12 12:36:10 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 z3950.bibsys.no:2100/BIBSYS",
  note =         "ACM Order Number 414984. IEEE Computer Society Order
                 Number PR08491; IEEE Order Plan Catalog Number
                 98CB36235.",
  series =       "Computer architecture news",
  URL =          "http://portal.acm.org/toc.cfm?id=279358;
                 http://portal.acm.org/toc.cfm?id=285930",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '25 proceedings.",
}

@Proceedings{ACM:1998:SHP,
  editor =       "{ACM}",
  booktitle =    "SC'98: High Performance Networking and Computing:
                 Proceedings of the 1998 ACM\slash IEEE SC98 Conference:
                 Orange County Convention Center, Orlando, Florida, USA,
                 November 7--13, 1998",
  title =        "{SC}'98: High Performance Networking and Computing:
                 Proceedings of the 1998 {ACM}\slash {IEEE} {SC98}
                 Conference: Orange County Convention Center, Orlando,
                 Florida, {USA}, November 7--13, 1998",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  year =         "1998",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Wed Oct 07 08:51:34 1998",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.supercomp.org/sc98/papers/",
  acknowledgement = ack-nhfb,
}

@Proceedings{USENIX:1998:PSA,
  editor =       "{USENIX}",
  booktitle =    "Proceedings of the sixth annual Tcl/Tk Conference,
                 September 18--24 [i.e. 14--18], 1998, San Diego,
                 California",
  title =        "Proceedings of the sixth annual Tcl/Tk Conference,
                 September 18--24 [i.e. 14--18], 1998, San Diego,
                 California",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "206",
  year =         "1998",
  ISBN =         "1-880446-98-7",
  ISBN-13 =      "978-1-880446-98-0",
  LCCN =         "QA76.73.T44 T34 1998",
  bibdate =      "Fri Oct 18 08:12:11 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://db.usenix.org/publications/library/proceedings/tcl98/",
  acknowledgement = ack-nhfb,
}

@Proceedings{USENIX:1998:PUWa,
  editor =       "{USENIX}",
  booktitle =    "Proceedings of the 2nd {USENIX Windows NT} Symposium:
                 August 3--5, 1998, Seattle, Washington",
  title =        "Proceedings of the 2nd {USENIX Windows NT} Symposium:
                 August 3--5, 1998, Seattle, Washington",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "173",
  year =         "1998",
  ISBN =         "1-880446-95-2",
  ISBN-13 =      "978-1-880446-95-9",
  LCCN =         "QA76.76.O63 U885 1998",
  bibdate =      "Fri Oct 29 08:40:21 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://db.usenix.org/publications/library/proceedings/usenix-nt98",
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:1999:PASa,
  editor =       "{ACM}",
  booktitle =    "Proceedings of the ACM SIGPLAN '99 Conference on
                 Programming Language Design and Implementation (PLDI
                 '99), Atlanta, Georgia, 2--4 May 1999",
  title =        "Proceedings of the {ACM} {SIGPLAN} '99 Conference on
                 Programming Language Design and Implementation ({PLDI}
                 '99), Atlanta, Georgia, 2--4 May 1999",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "????",
  year =         "1999",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Thu May 13 14:45:29 1999",
  bibsource =    "http://www.acm.org/pubs/contents/proceedings/pldi/301122/index.html;
                 http://www.acm.org/pubs/contents/proceedings/pldi/301618/index.html;
                 http://www.cs.rutgers.edu/pldi99/program.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:1999:SPO,
  editor =       "{ACM}",
  booktitle =    "SC'99: Oregon Convention Center 777 NE Martin Luther
                 King Jr. Boulevard, Portland, Oregon, November 11--18,
                 1999",
  title =        "{SC}'99: Oregon Convention Center 777 {NE} Martin
                 Luther King Jr. Boulevard, Portland, Oregon, November
                 11--18, 1999",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  year =         "1999",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Thu Feb 24 09:35:00 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Atkinson:1999:PTF,
  editor =       "Malcolm P. Atkinson and Maria E. Orlowska and Patrick
                 Valduriez and Stanley B. Zdonik and Michael L. Brodie",
  booktitle =    "Proceedings of the Twenty-fifth International
                 Conference on Very Large Databases, Edinburgh,
                 Scotland, UK, 7--10 September, 1999",
  title =        "Proceedings of the Twenty-fifth International
                 Conference on Very Large Databases, Edinburgh,
                 Scotland, {UK}, 7--10 September, 1999",
  publisher =    pub-MORGAN-KAUFMANN,
  address =      pub-MORGAN-KAUFMANN:adr,
  pages =        "xviii + 761",
  year =         "1999",
  ISBN =         "1-55860-615-7",
  ISBN-13 =      "978-1-55860-615-9",
  LCCN =         "QA76.9.D3 I559 1999",
  bibdate =      "Tue Oct 24 18:36:50 MDT 2000",
  bibsource =    "DBLP; http://dblp.uni-trier.de;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/vldb.bib; OCLC
                 Proceedings database",
  note =         "Also known as VLDB'99.",
  acknowledgement = ack-nhfb,
  keywords =     "very large data bases; VLDB",
}

@Proceedings{Dongarra:1999:RAP,
  editor =       "J. J. Dongarra and E. Luque and Tomas Margalef",
  booktitle =    "{Recent advances in parallel virtual machine and
                 message passing interface: 6th European PVM\slash {MPI}
                 Users' Group Meeting, Barcelona, Spain, September
                 26--29, 1999: Proceedings}",
  title =        "{Recent advances in parallel virtual machine and
                 message passing interface: 6th European PVM\slash {MPI}
                 Users' Group Meeting, Barcelona, Spain, September
                 26--29, 1999: Proceedings}",
  volume =       "1697",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xvii + 551",
  year =         "1999",
  CODEN =        "LNCSD9",
  DOI =          "????",
  ISBN =         "3-540-66549-8 (softcover)",
  ISBN-13 =      "978-3-540-66549-6 (softcover)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.58 E973 1999",
  bibdate =      "Wed Dec 8 06:34:56 MST 1999",
  bibsource =    "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       ser-LNCS,
  URL =          "http://link.springer-ny.com/link/service/series/0558/tocs/t1697.htm;
                 http://www.springerlink.com/openurl.asp?genre=issue&issn=0302-9743&volume=1697",
  acknowledgement = ack-nhfb,
  alttitle =     "PVM\slash MPI '99",
  keywords =     "Data transmission systems; Parallel computers; Virtual
                 computer systems",
}

@Proceedings{IEEE:1999:HCS,
  editor =       "{IEEE}",
  booktitle =    "Hot Chips 11: Stanford University, Stanford,
                 California, August 15--17, 1999",
  title =        "Hot Chips 11: Stanford University, Stanford,
                 California, August 15--17, 1999",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "????",
  year =         "1999",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Mon Jan 08 05:26:43 2001",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hot-chips.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.hotchips.org/hotc11_index.html",
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:2000:SHP,
  editor =       "{ACM}",
  booktitle =    "SC2000: High Performance Networking and Computing.
                 Dallas Convention Center, Dallas, TX, USA, November
                 4--10, 2000",
  title =        "{SC2000}: High Performance Networking and Computing.
                 Dallas Convention Center, Dallas, {TX}, {USA}, November
                 4--10, 2000",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  year =         "2000",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Thu Feb 24 09:35:00 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sc2000.org/proceedings/info/fp.pdf",
  acknowledgement = ack-nhfb,
}

@Proceedings{Anonymous:2000:CCI,
  editor =       "Anonymous",
  booktitle =    "Cool Chips III: An International Symposium on
                 Low-Power and High-Speed Chips, Kikai-Shinko-Kaikan,
                 Tokyo, Japan April 24--25, 2000",
  title =        "Cool Chips {III}: An International Symposium on
                 Low-Power and High-Speed Chips, Kikai-Shinko-Kaikan,
                 Tokyo, Japan April 24--25, 2000",
  publisher =    "????",
  address =      "????",
  pages =        "????",
  year =         "2000",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Mon Jan 08 09:19:21 2001",
  bibsource =    "http://www.coolchips.org/index-cool3.html;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Book{Koniges:2000:ISP,
  editor =       "Alice E. Koniges",
  booktitle =    "Industrial Strength Parallel Computing",
  title =        "Industrial Strength Parallel Computing",
  publisher =    pub-MORGAN-KAUFMANN,
  address =      pub-MORGAN-KAUFMANN:adr,
  pages =        "xxv + 597",
  year =         "2000",
  ISBN =         "1-55860-540-1",
  ISBN-13 =      "978-1-55860-540-4",
  LCCN =         "QA76.58 .I483 2000",
  bibdate =      "Fri Feb 04 18:30:40 2000",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{USENIX:2000:PUT,
  editor =       "{USENIX}",
  booktitle =    "Proceedings of the 7th USENIX Tcl\slash Tk Conference
                 (Tcl/2k): February 14--18, 2000, Austin, Texas, USA",
  title =        "Proceedings of the 7th {USENIX} Tcl\slash Tk
                 Conference (Tcl/2k): February 14--18, 2000, Austin,
                 Texas, {USA}",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "194",
  year =         "2000",
  ISBN =         "1-880446-24-3",
  ISBN-13 =      "978-1-880446-24-9",
  LCCN =         "????",
  bibdate =      "Wed Oct 16 09:54:12 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/usenix2000.bib",
  URL =          "http://db.usenix.org/publications/library/proceedings/tcl2k/",
  acknowledgement = ack-nhfb,
}

@Proceedings{USENIX:2000:UAT,
  editor =       "{USENIX}",
  booktitle =    "2000 USENIX Annual Technical Conference: San Diego,
                 CA, USA, June 18--23, 2000",
  title =        "2000 {USENIX} Annual Technical Conference: San Diego,
                 {CA}, {USA}, June 18--23, 2000",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "350",
  year =         "2000",
  ISBN =         "1-880446-22-7",
  ISBN-13 =      "978-1-880446-22-5",
  LCCN =         "????",
  bibdate =      "Mon Oct 14 07:43:52 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/usenix2000.bib",
  URL =          "http://www.usenix.org/publications/library/proceedings/usenix2000",
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:2001:PAJ,
  editor =       "{ACM}",
  booktitle =    "Proceedings of the {ACM 2001 Java Grande\slash ISCOPE
                 Conference: Palo Alto, Calif., June 2--4, 2001}",
  title =        "Proceedings of the {ACM 2001 Java Grande\slash ISCOPE
                 Conference: Palo Alto, Calif., June 2--4, 2001}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "vi + 186",
  year =         "2001",
  ISBN =         "1-58113-359-6",
  ISBN-13 =      "978-1-58113-359-2",
  LCCN =         "QA76.9.O35 A26 2001",
  bibdate =      "Mon May 6 06:26:30 MDT 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Java (computer program language) -- congresses;
                 object-oriented methods (computer science) --
                 congresses",
}

@Proceedings{Boisvert:2001:ASS,
  editor =       "Ronald F. Boisvert and Ping Tak Peter Tang",
  booktitle =    "The architecture of scientific software: {IFIP
                 TC2/WG2.5 Working Conference on the Architecture of
                 Scientific Software, October 2--4, 2000, Ottawa,
                 Canada}",
  title =        "The architecture of scientific software: {IFIP
                 TC2/WG2.5 Working Conference on the Architecture of
                 Scientific Software, October 2--4, 2000, Ottawa,
                 Canada}",
  volume =       "60",
  publisher =    pub-KLUWER,
  address =      pub-KLUWER:adr,
  pages =        "xx + 358",
  year =         "2001",
  ISBN =         "0-7923-7339-1",
  ISBN-13 =      "978-0-7923-7339-1",
  LCCN =         "QA76.758 .I345 2000",
  bibdate =      "Fri May 27 08:46:38 2005",
  bibsource =    "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "IFIP",
  acknowledgement = ack-nhfb,
  tableofcontents = "Preface (p. ix)\\
                 Contributing Authors (p. xv)\\
                 Part I: Large-Scale Systems Integration\\
                 Network-Based Scientific Computing: Elias N. Houstis,
                 Ann Christine Catlin, Ganesh Balakrishnan, Nitesh
                 Dhanjani, GaHyun Park, John R. Rice, Spyros Lalis,
                 Manolis Stamatogiannakis, Catherine E. Houstis (pp.
                 3--28) \\
                 Future Generations of Problem-Solving Environments:
                 Jos{\'e} C. Cunha (pp. 29--38) \\
                 Developing an Architecture to Support the
                 Implementation and Development of Scientific computing
                 Applications: Dorian C. Arnold, Jack Dongarra (pp.
                 39--56) \\
                 PETSc and Overture: Lessons Learned Developing an
                 Interface between Components: Kristopher R. Buschelman,
                 William Gropp, Lois C. McInnes, Barry F. Smith (pp.
                 57--68) \\
                 Component Technology for High-Performance Scientific
                 Simulation Software: Tom Epperly, Scott R. Kohn, Gary
                 Kumfert (pp. 69--86) \\
                 A New Approach to Software Integration Frameworks for
                 Multi-physics Simulation Codes: Eric de Sturler, Jay
                 Hoeflinger, Laxmikant V. Kal{\'e}, Milind Bhandarkar
                 (pp. 87--104) \\
                 Code Coupling using Parallel CORBA Objects: Christophe
                 Ren{\'e}, Thierry Priol, Guillaume All{\'e}on (pp.
                 105--118) \\
                 A Collaborative Code Development Environment for
                 Computational Electro-magnetics: Matthew S. Shields,
                 Omer F. Rana, David W. Walker, David Colby (pp.
                 119--144) \\
                 Part II: The Architecture of Components\\
                 On the Role of Mathematical Abstractions for Scientific
                 Computing: Krister {\AA}hlander, Magne Haveraaen, Hans
                 Z. Munthe-Kaas (pp. 145--158) \\
                 Object-oriented Modeling of Parallel PDE Solvers:
                 Michael Thun{\'e}, Krister {\AA}hlander, Malin
                 Ljungberg, Markus Nord{\'e}n, Kurt Otto, Jarmo
                 Rantakokko (pp. 159--174) \\
                 Broadway: A Software Architecture for Scientific
                 Computing: Samuel Z. Guyer, Calvin Lin (pp. 175--192)
                 \\
                 Formal Methods for High-Performance Linear Algebra
                 Libraries: John A. Gunnels, Robert A. van de Geijn (pp.
                 193--210) \\
                 New Generalized Matrix Data Structures Lead to a
                 Variety of High-Performance Algorithms: Fred G.
                 Gustavson (pp. 211--234) \\
                 A Comprehensive DFT API for Scientific Computing: Ping
                 Tak Peter Tang (pp. 235--256) \\
                 Using A Fortran Interface to POSIX Threads: Richard J.
                 Hanson, Clay P. Breshears, Henry A. Gabb (pp. 257--272)
                 \\
                 Data Management Systems for Scientific Applications:
                 Reagan Moore (pp. 273--284) \\
                 Software Components for Application Development: Arnaud
                 Desitter, Antoine Le Hyaric, Geoff Morgan, Gareth Shaw,
                 Anne E. Trefethen (pp. 285--300) \\
                 Hierarchical Representation and Computation of
                 Approximate Solutions in Scientific Simulations: Wayne
                 H. Enright (pp. 301--316) \\
                 Software Architecture for the Investigation of
                 Controllable Models with Complex Data Sets: Dmitry
                 Belyshev, Vladimir I. Gurman (pp. 317--332) \\
                 A Mixed-Language Programming Methodology for High
                 Performance Java Computing: Vladimir Getov (pp.
                 333--350) \\
                 Part III: Conference Information\\
                 The Architecture of Scientific Software: the Conference
                 (pp. 351--356)\\
                 Index (pp. 357--358)",
}

@Proceedings{Eigenmann:2001:OSM,
  editor =       "Rudolf Eigenmann and Michael J. Voss",
  booktitle =    "{OpenMP} shared memory parallel programming:
                 International Workshop on {OpenMP} Applications and
                 Tools, {WOMPAT} 2001, West Lafayette, {IN}, {USA}, July
                 30--31, 2001: proceedings",
  title =        "{OpenMP} shared memory parallel programming:
                 International Workshop on {OpenMP} Applications and
                 Tools, {WOMPAT} 2001, West Lafayette, {IN}, {USA}, July
                 30--31, 2001: proceedings",
  volume =       "2104",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "x + 184",
  year =         "2001",
  ISBN =         "3-540-42346-X (paperback)",
  ISBN-13 =      "978-3-540-42346-1 (paperback)",
  LCCN =         "QA76.642 .I589 2001; QA267.A1 L43 no.2104",
  bibdate =      "Thu Jan 17 11:49:19 MST 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       ser-LNCS,
  URL =          "http://link.springer-ny.com/link/service/series/0558/tocs/t2104.htm",
  acknowledgement = ack-nhfb,
  keywords =     "parallel programming (computer science) --
                 congresses",
}

@Proceedings{USENIX:2001:PJV,
  editor =       "{USENIX}",
  booktitle =    "Proceedings of the Java Virtual Machine Research and
                 Technology Sy[m]posium (JVM '01): April 23--24, 2001,
                 Monterey, California, USA. Berkeley, CA",
  title =        "Proceedings of the Java Virtual Machine Research and
                 Technology Sy[m]posium ({JVM} '01): April 23--24, 2001,
                 Monterey, California, {USA}. Berkeley, {CA}",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "232",
  year =         "2001",
  ISBN =         "1-880446-11-1",
  ISBN-13 =      "978-1-880446-11-9",
  LCCN =         "QA76.73.J38 J42 2001",
  bibdate =      "Tue Oct 15 12:35:06 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/usenix2000.bib",
  URL =          "http://www.usenix.org/publications/library/proceedings/jvm01/",
  acknowledgement = ack-nhfb,
}

@Proceedings{IEEE:2002:STI,
  editor =       "{IEEE}",
  booktitle =    "{SC2002}: From Terabytes to Insight. Proceedings of
                 the {IEEE ACM SC 2002 Conference, November 16--22,
                 2002, Baltimore, MD, USA}",
  title =        "{SC2002}: From Terabytes to Insight. Proceedings of
                 the {IEEE ACM SC 2002 Conference, November 16--22,
                 2002, Baltimore, MD, USA}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "????",
  year =         "2002",
  ISBN =         "0-7695-1524-X",
  ISBN-13 =      "978-0-7695-1524-3",
  LCCN =         "????",
  bibdate =      "Thu Feb 21 18:29:36 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{USENIX:2002:PBF,
  editor =       "{USENIX}",
  booktitle =    "Proceedings of BSDCon 2002: February 11--14, 2002,
                 Cathedral Hill Hotel, San Francisco, CA",
  title =        "Proceedings of {BSDCon} 2002: February 11--14, 2002,
                 Cathedral Hill Hotel, San Francisco, {CA}",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "viii + 151",
  year =         "2002",
  ISBN =         "1-880446-02-2",
  ISBN-13 =      "978-1-880446-02-7",
  LCCN =         "QA76.76.O63 B736 2002",
  bibdate =      "Tue Oct 15 12:45:29 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/usenix2000.bib",
  URL =          "http://www.usenix.org/publications/library/proceedings/bsdcon02/tech.html",
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:2003:ATA,
  editor =       "Allyn Romanow and Jeff Mogul",
  booktitle =    "{Proceedings of the ACM SIGCOMM Workshop on
                 Network-I/O Convergence: experience, Lessons,
                 Implications 2003, Karlsruhe, Germany, August 25--27,
                 2003}",
  title =        "{Proceedings of the ACM SIGCOMM Workshop on
                 Network-I/O Convergence: experience, Lessons,
                 Implications 2003, Karlsruhe, Germany, August 25--27,
                 2003}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "????",
  year =         "2003",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "TK5105.5",
  bibdate =      "Sat Oct 14 14:04:48 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  note =         "ACM order number 534032.",
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:2003:SII,
  editor =       "{ACM}",
  booktitle =    "SC2003: Igniting Innovation. {Phoenix, AZ, November
                 15--21, 2003}",
  title =        "{SC2003}: Igniting Innovation. {Phoenix, AZ, November
                 15--21, 2003}",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  year =         "2003",
  ISBN =         "1-58113-695-1",
  ISBN-13 =      "978-1-58113-695-1",
  LCCN =         "????",
  bibdate =      "Thu Feb 21 18:29:36 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Anonymous:2003:CCV,
  editor =       "Anonymous",
  booktitle =    "Cool Chips VI: An International Symposium on Low-Power
                 and High-Speed Chips, Yokohama Joho Bunka Center,
                 Yokohama, Japan (Yokohama Media \& Communications
                 Center, Yokohama, Japan) April 16--18, 2003",
  title =        "Cool Chips {VI}: An International Symposium on
                 Low-Power and High-Speed Chips, Yokohama Joho Bunka
                 Center, Yokohama, Japan (Yokohama Media \&
                 Communications Center, Yokohama, Japan) April 16--18,
                 2003",
  publisher =    "????",
  address =      "????",
  pages =        "????",
  year =         "2003",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Fri Jan 09 16:53:37 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cool-chips.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Chapman:2005:SMP,
  editor =       "Barbara M. Chapman",
  booktitle =    "{Shared memory parallel programming with OpenMP: 5th
                 International Workshop on OpenMP Applications and
                 Tools, WOMPAT 2004, Houston, TX, USA, May 17--18, 2004:
                 Revised selected papers}",
  title =        "{Shared memory parallel programming with OpenMP: 5th
                 International Workshop on OpenMP Applications and
                 Tools, WOMPAT 2004, Houston, TX, USA, May 17--18, 2004:
                 Revised selected papers}",
  volume =       "3349",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "x + 147",
  year =         "2005",
  CODEN =        "LNCSD9",
  DOI =          "https://doi.org/10.1007/b105895",
  ISBN =         "3-540-24560-X",
  ISBN-13 =      "978-3-540-24560-5",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76 .A1 L42 NO.3349",
  bibdate =      "Thu Jun 2 07:26:02 MDT 2005",
  bibsource =    "clavis.ucalgary.ca:2200/UNICORN;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       ser-LNCS,
  URL =          "http://www.springerlink.com/openurl.asp?genre=issue&issn=0302-9743&volume=3349;
                 http://www.springerlink.com/openurl.asp?genre=volume&id=doi:10.1007/b105895",
  acknowledgement = ack-nhfb,
  meetingname =  "International Workshop on OpenMP Applications and
                 Tools (2004: Houston, Tex.)",
  subject =      "Parallel programming (Computer science); Congresses",
}

@Proceedings{Lathrop:2011:SPI,
  editor =       "Scott Lathrop and Jim Costa and William Kramer",
  booktitle =    "{SC'11: Proceedings of 2011 International Conference
                 for High Performance Computing, Networking, Storage and
                 Analysis, Seattle, WA, November 12--18 2011}",
  title =        "{SC'11: Proceedings of 2011 International Conference
                 for High Performance Computing, Networking, Storage and
                 Analysis, Seattle, WA, November 12--18 2011}",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  year =         "2011",
  ISBN =         "1-4503-0771-X",
  ISBN-13 =      "978-1-4503-0771-0",
  LCCN =         "????",
  bibdate =      "Fri Dec 16 11:11:35 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/supercomputing2011.bib",
  acknowledgement = ack-nhfb,
  xxeditor =     "{ACM}",
}

@Proceedings{Hollingsworth:2012:SPI,
  editor =       "Jeffrey Hollingsworth",
  booktitle =    "{SC '12: Proceedings of the International Conference
                 on High Performance Computing, Networking, Storage and
                 Analysis, Salt Lake Convention Center, Salt Lake City,
                 UT, USA, November 10--16, 2012}",
  title =        "{SC '12: Proceedings of the International Conference
                 on High Performance Computing, Networking, Storage and
                 Analysis, Salt Lake Convention Center, Salt Lake City,
                 UT, USA, November 10--16, 2012}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  year =         "2012",
  ISBN =         "1-4673-0804-8",
  ISBN-13 =      "978-1-4673-0804-5",
  bibdate =      "Thu Nov 15 07:35:55 2012",
  bibsource =    "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  acknowledgement = ack-nhfb,
}