%%% -*-BibTeX-*-
%%% ====================================================================
%%%  BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.81",
%%%     date            = "18 May 2024",
%%%     time            = "16:23:04 MST",
%%%     filename        = "tocs.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "https://www.math.utah.edu/~beebe",
%%%     checksum        = "63380 22260 115602 1165844",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "bibliography, BibTeX, ACM Transactions on
%%%                        Computer Systems",
%%%     license         = "public domain",
%%%     supported       = "no",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        the journal ACM Transactions on Computer
%%%                        Systems (CODEN ACSYEC, ISSN 0734-2071
%%%                        (print), 1557-7333 (electronic)), covering
%%%                        all journal issues from 1983 -- date.
%%%                        Publication began with volume 1, number 1, in
%%%                        1983. The journal appears quarterly, in
%%%                        February, May, August, and November.
%%%
%%%                        The journal has a World-Wide Web site at:
%%%
%%%                            http://www.acm.org/pubs/tocs
%%%
%%%                        Tables-of-contents are available at:
%%%
%%%                            http://www.acm.org/pubs/contents/journals/tocs/
%%%                            http://portal.acm.org/browse_dl.cfm?idx=J774
%%%
%%%                        There is currently coverage of all volumes,
%%%                        except 1 and 2 (1983--1984), at that site.
%%%
%%%                        Qualified subscribers can retrieve the full
%%%                        text of recent articles in PDF form.
%%%
%%%                        At version 1.81, the COMPLETE journal
%%%                        coverage looked like this:
%%%
%%%                             1983 (  24)    1997 (  16)    2011 (  12)
%%%                             1984 (  22)    1998 (  11)    2012 (  15)
%%%                             1985 (  15)    1999 (   9)    2013 (  12)
%%%                             1986 (  15)    2000 (  12)    2014 (   9)
%%%                             1987 (  18)    2001 (  13)    2015 (  13)
%%%                             1988 (  18)    2002 (  12)    2016 (  12)
%%%                             1989 (  13)    2003 (  13)    2017 (  13)
%%%                             1990 (  14)    2004 (  10)    2018 (   3)
%%%                             1991 (  15)    2005 (  13)    2019 (   9)
%%%                             1992 (  13)    2006 (  12)    2020 (   6)
%%%                             1993 (  13)    2007 (  11)    2021 (  19)
%%%                             1994 (  11)    2008 (  10)    2022 (   2)
%%%                             1995 (  12)    2009 (   8)    2023 (   5)
%%%                             1996 (  14)    2010 (   9)    2024 (   4)
%%%
%%%                             Article:        509
%%%                             Proceedings:      1
%%%
%%%                             Total entries:  510
%%%
%%%                        The initial draft was extracted from the
%%%                        ACM Web site, with manual corrections and
%%%                        additions from bibliographies in the TeX
%%%                        User Group collection, the author's
%%%                        personal bibliography files, the Compendex
%%%                        database, and a very large computer science
%%%                        bibliography collection on ftp.ira.uka.de
%%%                        in /pub/bibliography to which many people
%%%                        have contributed.  Where multiple
%%%                        sources of a particular entry existed,
%%%                        field values have been manually merged to
%%%                        preserve maximal information.  Missing
%%%                        entries were identified by software
%%%                        developed for the TeX User Group and BibNet
%%%                        bibliography archive projects, and were
%%%                        then supplied from the original journal
%%%                        issues.  Questions arising from conflicting
%%%                        data were resolved by consulting the
%%%                        original journal issues.
%%%
%%%                        ACM copyrights explicitly permit abstracting
%%%                        with credit, so article abstracts, keywords,
%%%                        and subject classifications have been
%%%                        included in this bibliography wherever
%%%                        available.  Article reviews have been
%%%                        omitted, until their copyright status has
%%%                        been clarified.
%%%
%%%                        The bibsource keys in the bibliography
%%%                        entries below indicate the data sources,
%%%                        usually the Karlsruhe computer science
%%%                        bibliography archive for the first two
%%%                        volumes, or the journal Web site or the
%%%                        Compendex database, both of which lack
%%%                        coverage of this journal before 1985.
%%%
%%%                        URL keys in the bibliography point to
%%%                        World Wide Web locations of additional
%%%                        information about the entry.
%%%
%%%                        Spelling has been verified with the UNIX
%%%                        spell and GNU ispell programs using the
%%%                        exception dictionary stored in the
%%%                        companion file with extension .sok.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
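%%%
%%%                        For illustration only (a rough Python
%%%                        sketch, not the BibNet software itself;
%%%                        the stop-word list and the three-initial
%%%                        rule are assumptions), a tag such as
%%%                        Clark:1983:CPV could be built like this:
%%%
%%%                            def citation_tag(family, year, title):
%%%                                # Initials of the first three
%%%                                # significant title words.
%%%                                stop = {"a", "an", "and", "for",
%%%                                        "in", "of", "on", "the", "to"}
%%%                                words = [w for w in title.split()
%%%                                         if w.lower() not in stop]
%%%                                abbrev = "".join(w[0].upper()
%%%                                                 for w in words[:3])
%%%                                return "%s:%d:%s" % (family, year,
%%%                                                     abbrev)
%%%
%%%                            # citation_tag("Clark", 1983,
%%%                            #   "Cache Performance in the VAX-11/780")
%%%                            # yields the tag Clark:1983:CPV.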
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, using ``bibsort -byvolume.''
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility.",
%%%  }
%%% ====================================================================
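%%% The checksum field in the header above holds a CRC-16 value followed by
%%% the equivalent of UNIX wc output: counts of lines, words, and characters.
%%% As a rough illustration (a Python sketch of the wc-style counts only;
%%% this is not Robert Solovay's checksum utility, and the CRC-16 step is
%%% omitted):
%%%
%%%     with open("tocs.bib", "rb") as f:    # file name as in this header
%%%         data = f.read()
%%%     lines = data.count(b"\n")            # wc -l
%%%     words = len(data.split())            # wc -w
%%%     chars = len(data)                    # wc -c (bytes)
%%%     print(lines, words, chars)
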
@Preamble{
    "\input bibnames.sty"
  # "\ifx \undefined \circled \def \circled #1{(#1)}\fi"
  # "\ifx \undefined \reg \def \reg {\circled{R}}\fi"
  # "\ifx \undefined \TM \def \TM {${}^{\sc TM}$} \fi"
}
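
%%% The \ifx \undefined ... \fi guards in the preamble above supply fallback
%%% definitions of \circled, \reg, and \TM only when no definition exists yet,
%%% so documents that already define these macros keep their own versions.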

%%% ====================================================================
%%% Acknowledgement abbreviations:
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|https://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:
@String{j-TOCS                  = "ACM Transactions on Computer Systems"}

%%% ====================================================================
%%% Publisher abbreviations:
@String{pub-ACM                 = "ACM Press"}
@String{pub-ACM:adr             = "New York, NY 10036, USA"}

%%% ====================================================================
%%% Bibliography entries:
@Article{Jones:1983:EI,
  author =       "Anita K. Jones",
  title =        "{Editor}'s Introduction",
  journal =      j-TOCS,
  volume =       "1",
  number =       "1",
  pages =        "1--2",
  month =        feb,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "TOCS",
}

@Article{Reed:1983:IAA,
  author =       "David P. Reed",
  title =        "Implementing Atomic Actions on Decentralized Data",
  journal =      j-TOCS,
  volume =       "1",
  number =       "1",
  pages =        "3--23",
  month =        feb,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "TOCS",
}

@Article{Clark:1983:CPV,
  author =       "Douglas W. Clark",
  key =          "Clark",
  title =        "Cache Performance in the {VAX-11\slash 780}",
  journal =      j-TOCS,
  volume =       "1",
  number =       "1",
  pages =        "24--37",
  month =        feb,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Oct 12 13:58:27 1984",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Math/sparse.linear.systems.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Os/storage.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "The performance of memory caches is usually studied
                 through trace-driven simulation. This approach has
                 several drawbacks. Notably, it excludes realistic
                 multiprogramming, operating system, and I/O activity.
                 In this paper, cache performance is studied by direct
                 measurement of the hardware. A hardware monitor was
                 attached to a VAX-11/780 computer, whose cache was then
                 measured during normal use. A reproducible synthetic
                 timesharing workload was also run. This paper reports
                 measurements including the hit ratios of data and
                 instruction references, the rate of cache invalidations
                 by I/O, and the amount of waiting time due to cache
                 misses. Additional measurements were made with half the
                 cache disabled, and with the entire cache disabled.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "TOCS",
}

@Article{Shamir:1983:GCS,
  author =       "Adi Shamir",
  title =        "On the Generation of Cryptographically Strong
                 Pseudorandom Sequences",
  journal =      j-TOCS,
  volume =       "1",
  number =       "1",
  pages =        "38--44",
  month =        feb,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "TOCS",
}

@Article{Cox:1983:ICP,
  author =       "George W. Cox and William M. Corwin and Konrad K. Lai
                 and Fred J. Pollack",
  title =        "Interprocess Communication and Processor Dispatching
                 on the {Intel 432}",
  journal =      j-TOCS,
  volume =       "1",
  number =       "1",
  pages =        "45--66",
  month =        feb,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Sep 9 09:46:02 1986",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Os/os.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Parallel/Multi.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "A unified facility for interprocess communication and
                 processor dispatching on the Intel 432 is described.
                 The facility is based on a queuing and binding
                 mechanism called a port. The goals and motivations for
                 ports, both abstract and implementation views of them,
                 and their absolute and comparative performance are
                 described.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "TOCS",
  owner =        "seufert",
}

@Article{Sauer:1983:CAS,
  author =       "Charles H. Sauer",
  title =        "Computational Algorithms for State-Dependent Queueing
                 Networks",
  journal =      j-TOCS,
  volume =       "1",
  number =       "1",
  pages =        "67--92",
  month =        feb,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Distributed/QLD/1983.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  note =         "See corrigendum \cite{Sauer:1983:CCA}.",
  acknowledgement = ack-nhfb,
  annote =       "\ldots{} in this paper the author limits the material
                 reviewed to three forms of state dependency in queueing
                 networks which have the product form. The major part of
                 the paper addresses state-dependent routing, in which the
                 probability of entering a queue of a subnetwork depends
                 upon the quotient of a linear function of the number of
                 customers in that queue and another linear function of
                 the total number of customers in the subnetwork \ldots{}",
  country =      "USA",
  date =         "28/09/84",
  descriptors =  "Queueing network; method; state dependent queueing;
                 MVA; CONVOLUTION ALGORITHM",
  enum =         "2690",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "TOCS",
  language =     "English",
  location =     "RWTH-AC-DFV: TELL",
  references =   "26",
  revision =     "21/04/91",
}

@Article{Anonymous:1983:IA,
  author =       "Anonymous",
  title =        "Information for Authors",
  journal =      j-TOCS,
  volume =       "1",
  number =       "1",
  pages =        "93--95",
  month =        feb,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:18:40 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Schwetman:1983:PSI,
  author =       "Herbert D. Schwetman",
  title =        "Preface to the Special Issue",
  journal =      j-TOCS,
  volume =       "1",
  number =       "2",
  pages =        "97--98",
  month =        may,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:18:40 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Eager:1983:PBH,
  author =       "Derek L. Eager and Kenneth C. Sevcik",
  title =        "Performance Bound Hierarchies for Queueing Networks",
  journal =      j-TOCS,
  volume =       "1",
  number =       "2",
  pages =        "99--115",
  month =        may,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:18:40 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Agrawal:1983:ASM,
  author =       "Subhash C. Agrawal and Jeffrey P. Buzen",
  title =        "The Aggregate Server Method for Analyzing
                 Serialization Delays in Computer Systems",
  journal =      j-TOCS,
  volume =       "1",
  number =       "2",
  pages =        "116--143",
  month =        may,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Distributed/QLD/1982.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Os/IMMD_IV.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  annote =       "An approximate, iterative method is presented to
                 estimate the delays caused by programs waiting to enter
                 critical sections and other software control structures
                 in which mutual exclusion is enforced (i.e.,
                 one-at-a-time or serialized processing). Some common
                 sources of such serialization delays include routines
                 that perform resource allocation, modify internal data
                 structures, or update external files and
                 databases \ldots{}",
  country =      "USA",
  date =         "02/12/83",
  descriptors =  "Queueing approximation; process management; aggregate
                 server method; serialization; resource allocation",
  enum =         "38",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  language =     "English",
  location =     "RWTH-AC-DFV: Bibl.",
  references =   "0",
  revision =     "19/03/92",
}

@Article{Chandy:1983:DDD,
  author =       "K. Mani Chandy and Laura M. Haas and Jayadev Misra",
  title =        "Distributed Deadlock Detection",
  journal =      j-TOCS,
  volume =       "1",
  number =       "2",
  pages =        "144--156",
  month =        may,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Distributed/QLD/1983.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Misc/Discrete.event.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  annote =       "Distributed deadlock models are presented for resource
                 and communication deadlocks. Simple distributed
                 algorithms for detection of these deadlocks are given.
                 We show that all true deadlocks are detected and that
                 no false deadlocks are reported. In our algorithms, no
                 process maintains global information; all messages have
                 an identical short length. The algorithms can be
                 applied in distributed database and other message
                 communication systems.",
  country =      "USA",
  date =         "00/00/00",
  descriptors =  "DISTRIBUTED SIMULATION; COMPUTER NETWORK; DEADLOCK",
  enum =         "8087",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "TOCS",
  language =     "English",
  location =     "UniS-IND-DS_C",
  references =   "15",
  revision =     "19/10/93",
  xxnote =       "Check author order??",
}

@Article{Cappello:1983:VLP,
  author =       "Peter R. Cappello and Kenneth Steiglitz",
  title =        "A {VLSI} Layout for a Pipelined {Dadda} Multiplier",
  journal =      j-TOCS,
  volume =       "1",
  number =       "2",
  pages =        "157--174",
  month =        may,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Math/computer.arithmetic.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  note =         "Reprinted in E. E. Swartzlander, {\em Computer
                 Arithmetic}, Vol. 2, IEEE Computer Society Press
                 Tutorial, Los Alamitos, CA, 1990.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "multiplication",
}

@Article{Blum:1983:HES,
  author =       "Manuel Blum",
  title =        "How to Exchange (Secret) Keys",
  journal =      j-TOCS,
  volume =       "1",
  number =       "2",
  pages =        "175--193",
  month =        may,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Theory/crypto.security.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  note =         "Previously published in ACM STOC '83 proceedings,
                 pages 440--447.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Hoshino:1983:PPM,
  author =       "Tsutomu Hoshino and Toshio Kawai and Tomonori
                 Shirakawa and Junichi Higashino and Akira Yamaoka and
                 Hachidai Ito and Takashi Sato and Kazuo Sawada",
  title =        "{PACS}: a Parallel Microprocessor Array for Scientific
                 Calculations",
  journal =      j-TOCS,
  volume =       "1",
  number =       "3",
  pages =        "195--221",
  month =        aug,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Parallel/ovr.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Schlichting:1983:FSP,
  author =       "Richard D. Schlichting and Fred B. Schneider",
  title =        "Fail-Stop Processors: An Approach to Designing
                 Fault-Tolerant Computing Systems",
  journal =      j-TOCS,
  volume =       "1",
  number =       "3",
  pages =        "222--238",
  month =        aug,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Distributed/distfs.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/SE/dependability.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "TOCS",
}

@Article{Akl:1983:CSP,
  author =       "Selim G. Akl and Peter D. Taylor",
  title =        "Cryptographic Solution to a Problem of Access Control
                 in a Hierarchy",
  journal =      j-TOCS,
  volume =       "1",
  number =       "3",
  pages =        "239--248",
  month =        aug,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:18:40 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Bauer:1983:KDP,
  author =       "R. K. Bauer and T. A. Berson and R. J. Feiertag",
  title =        "A Key Distribution Protocol Using Event Markers",
  journal =      j-TOCS,
  volume =       "1",
  number =       "3",
  pages =        "249--255",
  month =        aug,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Misc/misc.1.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Kemmerer:1983:SRM,
  author =       "Richard A. Kemmerer",
  title =        "Shared Resource Matrix Methodology: An Approach to
                 Identifying Storage and Timing Channels",
  journal =      j-TOCS,
  volume =       "1",
  number =       "3",
  pages =        "256--277",
  month =        aug,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:18:40 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Jones:1983:PSI,
  author =       "Anita K. Jones",
  title =        "Preface to Special Issue",
  journal =      j-TOCS,
  volume =       "1",
  number =       "4",
  pages =        "279--280",
  month =        nov,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:18:40 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Strecker:1983:TBC,
  author =       "William D. Strecker",
  title =        "Transient Behavior of Cache Memories",
  journal =      j-TOCS,
  volume =       "1",
  number =       "4",
  pages =        "281--293",
  month =        nov,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:18:40 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Kobayashi:1983:ORC,
  author =       "Hiroshi Kobayashi and Mario Gerla",
  title =        "Optimal Routing in Closed Queueing Networks",
  journal =      j-TOCS,
  volume =       "1",
  number =       "4",
  pages =        "294--310",
  month =        nov,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Distributed/QLD/1983.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  annote =       "\ldots{} This paper addresses the problem of obtaining
                 the set of routing probabilities that will minimize
                 response time, or alternatively maximize the
                 throughput. An algorithm, called the flow deviation
                 (FD) algorithm, is already known for obtaining the
                 optimal routing probabilities for open queueing network
                 models \ldots{}",
  country =      "USA",
  date =         "28/11/84",
  descriptors =  "Closed queueing network; routing algorithm",
  enum =         "1726",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  language =     "English",
  location =     "RWTH-AC-DFV: Bibl.",
  references =   "0",
  revision =     "21/04/91",
}

@Article{Sloan:1983:MEB,
  author =       "Lansing Sloan",
  title =        "Mechanisms that Enforce Bounds on Packet Lifetimes",
  journal =      j-TOCS,
  volume =       "1",
  number =       "4",
  pages =        "311--330",
  month =        nov,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:18:40 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Shankar:1983:HPS,
  author =       "A. Udaya Shankar and Simon S. Lam",
  title =        "An {HDLC} Protocol Specification and Its Verification
                 Using Image Protocols",
  journal =      j-TOCS,
  volume =       "1",
  number =       "4",
  pages =        "331--368",
  month =        nov,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:18:40 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Sauer:1983:CCA,
  author =       "Charles H. Sauer",
  title =        "Corrigendum: Computational Algorithms for
                 State-Dependent Queueing Networks",
  journal =      j-TOCS,
  volume =       "1",
  number =       "4",
  pages =        "369--369",
  month =        nov,
  year =         "1983",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Distributed/QLD/1983.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  note =         "See \cite{Sauer:1983:CAS}.",
  acknowledgement = ack-nhfb,
  country =      "USA",
  date =         "13/05/93",
  descriptors =  "Queueing network; product form; analysis",
  enum =         "7840",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  language =     "English",
  location =     "SEL: Wi",
  references =   "0",
  revision =     "16/01/94",
}

@Article{Anonymous:1984:I,
  author =       "Anonymous",
  title =        "Index",
  journal =      j-TOCS,
  volume =       "1",
  number =       "4",
  pages =        "370--371",
  month =        nov,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:18:40 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Jones:1984:PSI,
  author =       "Anita K. Jones",
  title =        "Preface to Special Issue",
  journal =      j-TOCS,
  volume =       "2",
  number =       "1",
  pages =        "1--1",
  month =        feb,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:18:40 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Schroeder:1984:EGG,
  author =       "Michael D. Schroeder and Andrew D. Birrell and Roger
                 M. Needham",
  title =        "Experience with {Grapevine}: The Growth of a
                 Distributed System",
  journal =      j-TOCS,
  volume =       "2",
  number =       "1",
  pages =        "3--23",
  month =        feb,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 15 14:49:51 1987",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Database/Wiederhold/1984.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Distributed/Danzig.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Os/os.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Grapevine is a distributed, replicated system that
                 provides message delivery, naming, authentication,
                 resource location, and access control services in an
                 internet of computers. The system, described in a
                 previous paper [1], was designed and implemented
                 several years ago. We now have had operational
                 experience with the system under substantial load. In
                 this paper we report on what we have learned from using
                 Grapevine.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "Design; experimentation; Grapevine; reliability; TOCS
                 operating systems distributed systems database
                 systems",
  owner =        "manning",
}

@Article{Lindsay:1984:CCR,
  author =       "Bruce G. Lindsay and Laura M. Haas and C. Mohan and
                 Paul F. Wilms and Robert A. Yost",
  title =        "Computation and Communication in {R$^*$}: a Distributed
                 Database Manager",
  journal =      j-TOCS,
  volume =       "2",
  number =       "1",
  pages =        "24--38",
  month =        feb,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Database/Wiederhold/1984.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  note =         "Also published in/as: SOSP 9, Bretton Woods, Oct.
                 1983.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "Rstar; TOCS",
}

@Article{Birrell:1984:IRP,
  author =       "Andrew D. Birrell and Bruce Jay Nelson",
  key =          "Birrell \& Nelson",
  title =        "Implementing Remote Procedure Calls",
  journal =      j-TOCS,
  volume =       "2",
  number =       "1",
  pages =        "39--59",
  month =        feb,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 15 14:59:58 1987",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Compiler/bcp.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Distributed/networks.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Misc/misc.1.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Os/IMMD_IV.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Os/os.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/SE/dependability.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Remote procedure calls (RPC) appear to be a useful
                 paradigm for providing communication across a network
                 between programs written in a high-level language. This
                 paper describes a package providing a remote procedure
                 call facility, the options that face the designer of
                 such a package, and the decisions we made. We describe
                 the overall structure of our RPC mechanism, our
                 facilities for binding RPC clients, the transport
                 level, communication protocol, and some performance
                 measurements. We include descriptions of some
                 optimizations used to achieve high performance and to
                 minimize the load on server machines that have many
                 clients.",
  acknowledgement = ack-nhfb,
  checked =      "yes",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "Design; distributed naming and binding;
                 experimentation; inter-process communication;
                 performance; performance of communication protocols;
                 remote procedure calls; RPC, Cedar; RPC, transport
                 layer protocol; security; TOCS; transport layer
                 protocols",
  memos =        "The idea of RPC was first suggested by J. E. White in
                 a paper entitled ``A high-level framework for
                 network-based resource sharing'' in the Proceedings of
                 the National Computer Conference in June 1976. The
                 implementation of RPC described in the paper is the one
                 from the {\em Cedar\/} project at Xerox.",
  owner =        "manning",
}

@Article{Berkovich:1984:CCT,
  author =       "Simon Y. Berkovich and Colleen Roe Wilson",
  title =        "A Computer Communication Technique Using
                 Content-Induced Transaction Overlap",
  journal =      j-TOCS,
  volume =       "2",
  number =       "1",
  pages =        "60--77",
  month =        feb,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "TOCS",
}

@Article{Kameda:1984:OCP,
  author =       "Hisao Kameda",
  title =        "Optimality of a Central Processor Scheduling Policy
                 for Processing a Job Stream",
  journal =      j-TOCS,
  volume =       "2",
  number =       "1",
  pages =        "78--90",
  month =        feb,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Os/IMMD_IV.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "TOCS",
}

@Article{Smith:1984:PSI,
  author =       "Alan Jay Smith",
  title =        "Preface to Special Issue",
  journal =      j-TOCS,
  volume =       "2",
  number =       "2",
  pages =        "91--92",
  month =        may,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:18:40 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Marsan:1984:CGS,
  author =       "Marco Ajmone Marsan and Gianni Conte and Gianfranco
                 Balbo",
  title =        "A Class of Generalized Stochastic {Petri} Nets for the
                 Performance Evaluation of Multiprocessor Systems",
  journal =      j-TOCS,
  volume =       "2",
  number =       "2",
  pages =        "93--122",
  month =        may,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Misc/Discrete.event.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Os/IMMD_IV.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/SE/uni-do.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "GSPN",
}

@Article{Tantawi:1984:PAC,
  author =       "Asser N. Tantawi and Manfred Ruschitzka",
  title =        "Performance Analysis of Checkpointing Strategies",
  journal =      j-TOCS,
  volume =       "2",
  number =       "2",
  pages =        "123--144",
  month =        may,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:18:40 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Schneider:1984:BGA,
  author =       "Fred B. Schneider",
  title =        "{Byzantine} Generals in Action: Implementing Fail-Stop
                 Processors",
  journal =      j-TOCS,
  volume =       "2",
  number =       "2",
  pages =        "145--154",
  month =        may,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Distributed/distfs.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Os/IMMD_IV.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/SE/dependability.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "TOCS",
}

@Article{Stamos:1984:SGS,
  author =       "James W. Stamos",
  title =        "Static Grouping of Small Objects to Enhance
                 Performance of a Paged Virtual Memory",
  journal =      j-TOCS,
  volume =       "2",
  number =       "2",
  pages =        "155--180",
  month =        may,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Compiler/gc.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Parallel/distmem.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Smalltalk is an object-oriented, interactive
                 programming environment that maintains state between
                 user sessions. Because of the persistence of objects,
                 it is possible to use program restructuring techniques
                 to statically relocate objects in virtual memory.
                 Grouping related objects on the same disk page
                 increases locality of reference, reduces the number of
                 page faults, and improves performance. Five types of
                 static grouping algorithms are presented, along with
                 the static analysis performed on their outputs and
                 empirical evidence of their performance.",
  acknowledgement = ack-nhfb,
  comment =      "Using the garbage collector to improve performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "clustering TOCS",
}

@Article{McKusick:1984:FFS,
  author =       "Marshall K. McKusick and William N. Joy and Sam J.
                 Leffler and Robert S. Fabry",
  key =          "McKusick et al.",
  title =        "A Fast File System for {UNIX}",
  journal =      j-TOCS,
  volume =       "2",
  number =       "3",
  pages =        "181--197",
  month =        aug,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Feb 7 10:11:41 1985",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Os/unix.1.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "A reimplementation of the UNIX file system is
                 described. The reimplementation provides substantially
                 higher throughput rates by using more flexible
                 allocation policies that allow better locality of
                 reference and can be adapted to a wide range of
                 peripheral and processor characteristics. The new file
                 system clusters data that is sequentially accessed and
                 provides two block sizes to allow fast access to large
                 files while not wasting large amounts of space for
                 small files. File access rates of up to ten times
                 faster than the traditional UNIX file system are
                 experienced. Long-needed enhancements to the
                 programmers' interface are discussed. These include a
                 mechanism to place advisory locks on files, extensions
                 of the name space across file systems, the ability to
                 use long file names, and provisions for administrative
                 control of resource usage.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "TOCS; UNIX, file system organization, file system
                 performance, file system design, application program
                 interface",
}

@Article{Landwehr:1984:SMM,
  author =       "Carl E. Landwehr and Constance L. Heitmeyer and John
                 McLean",
  title =        "A Security Model for Military Message Systems",
  journal =      j-TOCS,
  volume =       "2",
  number =       "3",
  pages =        "198--222",
  month =        aug,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Wiederhold/1984.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Misc/bibsec.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Schwarz:1984:SSA,
  author =       "Peter M. Schwarz and Alfred Z. Spector",
  title =        "Synchronizing Shared Abstract Types",
  journal =      j-TOCS,
  volume =       "2",
  number =       "3",
  pages =        "223--250",
  month =        aug,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:18:40 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Chang:1984:RBP,
  author =       "Jo-Mei Chang and N. F. Maxemchuk",
  title =        "Reliable Broadcast Protocols",
  journal =      j-TOCS,
  volume =       "2",
  number =       "3",
  pages =        "251--273",
  month =        aug,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Compiler/gc.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Misc/misc.1.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/SE/dependability.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  comment =      "Atomic multicast protocol.",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "Consensus / Broadcast; Multicast; TOCS operating
                 distributed systems reliability networks
                 communication",
}

@Article{Anonymous:1984:IA,
  author =       "Anonymous",
  title =        "Information for Authors",
  journal =      j-TOCS,
  volume =       "2",
  number =       "3",
  pages =        "274--276",
  month =        aug,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:18:40 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Saltzer:1984:EEA,
  author =       "J. H. Saltzer and D. P. Reed and D. D. Clark",
  key =          "Saltzer et al.",
  title =        "End-to-End Arguments in System Design",
  journal =      j-TOCS,
  volume =       "2",
  number =       "4",
  pages =        "277--288",
  month =        nov,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Mar 6 11:12:06 1985",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Misc/digital.library.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Os/os.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/SE/dependability.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "This paper presents a design principle that helps
                 guide placement of functions among the modules of a
                 distributed computer system. The principle, called the
                 end-to-end argument, suggests that functions placed at
                 low levels of a system may be redundant or of little
                 value when compared with the cost of providing them at
                 that low level. Examples discussed in the paper include
                 bit-error recovery, security using encryption,
                 duplicate message suppression, recovery from system
                 crashes, and delivery acknowledgement. Low-level
                 mechanisms to support these functions are justified
                 only as performance enhancements.",
  acknowledgement = ack-nhfb,
  comments =     "Argues that you should put functionality at the higher
                 app layers, rather than at low layers. Includes a
                 security example",
  entered-by =   "Andreas Paepcke",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "data communication; design; design principles;
                 protocol design; TOCS",
}

@Article{Smith:1984:DAE,
  author =       "James E. Smith",
  title =        "Decoupled Access\slash Execute Computer
                 Architectures",
  journal =      j-TOCS,
  volume =       "2",
  number =       "4",
  pages =        "289--308",
  month =        nov,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Math/sparse.linear.systems.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "TOCS",
}

@Article{Tichy:1984:SSC,
  author =       "Walter F. Tichy",
  title =        "The String-to-String Correction Problem with Block
                 Moves",
  journal =      j-TOCS,
  volume =       "2",
  number =       "4",
  pages =        "309--321",
  month =        nov,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Misc/allison.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Misc/protein.pattern.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Uses block moves as the edit operation and seeks the
                 minimum number of them; an $O(n)$ (linear-time)
                 algorithm is obtained if P. Weiner's data structure is
                 used. $S$ is the source string, $T$ the target string.
                 Algorithm: find the longest prefix of $T$ that is a
                 substring of $S$; this gives the first block move;
                 repeat until done. Proof: by induction on the number
                 of block moves. With one block move, $T$ must
                 obviously be a substring of $S$, and the algorithm
                 finds it. Suppose the optimum is $i$ block moves and
                 the algorithm fails, i.e., finds $j > i$ moves.
                 T:----Opt1----$|$--Opt2--$|$---Opt3---$|$.....$|$--Opti--
                 T:-----Alg1-----$|$---Alg2---$|$.....$|$-----Algj------
                 NB. $|$Alg1$|$ $\geq$ $|$Opt1$|$. Delete the substring
                 Alg1. By induction, the algorithm would find the
                 optimal explanation of the rest of $T$, but it does
                 not: a contradiction.",
  acknowledgement = ack-nhfb,
  comment =      "``An algorithm that produces the shortest edit
                 sequence transforming one string into another is
                 presented. The algorithm is optimal in the sense that
                 it generated a minimal covering set of common
                 substrings of one string with respect to another. Two
                 improvements of the basic algorithm are developed. The
                 first improvement performs well on strings with few
                 replicated symbols. The second improvement runs in time
                 and space linear to the size of the input. Efficient
                 algorithms for regenerating a string from an edit
                 sequence are also presented.'' longest common
                 sequence",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "longest common subsequence, LCS, LCSS, edit distance,
                 block, move, TOCS, string to strings, sequence,
                 alignment, linear, algorithm; TOCS",
}
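
%%% The note in the abstract above describes a greedy construction:
%%% repeatedly take the longest prefix of the remaining target that occurs
%%% in the source.  A minimal illustrative Python sketch (not Tichy's
%%% implementation, which reaches linear time via P. Weiner's data
%%% structure; the handling of symbols absent from S is an assumption here):
%%%
%%%     def block_moves(S, T):
%%%         """Greedily cover T with the longest prefixes found in S."""
%%%         moves, i = [], 0
%%%         while i < len(T):
%%%             k = 0
%%%             # Longest prefix of T[i:] that is a substring of S
%%%             # (naive scan; a suffix structure makes this linear).
%%%             while i + k < len(T) and T[i:i + k + 1] in S:
%%%                 k += 1
%%%             if k == 0:
%%%                 moves.append(("add", T[i]))   # symbol not in S
%%%                 i += 1
%%%             else:
%%%                 moves.append(("copy", S.index(T[i:i + k]), k))
%%%                 i += k
%%%         return moves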

@Article{Rom:1984:OSC,
  author =       "Raphael Rom",
  title =        "Ordering Subscribers on Cable Networks",
  journal =      j-TOCS,
  volume =       "2",
  number =       "4",
  pages =        "322--334",
  month =        nov,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "TOCS",
}

@Article{Bryant:1984:MPA,
  author =       "Raymond M. Bryant and Anthony E. Krzesinski and M.
                 Seetha Lakshmi and K. Mani Chandy",
  title =        "The {MVA} Priority Approximation",
  journal =      j-TOCS,
  volume =       "2",
  number =       "4",
  pages =        "335--359",
  month =        nov,
  year =         "1984",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 11:57:59 1999",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Database/Graefe.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Os/IMMD_IV.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "TOCS",
}

@Article{Birrell:1985:SCU,
  author =       "Andrew D. Birrell",
  title =        "Secure Communication Using Remote Procedure Calls",
  journal =      j-TOCS,
  volume =       "3",
  number =       "1",
  pages =        "1--14",
  month =        feb,
  year =         "1985",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1985-3-1/p1-birrell/",
  abstract =     "Research on encryption-based secure communication
                 protocols has reached a stage where it is feasible to
                 construct end-to-end secure protocols. The design of
                 such a protocol, built as part of a remote procedure
                 call package, is described. The security abstraction
                 presented to users of the package, the authentication
                 mechanisms, and the protocol for encrypting and
                 verifying remote calls are also described.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Xerox Corp",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "computer networks; cryptography; design;
                 experimentation; Protocols; remote procedure calls;
                 secure communication; security",
  subject =      "{\bf D.4.6} Software, OPERATING SYSTEMS, Security and
                 Protection, Cryptographic controls. {\bf C.2.0}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, General, Security and protection (e.g.,
                 firewalls). {\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols,
                 Protocol architecture.",
}

@Article{Skeen:1985:DLP,
  author =       "Dale Skeen",
  title =        "Determining the Last Process to Fail",
  journal =      j-TOCS,
  volume =       "3",
  number =       "1",
  pages =        "15--30",
  month =        feb,
  year =         "1985",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1985-3-1/p15-skeen/",
  abstract =     "A total failure occurs whenever all processes
                 cooperatively executing a distributed task fail before
                 the task completes. A frequent prerequisite for
                 recovery from a total failure is identification of the
                 last set (LAST) of processes to fail. Necessary and
                 sufficient conditions are derived here for computing
                 LAST from the local failure data of recovered
                 processes. These conditions are then translated into
                 procedures for deciding LAST membership, using either
                 complete or incomplete failure data. The choice of
                 failure data is itself dictated by two requirements:
                 (1) it can be cheaply maintained, and (2) it must
                 afford maximum fault-tolerance in the sense that the
                 expected number of recoveries required for identifying
                 LAST is minimized.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Cornell Univ, Ithaca, NY, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; computer systems, digital; cooperative
                 processes; database systems --- Distributed;
                 Distributed; event ordering; reliability; total
                 failure",
  subject =      "{\bf D.4.5} Software, OPERATING SYSTEMS, Reliability,
                 Fault-tolerance. {\bf C.2.4} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems. {\bf C.4} Computer Systems
                 Organization, PERFORMANCE OF SYSTEMS, Reliability,
                 availability, and serviceability. {\bf D.4.5} Software,
                 OPERATING SYSTEMS, Reliability, Checkpoint/restart.
                 {\bf H.2.2} Information Systems, DATABASE MANAGEMENT,
                 Physical Design, Recovery and restart.",
}

@Article{Clark:1985:PVT,
  author =       "Douglas W. Clark and Joel S. Emer",
  title =        "Performance of the {VAX-11\slash 780} Translation
                 Buffer: Simulation and Measurement",
  journal =      j-TOCS,
  volume =       "3",
  number =       "1",
  pages =        "31--62",
  month =        feb,
  year =         "1985",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1985-3-1/p31-clark/",
  abstract =     "A virtual-address translation buffer (TB) is a
                 hardware cache of recently used virtual-to-physical
                 address mappings. The authors present the results of a
                 set of measurements and simulations of translation
                 buffer performance in the VAX-11\slash 780. Two
                 different hardware monitors were attached to
                 VAX-11\slash 780 computers, and translation buffer
                 behavior was measured. Measurements were made under
                 normal time-sharing use and while running reproducible
                 synthetic time-sharing work loads. Reported
                 measurements include the miss ratios of data and
                 instruction references, the rate of TB invalidations
                 due to context switches, and the amount of time taken
                 to service TB misses. Additional hardware measurements
                 were made with half the TB disabled. Trace-driven
                 simulations of several programs were also run; the
                 traces captured system activity as well as user-mode
                 execution. Several variants of the 11\slash 780 TB
                 structure were simulated.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Digital Equipment Corp, Littleton, MA, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "cache memories; computer simulation; computers,
                 digital --- Performance; data storage units; design;
                 experimentation; hardware monitor; measurement;
                 performance; trace-driven simulation; translation
                 buffer",
  subject =      "{\bf C.1.1} Computer Systems Organization, PROCESSOR
                 ARCHITECTURES, Single Data Stream Architectures, VAX.
                 {\bf B.3.2} Hardware, MEMORY STRUCTURES, Design Styles,
                 Associative memories. {\bf B.3.2} Hardware, MEMORY
                 STRUCTURES, Design Styles, Cache memories. {\bf B.3.2}
                 Hardware, MEMORY STRUCTURES, Design Styles, Virtual
                 memory. {\bf B.3.3} Hardware, MEMORY STRUCTURES,
                 Performance Analysis and Design Aids**, Simulation**.",
}
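
%%% A toy trace-driven simulation in the spirit of the measurements above
%%% (not the VAX-11/780 TB itself; the function name, the direct-mapped
%%% organization, and the 64-entry size are illustrative assumptions):
%%%
%%%     def tlb_miss_ratio(page_trace, entries=64):
%%%         """Replay a trace of virtual page numbers through a direct-mapped TLB."""
%%%         tlb = [None] * entries            # slot -> cached virtual page number
%%%         misses = 0
%%%         for vpn in page_trace:
%%%             slot = vpn % entries
%%%             if tlb[slot] != vpn:          # miss: refill this slot
%%%                 misses += 1
%%%                 tlb[slot] = vpn
%%%         return misses / len(page_trace)
%%%
%%%     # Example: byte addresses reduced to 512-byte VAX page numbers.
%%%     addresses = [0x200, 0x204, 0x400, 0x200, 0x80000, 0x400]
%%%     print(tlb_miss_ratio([a // 512 for a in addresses]))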

@Article{Chandy:1985:DSD,
  author =       "K. Mani Chandy and Leslie Lamport",
  title =        "Distributed Snapshots: Determining Global States of
                 Distributed Systems",
  journal =      j-TOCS,
  volume =       "3",
  number =       "1",
  pages =        "63--75",
  month =        feb,
  year =         "1985",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1985-3-1/p63-chandy/",
  abstract =     "This paper presents an algorithm by which a process in
                 a distributed system determines a global state of the
                 system during a computation. Many problems in
                 distributed systems can be cast in terms of the problem
                 of detecting global states. For instance, the global
                 state detection algorithm helps to solve an important
                 class of problems: stable property detection. A stable
                 property is one that persists: once a stable property
                 becomes true it remains true thereafter. Examples of
                  stable properties are `computation has terminated',
                  `the system is deadlocked', and `all tokens in a token
                  ring have disappeared'. The stable property detection
                 problem is that of devising algorithms to detect a
                 given stable property. Global state detection can also
                 be used for checkpointing.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Univ of Texas at Austin, Austin, TX, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; computer programming --- Algorithms;
                 computer systems, digital; Distributed; distributed
                 deadlock detection; distributed snapshots; global
                 states",
  subject =      "{\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems.
                 {\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Concurrency. {\bf D.4.1} Software,
                 OPERATING SYSTEMS, Process Management, Deadlocks. {\bf
                 D.4.1} Software, OPERATING SYSTEMS, Process Management,
                 Multiprocessing/multiprogramming/multitasking. {\bf
                 D.4.1} Software, OPERATING SYSTEMS, Process Management,
                 Mutual exclusion. {\bf D.4.1} Software, OPERATING
                 SYSTEMS, Process Management, Scheduling. {\bf D.4.1}
                 Software, OPERATING SYSTEMS, Process Management,
                 Synchronization. {\bf D.4.5} Software, OPERATING
                 SYSTEMS, Reliability, Backup procedures. {\bf D.4.5}
                 Software, OPERATING SYSTEMS, Reliability,
                 Checkpoint/restart. {\bf D.4.5} Software, OPERATING
                 SYSTEMS, Reliability, Fault-tolerance. {\bf D.4.5}
                 Software, OPERATING SYSTEMS, Reliability,
                 Verification.",
}
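
%%% One process's role in the marker algorithm the paper presents, sketched
%%% under assumed FIFO channels; the class name, the MARKER constant, and the
%%% send_markers callback are illustrative, not from the paper:
%%%
%%%     class SnapshotProcess:
%%%         """Chandy--Lamport snapshot participant (FIFO channels assumed)."""
%%%
%%%         def __init__(self, state, in_channel_ids):
%%%             self.state = state
%%%             self.recorded_state = None
%%%             self.channel_msgs = {c: [] for c in in_channel_ids}
%%%             self.recording = {c: False for c in in_channel_ids}
%%%
%%%         def start_snapshot(self, send_markers):
%%%             # Record own state, then send a marker on every outgoing channel.
%%%             self.recorded_state = self.state
%%%             self.recording = {c: True for c in self.recording}
%%%             send_markers()
%%%
%%%         def on_receive(self, channel, msg, send_markers):
%%%             if msg == "MARKER":
%%%                 if self.recorded_state is None:
%%%                     # First marker: record state; the arrival channel's
%%%                     # recorded state stays empty, record on the others.
%%%                     self.recorded_state = self.state
%%%                     self.recording = {c: (c != channel) for c in self.recording}
%%%                     send_markers()
%%%                 else:
%%%                     self.recording[channel] = False   # channel state complete
%%%             elif self.recording[channel]:
%%%                 self.channel_msgs[channel].append(msg)  # in-flight message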

@Article{Cheriton:1985:DPG,
  author =       "David R. Cheriton and Willy Zwaenepoel",
  title =        "Distributed Process Groups in the {V} Kernel",
  journal =      j-TOCS,
  volume =       "3",
  number =       "2",
  pages =        "77--107",
  month =        may,
  year =         "1985",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1985-3-2/p77-cheriton/",
  abstract =     "The V kernel supports an abstraction of processes,
                 with operations for interprocess communication, process
                 management, and memory management. This abstraction is
                 used as a software base for constructing distributed
                 systems. As a distributed kernel, the V kernel makes
                 intermachine boundaries largely transparent. In this
                 environment of many cooperating processes on different
                 machines, there are many logical groups of processes.
                 In this paper we describe the extension of the V kernel
                 to support process groups. Operations on groups include
                 group interprocess communication. Aspects of the
                 implementation and performance, and initial experience
                 with applications are discussed.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Stanford Univ, Stanford, CA, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; computer operating systems; computer
                 systems, digital --- Distributed; design; distributed
                 process groups; measurement; performance; V kernel",
  subject =      "{\bf D.4.4} Software, OPERATING SYSTEMS,
                 Communications Management. {\bf C.2.4} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems. {\bf D.4.7} Software, OPERATING
                 SYSTEMS, Organization and Design.",
}

@Article{Even:1985:PCC,
  author =       "S. Even and O. Goldreich",
  title =        "On the Power of Cascade Ciphers",
  journal =      j-TOCS,
  volume =       "3",
  number =       "2",
  pages =        "108--116",
  month =        may,
  year =         "1985",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1985-3-2/p108-even/",
  abstract =     "The unicity distance of a cascade of random ciphers,
                 with respect to known plaintext attack, is shown to be
                  the sum of the key lengths. A time-space trade-off for
                  the exhaustive cracking of a cascade of ciphers is
                  shown. The structure of the set of permutations
                  realized by a cascade is studied; it is shown that only
                  $l \cdot 2^k$ exhaustive experiments are necessary to
                  determine the behavior of a cascade of $l$ stages, each
                  having $k$ key bits. It is concluded that the cascade of
                  random ciphers is not a random cipher. Yet, it is shown
                  that, with high probability, the number of permutations
                  realizable by a cascade of $l$ random ciphers, each
                  having $k$ key bits, is $2^{lk}$. Next, it is shown that
                 two stages are not worse than one, by a simple
                 reduction of the cracking problem of any of the stages
                 to the cracking problem of the cascade. Finally, it is
                 shown that proving a nonpolynomial lower bound on the
                 cracking problem of long cascades is a hard task, since
                 such a bound implies that P does not equal NP.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Technion-Israel Inst of Technology, Haifa, Isr",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; cascade ciphers; cryptography; data
                 encryption; data processing --- Security of Data;
                 random ciphers; security; theory; unicity distance",
  subject =      "{\bf D.4.6} Software, OPERATING SYSTEMS, Security and
                 Protection. {\bf E.3} Data, DATA ENCRYPTION.",
}
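
%%% A cascade is just the composition of independently keyed stages. The toy
%%% below (a made-up keyed byte substitution, not a real cipher; SBOX,
%%% stage_encrypt, and cascade_encrypt are invented names) only illustrates
%%% the l-stages-times-k-key-bits structure counted in the abstract:
%%%
%%%     import random
%%%
%%%     _rng = random.Random(0)               # fixed public byte permutation
%%%     SBOX = list(range(256))
%%%     _rng.shuffle(SBOX)
%%%
%%%     def stage_encrypt(key, block):
%%%         """One toy stage with a k = 8-bit key: add the key mod 256, then substitute."""
%%%         return bytes(SBOX[(b + key) % 256] for b in block)
%%%
%%%     def cascade_encrypt(keys, block):
%%%         """Apply the stages in order: the cascade of l ciphers."""
%%%         for key in keys:
%%%             block = stage_encrypt(key, block)
%%%         return block
%%%
%%%     # Three stages with independent 8-bit keys: 2**(3*8) cascade keys in all,
%%%     # yet (per the abstract) about l * 2**k experiments characterize the cascade.
%%%     ct = cascade_encrypt([0x3c, 0xa1, 0x07], b"attack at dawn")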

@Article{Padmanabhan:1985:PAR,
  author =       "Krishnan Padmanabhan and Duncan H. Lawrie",
  title =        "Performance Analysis of Redundant-Path Networks for
                 Multiprocessor Systems",
  journal =      j-TOCS,
  volume =       "3",
  number =       "2",
  pages =        "117--144",
  month =        may,
  year =         "1985",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1985-3-2/p117-padmanabhan/",
  abstract =     "Performance of a class of multistage interconnection
                 networks employing redundant paths is investigated.
                 Redundant path networks provide significant tolerance
                 to faults at minimal costs; in this paper improvements
                 in performance and very graceful degradation are also
                 shown to result from the availability of redundant
                 paths. A Markov model is introduced for the operation
                 of these networks in the circuit-switched mode and is
                 solved numerically to obtain the performance measures
                 of interest. The structure of the networks that provide
                 maximal performance is also characterized.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Univ of Illinois at Urbana-Champaign, Urbana,
                 IL, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "computer systems, digital; design; Multiprocessing;
                 multistage interconnection networks; performance;
                 performance analysis; redundant-path networks",
  subject =      "{\bf C.4} Computer Systems Organization, PERFORMANCE
                 OF SYSTEMS, Performance attributes. {\bf C.1.2}
                 Computer Systems Organization, PROCESSOR ARCHITECTURES,
                 Multiple Data Stream Architectures (Multiprocessors),
                 Interconnection architectures. {\bf C.1.2} Computer
                 Systems Organization, PROCESSOR ARCHITECTURES, Multiple
                 Data Stream Architectures (Multiprocessors),
                 Multiple-instruction-stream, multiple-data-stream
                 processors (MIMD). {\bf C.1.2} Computer Systems
                 Organization, PROCESSOR ARCHITECTURES, Multiple Data
                 Stream Architectures (Multiprocessors), Parallel
                 processors**. {\bf C.4} Computer Systems Organization,
                 PERFORMANCE OF SYSTEMS, Design studies. {\bf C.4}
                 Computer Systems Organization, PERFORMANCE OF SYSTEMS,
                 Modeling techniques.",
}

@Article{Maekawa:1985:AME,
  author =       "Mamoru Maekawa",
  title =        "A {$\sqrt{N}$} Algorithm for Mutual Exclusion in
                 Decentralized Systems",
  journal =      j-TOCS,
  volume =       "3",
  number =       "2",
  pages =        "145--159",
  month =        may,
  year =         "1985",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1985-3-2/p145-maekawa/",
  abstract =     "An algorithm is presented that uses only $c \sqrt{N}$
                  messages to create mutual exclusion in a computer
                  network, where $N$ is the number of nodes and $c$ a
                  constant between 3 and 5. The algorithm is symmetric
                 and allows fully parallel operation.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Univ of Tokyo, Dep of Information Science,
                 Tokyo, Jpn",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; computer networks; computer programming
                 --- Algorithms; decentralized systems; design; mutual
                 exclusion; performance",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Mutual exclusion. {\bf C.2.1} Computer
                 Systems Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Network Architecture and Design, Network
                 communications. {\bf C.2.4} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems, Network operating systems.",
}
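
%%% A common simplification of the request-set idea above (a square grid
%%% rather than Maekawa's exact construction; grid_quorums is an invented
%%% name): each node's quorum is its row plus its column, so any two quorums
%%% intersect and each has about $2\sqrt{N}$ members.
%%%
%%%     import math
%%%
%%%     def grid_quorums(n):
%%%         """Grid request sets for n = side*side nodes: row union column."""
%%%         side = math.isqrt(n)
%%%         assert side * side == n, "illustration assumes N is a perfect square"
%%%         quorums = []
%%%         for i in range(n):
%%%             r, c = divmod(i, side)
%%%             row = {r * side + j for j in range(side)}
%%%             col = {j * side + c for j in range(side)}
%%%             quorums.append(row | col)
%%%         return quorums
%%%
%%%     qs = grid_quorums(16)                                      # N = 16 nodes
%%%     assert all(qs[i] & qs[j] for i in range(16) for j in range(16))
%%%     print(len(qs[0]))                                          # 2*4 - 1 = 7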

@Article{Smith:1985:DCM,
  author =       "Alan Jay Smith",
  title =        "Disk Cache --- Miss Ratio Analysis and Design
                 Considerations",
  journal =      j-TOCS,
  volume =       "3",
  number =       "3",
  pages =        "161--203",
  month =        aug,
  year =         "1985",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1985-3-3/p161-smith/",
  abstract =     "The current trend of computer system technology is
                 toward CPUs with rapidly increasing processing power
                 and toward disk drives of rapidly increasing density,
                 but with disk performance increasing very slowly if at
                 all. The implication of these trends is that at some
                 point the processing power of computer systems will be
                 limited by the throughput of the input\slash output
                 (I/O) system. A solution to this problem, which is
                 described and evaluated in this paper, is disk cache.
                 The idea is to buffer recently used portions of the
                 disk address space in electronic storage. Experimental
                 results are based on extensive trace-driven simulations
                 using traces taken from three large IBM or
                 IBM-compatible mainframe data processing installations.
                 We find that disk cache is a powerful means of
                 extending the performance limits of high-end computer
                 systems.",
  acknowledgement = ack-nhfb,
  affiliation =  "Univ of California, Dep of Electrical Engineering \&
                 Computer Sciences, Berkeley, CA, USA",
  affiliationaddress = "Univ of California, Dep of Electrical
                 Engineering \& Computer Sciences, Berkeley, CA, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "cache controller; computer systems, digital; data
                 storage units; design; disk cache; experimentation; I/O
                 buffer; measurement; performance",
  subject =      "{\bf B.3.2} Hardware, MEMORY STRUCTURES, Design
                 Styles, Cache memories. {\bf B.4.2} Hardware,
                 INPUT/OUTPUT AND DATA COMMUNICATIONS, Input/Output
                 Devices, Channels and controllers. {\bf B.3.2}
                 Hardware, MEMORY STRUCTURES, Design Styles, Mass
                 storage. {\bf B.3.3} Hardware, MEMORY STRUCTURES,
                 Performance Analysis and Design Aids**. {\bf D.4.2}
                 Software, OPERATING SYSTEMS, Storage Management.",
}
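
%%% A toy trace-driven disk-cache simulation in the spirit of the study above
%%% (LRU replacement; the block trace, cache size, and function name are
%%% illustrative assumptions, not the paper's traces or parameters):
%%%
%%%     from collections import OrderedDict
%%%
%%%     def disk_cache_miss_ratio(block_trace, cache_blocks):
%%%         """Replay a disk-block reference trace through an LRU cache."""
%%%         cache, misses = OrderedDict(), 0
%%%         for blk in block_trace:
%%%             if blk in cache:
%%%                 cache.move_to_end(blk)          # hit: refresh LRU position
%%%             else:
%%%                 misses += 1
%%%                 cache[blk] = True
%%%                 if len(cache) > cache_blocks:   # evict least recently used
%%%                     cache.popitem(last=False)
%%%         return misses / len(block_trace)
%%%
%%%     trace = [1, 2, 3, 1, 2, 4, 1, 5, 2, 1]
%%%     print(disk_cache_miss_ratio(trace, cache_blocks=3))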

@Article{Strom:1985:ORD,
  author =       "Robert E. Strom and Shaula Yemini",
  title =        "Optimistic Recovery in Distributed Systems",
  journal =      j-TOCS,
  volume =       "3",
  number =       "3",
  pages =        "204--226",
  month =        aug,
  year =         "1985",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1985-3-3/p204-strom/",
  abstract =     "Optimistic Recovery is a new technique supporting
                 application-independent transparent recovery from
                 processor failures in distributed systems. In
                 optimistic recovery communication, computation and
                 checkpointing proceed asynchronously. Synchronization
                 is replaced by causal dependency tracking, which
                 enables a posteriori reconstruction of a consistent
                 distributed system state following a failure using
                 process rollback and message replay. Because there is
                 no synchronization among computation, communication,
                 and checkpointing, optimistic recovery can tolerate the
                 failure of an arbitrary number of processors and yields
                 better throughput and response time than other general
                 recovery techniques whenever failures are infrequent.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "IBM, Thomas J. Watson Research Cent, Yorktown
                 Heights, NY, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; computer operating systems; computer
                 programming --- Algorithms; computer systems, digital;
                 Distributed; optimistic algorithms; optimistic
                 recovery; reliability; verification",
  subject =      "{\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Network operating systems. {\bf D.4.5} Software,
                 OPERATING SYSTEMS, Reliability. {\bf D.4.7} Software,
                 OPERATING SYSTEMS, Organization and Design, Distributed
                 systems. {\bf C.1.2} Computer Systems Organization,
                 PROCESSOR ARCHITECTURES, Multiple Data Stream
                 Architectures (Multiprocessors). {\bf D.1.3} Software,
                 PROGRAMMING TECHNIQUES, Concurrent Programming.",
}

@Article{Tay:1985:EBP,
  author =       "Y. C. Tay and Rajan Suri",
  title =        "Error Bounds for Performance Prediction in Queuing
                 Networks",
  journal =      j-TOCS,
  volume =       "3",
  number =       "3",
  pages =        "227--254",
  month =        aug,
  year =         "1985",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1985-3-3/p227-tay/",
  abstract =     "Analytic models based on closed queuing networks
                 (CQNS) are widely used for performance prediction in
                 practical systems. In using such models, there is
                 always a prediction error, that is, a difference
                 between the predicted performance and actual outcome.
                 This prediction error is due both to modeling errors
                 and estimation errors, the latter being the difference
                 between the estimated values of the CQN parameters and
                 the actual outcomes. This paper considers the second
                 class of errors; in particular, it studies the effect
                 of small estimation errors and provides bounds on
                 prediction errors based on bounds on estimation errors.
                 Estimation errors may be divided into two types: (1)
                 the difference between the estimated value and the
                 average value of the outcome, and (2) the deviation of
                 the actual value from its average. The analysis first
                 studies the sum of both types of errors, then the
                 second type alone. The results are illustrated with
                 three examples.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Natl Univ of Singapore, Dep of Mathematics, Kent
                 Ridge, Singapore",
  classification = "722; 723; 922",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "closed queuing networks; computer systems, digital;
                 error bounds; measurement; performance; Performance;
                 probability --- Queueing Theory; product form networks;
                 queuing networks; verification",
  subject =      "{\bf C.4} Computer Systems Organization, PERFORMANCE
                 OF SYSTEMS, Modeling techniques. {\bf D.4.8} Software,
                 OPERATING SYSTEMS, Performance, Stochastic analysis.
                 {\bf D.4.8} Software, OPERATING SYSTEMS, Performance,
                 Queueing theory. {\bf D.4.8} Software, OPERATING
                 SYSTEMS, Performance, Modeling and prediction.",
}

@Article{Brown:1985:AFS,
  author =       "Mark R. Brown and Karen N. Kolling and Edward A.
                 Taft",
  title =        "The {Alpine} File System",
  journal =      j-TOCS,
  volume =       "3",
  number =       "4",
  pages =        "261--293",
  month =        nov,
  year =         "1985",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1985-3-4/p261-brown/",
  abstract =     "Alpine is a file system that supports atomic
                 transactions and is designed to operate as a service on
                 a computer network. Alpine's primary purpose is to
                 store files that represent databases. An important
                 secondary goal is to store ordinary files representing
                 documents, program modules, and the like. Unlike other
                 file servers described in the literature, Alpine uses a
                 log-based technique to implement atomic file update.
                 Another unusual aspect of Alpine is that it performs
                 all communication via a general-purpose remote
                 procedure call facility. Both of these decisions have
                 worked out well. This paper describes Alpine's design
                 and implementation, and evaluates the system in light
                 of our experience to date. The Cedar language and
                 programming environment is used to develop Alpine.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Xerox Corp, USA",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "alpine; cedar; computer networks; computer programming
                 languages; database systems; design; experimentation;
                 file servers; file system; reliability",
  subject =      "{\bf D.4.0} Software, OPERATING SYSTEMS, General,
                 Alpine. {\bf D.4.5} Software, OPERATING SYSTEMS,
                 Reliability, Checkpoint/restart. {\bf D.4.3} Software,
                 OPERATING SYSTEMS, File Systems Management. {\bf D.3.2}
                 Software, PROGRAMMING LANGUAGES, Language
                 Classifications, Cedar. {\bf D.4.5} Software, OPERATING
                 SYSTEMS, Reliability, Backup procedures. {\bf D.4.7}
                 Software, OPERATING SYSTEMS, Organization and Design,
                 Distributed systems. {\bf H.2.4} Information Systems,
                 DATABASE MANAGEMENT, Systems, Distributed databases.",
}
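
%%% A minimal sketch of log-based atomic file update as described above
%%% (redo logging with a commit record; the log file name and both helper
%%% functions are invented, and Alpine's actual on-disk format is not shown):
%%%
%%%     import json, os
%%%
%%%     LOG = "update.log"
%%%
%%%     def atomic_update(path, new_bytes):
%%%         """Force a redo record to the log, then apply the update in place."""
%%%         record = {"path": path, "data": new_bytes.hex(), "committed": True}
%%%         with open(LOG, "w") as log:
%%%             json.dump(record, log)
%%%             log.flush()
%%%             os.fsync(log.fileno())        # the update is durable once logged
%%%         with open(path, "wb") as f:
%%%             f.write(new_bytes)
%%%         os.remove(LOG)                    # discard the log after applying
%%%
%%%     def recover():
%%%         """After a crash, redo any committed update still in the log."""
%%%         if os.path.exists(LOG):
%%%             with open(LOG) as log:
%%%                 record = json.load(log)
%%%             if record.get("committed"):
%%%                 with open(record["path"], "wb") as f:
%%%                     f.write(bytes.fromhex(record["data"]))
%%%             os.remove(LOG)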

@Article{DeMori:1985:RAB,
  author =       "Renato {De Mori} and R{\'e}gis Cardin",
  title =        "A Recursive Algorithm for Binary Multiplication and
                 its Implementation",
  journal =      j-TOCS,
  volume =       "3",
  number =       "4",
  pages =        "294--314",
  month =        nov,
  year =         "1985",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1985-3-4/p294-de_mori/",
  abstract =     "A new recursive algorithm for deriving the layout of
                 parallel multipliers is presented. Based on this
                 algorithm, a network for performing multiplications of
                 two's complement numbers is proposed. The network can
                 be implemented in a synchronous or an asynchronous way.
                  If the factors to be multiplied have $N$ bits, the area
                  complexity of the network is $O(N^2)$ for practical
                  values of $N$ as in the case of cellular multipliers. Due
                  to the design approach based on a recursive algorithm,
                  a time complexity $O(\log N)$ is achieved. It is shown how
                  the structure can be pipelined with period complexity
                  $O(1)$ and used for single and double precision
                 multiplication.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Concordia Univ, Dep of Computer Science,
                 Montreal, Que, Can",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; Algorithms; binary multiplication;
                 complexity; computer programming; computer systems,
                 digital --- Parallel Processing; design; performance;
                 recursive algorithm",
  subject =      "{\bf B.2.1} Hardware, ARITHMETIC AND LOGIC STRUCTURES,
                 Design Styles, Parallel. {\bf B.2.1} Hardware,
                 ARITHMETIC AND LOGIC STRUCTURES, Design Styles,
                 Pipeline. {\bf C.5.4} Computer Systems Organization,
                 COMPUTER SYSTEM IMPLEMENTATION, VLSI Systems.",
}
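
%%% A software analogue of the recursive decomposition above (illustration
%%% only: unsigned operands rather than two's complement, and the $O(\log N)$
%%% time claimed in the abstract comes from forming the sub-products in
%%% parallel in hardware; recursive_multiply is an invented name):
%%%
%%%     def recursive_multiply(x, y, n):
%%%         """Multiply two n-bit non-negative integers by halving the operands."""
%%%         if n <= 1:
%%%             return x * y
%%%         half = n // 2
%%%         mask = (1 << half) - 1
%%%         xh, xl = x >> half, x & mask
%%%         yh, yl = y >> half, y & mask
%%%         hh = recursive_multiply(xh, yh, n - half)
%%%         hl = recursive_multiply(xh, yl, n - half)
%%%         lh = recursive_multiply(xl, yh, n - half)
%%%         ll = recursive_multiply(xl, yl, half)
%%%         return (hh << (2 * half)) + ((hl + lh) << half) + ll
%%%
%%%     assert recursive_multiply(1234, 5678, 16) == 1234 * 5678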

@Article{Chow:1985:DCM,
  author =       "Ching-Hua Chow and Mohamed G. Gouda and Simon S. Lam",
  title =        "A Discipline for Constructing Multiphase Communication
                 Protocols",
  journal =      j-TOCS,
  volume =       "3",
  number =       "4",
  pages =        "315--343",
  month =        nov,
  year =         "1985",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1985-3-4/p315-chow/",
  abstract =     "Many communication protocols can be observed to go
                 through different phases performing a distinct function
                 in each phase. A multiphase model for such protocols is
                 presented. A phase is formally defined to be a network
                 of communicating finite-state machines with certain
                 desirable correctness properties; these include proper
                 termination and freedom from deadlocks and unspecified
                 receptions. A multifunction protocol is constructed by
                 first constructing separate phases to perform its
                 different functions. It is shown how to connect these
                 phases together to realize the multifunction protocol
                 so that the resulting network of communicating finite
                  state machines is also a phase (i.e., it possesses the
                 desirable properties defined for phases). The
                 modularity inherent in multiphase protocols facilitates
                 not only their construction but also their
                 understanding and modification. An abundance of
                 protocols have been found in the literature that can be
                 constructed as multiphase protocols. Three examples are
                 presented here: two versions of IBM's BSC protocol for
                 data link control and a token ring network protocol.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Univ of Texas at Austin, Dep of Computer
                 Sciences, Austin, TX, USA",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; BSC protocols; computer networks; design;
                 multiphase communication protocols; Protocols; theory;
                 token ring network protocol; verification",
  subject =      "{\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols,
                 Protocol architecture. {\bf B.4.4} Hardware,
                 INPUT/OUTPUT AND DATA COMMUNICATIONS, Performance
                 Analysis and Design Aids**, Formal models**. {\bf
                 B.4.4} Hardware, INPUT/OUTPUT AND DATA COMMUNICATIONS,
                 Performance Analysis and Design Aids**, Verification**.
                 {\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols,
                 Protocol verification. {\bf D.1.3} Software,
                 PROGRAMMING TECHNIQUES, Concurrent Programming. {\bf
                 D.2.2} Software, SOFTWARE ENGINEERING, Design Tools and
                 Techniques, Modules and interfaces. {\bf D.2.2}
                 Software, SOFTWARE ENGINEERING, Design Tools and
                 Techniques, Structured programming**. {\bf D.2.4}
                 Software, SOFTWARE ENGINEERING, Software/Program
                 Verification, Correctness proofs. {\bf D.2.4} Software,
                 SOFTWARE ENGINEERING, Software/Program Verification,
                 Validation.",
}

@Article{Suzuki:1985:DME,
  author =       "Ichiro Suzuki and Tadao Kasami",
  title =        "A Distributed Mutual Exclusion Algorithm",
  journal =      j-TOCS,
  volume =       "3",
  number =       "4",
  pages =        "344--349",
  month =        nov,
  year =         "1985",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1985-3-4/p344-suzuki/",
  abstract =     "A distributed algorithm is presented that realizes
                  mutual exclusion among $N$ nodes in a computer network.
                  The algorithm requires at most $N$ message exchanges for
                  one mutual exclusion invocation. Accordingly, the delay
                  to invoke mutual exclusion is smaller than in an
                  algorithm of Ricart and Agrawala, which requires
                  $2(N-1)$ message exchanges per invocation. A drawback of
                 the algorithm is that the sequence numbers contained in
                 the messages are unbounded. It is shown that this
                 problem can be overcome by slightly increasing the
                 number of message exchanges.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Osaka Univ, Toyonaka, Jpn",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "Algorithms; algorithms; computer networks; computer
                 programming; distributed mutual exclusion algorithm;
                 message exchange; process management",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Mutual exclusion. {\bf C.2.4} Computer
                 Systems Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems.",
}
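
%%% The token-based scheme above, sketched as it is usually presented
%%% (request numbers RN per node, a token carrying last-served numbers LN and
%%% a FIFO queue); the Node class and the send callback are assumptions, and
%%% message transport is not modeled:
%%%
%%%     class Node:
%%%         """Suzuki--Kasami style token-based mutual exclusion."""
%%%
%%%         def __init__(self, node_id, n, send):
%%%             self.id, self.n, self.send = node_id, n, send   # send(dest, msg)
%%%             self.RN = [0] * n          # highest request number seen per node
%%%             self.token = None          # {"LN": [...], "queue": [...]} when held
%%%             self.in_cs = False
%%%
%%%         def request_cs(self):
%%%             self.RN[self.id] += 1
%%%             if self.token is None:     # broadcast REQUEST(id, sequence number)
%%%                 for j in range(self.n):
%%%                     if j != self.id:
%%%                         self.send(j, ("REQUEST", self.id, self.RN[self.id]))
%%%             else:
%%%                 self.in_cs = True      # already holding the token
%%%
%%%         def on_request(self, sender, seq):
%%%             self.RN[sender] = max(self.RN[sender], seq)
%%%             if self.token and not self.in_cs and \
%%%                     self.RN[sender] == self.token["LN"][sender] + 1:
%%%                 token, self.token = self.token, None
%%%                 self.send(sender, ("TOKEN", token))
%%%
%%%         def on_token(self, token):
%%%             self.token, self.in_cs = token, True
%%%
%%%         def release_cs(self):
%%%             self.in_cs = False
%%%             self.token["LN"][self.id] = self.RN[self.id]
%%%             for j in range(self.n):    # enqueue nodes with outstanding requests
%%%                 if j not in self.token["queue"] and \
%%%                         self.RN[j] == self.token["LN"][j] + 1:
%%%                     self.token["queue"].append(j)
%%%             if self.token["queue"]:
%%%                 nxt = self.token["queue"].pop(0)
%%%                 token, self.token = self.token, None
%%%                 self.send(nxt, ("TOKEN", token))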

@Article{Smith:1986:IGP,
  author =       "Connie U. Smith",
  title =        "Independent General Principles for Constructing
                 Responsive Software Systems",
  journal =      j-TOCS,
  volume =       "4",
  number =       "1",
  pages =        "1--31",
  month =        feb,
  year =         "1986",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1986-4-1/p1-smith/",
  abstract =     "Three general principles are presented that can be
                 applied in early software life cycle stages for the
                 definition of software requirements and designs with
                 acceptable performance. They are genuine high-level
                 considerations for meeting responsiveness goals without
                 sacrificing understandability and maintainability, and
                 without increasing development time and cost. The
                 principles are derived from the interrelationships of
                  two performance models: a queueing-network-based
                  computer system model and an execution graph software
                 model. The performance effect of each of the principles
                 is quantified using the models. Examples are given that
                 illustrate how they can be applied to software
                 systems.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Duke Univ, Dep of Computer Science, Durham, NC,
                 USA",
  classification = "722; 723; 921",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "computer software; computer systems, digital ---
                 Performance; design; mathematical models; Performance;
                 performance",
  subject =      "{\bf C.4} Computer Systems Organization, PERFORMANCE
                 OF SYSTEMS, Design studies. {\bf D.2.10} Software,
                 SOFTWARE ENGINEERING, Design**. {\bf C.4} Computer
                 Systems Organization, PERFORMANCE OF SYSTEMS,
                 Performance attributes. {\bf D.0} Software, GENERAL.
                 {\bf D.2.9} Software, SOFTWARE ENGINEERING, Management,
                 Life cycle. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance, Modeling and prediction. {\bf D.4.8}
                 Software, OPERATING SYSTEMS, Performance, Operational
                 analysis.",
}

@Article{Herlihy:1986:QCR,
  author =       "Maurice Herlihy",
  title =        "A Quorum-Consensus Replication Method for Abstract
                 Data Types",
  journal =      j-TOCS,
  volume =       "4",
  number =       "1",
  pages =        "32--53",
  month =        feb,
  year =         "1986",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1986-4-1/p32-herlihy/",
  abstract =     "Replication can enhance the availability of data in
                 distributed systems. This paper introduces a new method
                 for managing replicated data. Unlike many methods that
                 support replication only for uninterpreted files, this
                 method systematically exploits type-specific properties
                 of objects such as sets, queues, or directories to
                 provide more effective replication. Each operation
                 requires the cooperation of a certain number of sites
                 for its successful completion. A quorum for an
                 operation is any such set of sites. Necessary and
                 sufficient constraints on quorum intersections are
                 derived from an analysis of the data type's algebraic
                 structure. A reconfiguration method is proposed that
                 permits quorums to be changed dynamically. By taking
                 advantage of type-specific properties in a general and
                 systematic way, this method can realize a wider range
                 of availability properties and more flexible
                 reconfiguration than comparable replication methods.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Carnegie-Mellon Univ, Computer Science Dep,
                 Pittsburgh, PA, USA",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "abstract data types; algorithms; computer operating
                 systems; computer programming languages; database
                 systems; reliability; replication method;
                 verification",
  subject =      "{\bf D.1.3} Software, PROGRAMMING TECHNIQUES,
                 Concurrent Programming. {\bf D.3.3} Software,
                 PROGRAMMING LANGUAGES, Language Constructs and
                 Features, Abstract data types. {\bf D.4.3} Software,
                 OPERATING SYSTEMS, File Systems Management, Distributed
                 file systems. {\bf D.4.5} Software, OPERATING SYSTEMS,
                 Reliability, Fault-tolerance. {\bf H.2.4} Information
                 Systems, DATABASE MANAGEMENT, Systems, Distributed
                 databases. {\bf H.2.4} Information Systems, DATABASE
                 MANAGEMENT, Systems, Transaction processing.",
}
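
%%% A simplified, voting-style quorum check (Herlihy derives type-specific
%%% constraints; this only shows the generic read/write intersection rule,
%%% and quorums_intersect is an invented name): with n replicas, every read
%%% quorum of size r meets every write quorum of size w whenever r + w > n.
%%%
%%%     from itertools import combinations
%%%
%%%     def quorums_intersect(n, r, w):
%%%         """Exhaustively verify that all r-site and w-site quorums intersect."""
%%%         sites = range(n)
%%%         return all(set(rq) & set(wq)
%%%                    for rq in combinations(sites, r)
%%%                    for wq in combinations(sites, w))
%%%
%%%     print(quorums_intersect(5, 2, 4))   # True:  2 + 4 > 5
%%%     print(quorums_intersect(5, 2, 3))   # False: 2 + 3 = 5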

@Article{Joseph:1986:LCM,
  author =       "Thomas A. Joseph and Kenneth P. Birman",
  title =        "Low Cost Management of Replicated Data in
                 Fault-Tolerant Distributed Systems",
  journal =      j-TOCS,
  volume =       "4",
  number =       "1",
  pages =        "54--70",
  month =        feb,
  year =         "1986",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1986-4-1/p54-joseph/",
  abstract =     "Many distributed systems replicate data for fault
                 tolerance or availability. In such systems, a logical
                 update on a data item results in a physical update on a
                 number of copies. The synchronization and communication
                 required to keep the copies of replicated data
                 consistent introduce a delay when operations are
                 performed. In this paper, we describe a technique that
                 relaxes the usual degree of synchronization, permitting
                 replicated data items to be updated concurrently with
                 other operations, while at the same time ensuring that
                 correctness is not violated. The additional concurrency
                 thus obtained results in better response time when
                 performing operations on replicated data. We also
                 discuss how this technique performs in conjunction with
                 a roll-back and a roll-forward failure recovery
                 mechanism.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Cornell Univ, Dep of Computer Science, Ithaca,
                 NY, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; computer systems, digital --- Distributed;
                 database systems; fault-tolerant distributed systems;
                 reliability; replicated data; roll-forward recovery;
                 update",
  subject =      "{\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Distributed databases. {\bf D.4.1} Software, OPERATING
                 SYSTEMS, Process Management, Concurrency. {\bf D.4.1}
                 Software, OPERATING SYSTEMS, Process Management,
                 Synchronization. {\bf D.4.5} Software, OPERATING
                 SYSTEMS, Reliability, Checkpoint/restart. {\bf D.4.5}
                 Software, OPERATING SYSTEMS, Reliability,
                 Fault-tolerance. {\bf H.2.2} Information Systems,
                 DATABASE MANAGEMENT, Physical Design, Recovery and
                 restart. {\bf H.2.4} Information Systems, DATABASE
                 MANAGEMENT, Systems, Transaction processing.",
}

@Article{Kameda:1986:EJL,
  author =       "Hisao Kameda",
  title =        "Effects of Job Loading Policies for Multiprogramming
                 Systems in Processing a Job Stream",
  journal =      j-TOCS,
  volume =       "4",
  number =       "1",
  pages =        "71--106",
  month =        feb,
  year =         "1986",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1986-4-1/p71-kameda/",
  abstract =     "The scheduling of jobs for multiprogramming systems
                 includes the selection of jobs to be loaded into memory
                 (job loading policy or memory schedule) and the
                 scheduling for CPU processing (CPU schedule). There has
                 been a successful empirical claim for the optimal CPU
                 schedule; its optimality has been proved in a Markovian
                 model of job-stream processing that uses the
                 first-come-first-loaded (FCFL) job loading policy. We
                 extend this model to gain insight into the effects of
                 job loading policies. Our investigation, supported by
                 numerical calculations, suggests that much more care
                 may be needed in implementing the job loading policy
                 that aims at the optimal processing capacity than in
                 implementing the optimal CPU schedule. This agrees with
                 what has been conjectured on the basis of empirical
                 studies.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Univ of Electro-Communications, Dep of Computer
                 Science, Chofu, Jpn",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "computer operating systems; computer systems
                 programming; finite memory size model; job loading
                 policies; multiple-resource system; performance;
                 theory; throughput",
  subject =      "{\bf C.4} Computer Systems Organization, PERFORMANCE
                 OF SYSTEMS, Modeling techniques. {\bf D.4.1} Software,
                 OPERATING SYSTEMS, Process Management, Scheduling. {\bf
                 D.4.2} Software, OPERATING SYSTEMS, Storage Management,
                 Allocation/deallocation strategies. {\bf D.4.8}
                 Software, OPERATING SYSTEMS, Performance, Modeling and
                 prediction. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance, Queueing theory. {\bf D.4.8} Software,
                 OPERATING SYSTEMS, Performance, Stochastic analysis.
                 {\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management,
                 Multiprocessing/multiprogramming/multitasking.",
}

@Article{Carriero:1986:NLK,
  author =       "Nicholas Carriero and David Gelernter",
  title =        "The {S/Net}'s {Linda} kernel",
  journal =      j-TOCS,
  volume =       "4",
  number =       "2",
  pages =        "110--129",
  month =        may,
  year =         "1986",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1986-4-2/p110-carriero/",
  abstract =     "Linda is a parallel programming language that differs
                 from other parallel languages in its simplicity and in
                 its support for distributed data structures. The S/Net
                 is a multicomputer, designed and built at AT\&T Bell
                 Laboratories, that is based on a fast, word-parallel
                 bus interconnect. We describe the Linda-supporting
                 communication kernel we have implemented on the S/Net.
                 The implementation suggests that Linda's unusual
                  shared-memory-like communication primitives can be made
                 to run well in the absence of physically shared memory;
                 the simplicity of the language and of our
                 implementation's logical structure suggest that similar
                 Linda implementations might readily be constructed on
                 related architectures. We outline the language, and
                 programming methodologies based on distributed data
                 structures; we then describe the implementation, and
                 the performance both of the Linda primitives themselves
                 and of a simple S/Net-Linda matrix-multiplication
                 program designed to exercise them.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Yale Univ, New Haven, CT, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "communication kernel; computer programming languages;
                 computer systems, digital --- Parallel Processing; data
                 processing --- Data Structures; design; languages;
                 Linda parallel programming languages; S/Net",
  subject =      "{\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language
                 Constructs and Features, Concurrent programming
                 structures. {\bf C.2.1} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Architecture
                 and Design, Network communications. {\bf C.2.4}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Distributed Systems, Network operating
                 systems. {\bf D.4.4} Software, OPERATING SYSTEMS,
                 Communications Management, Message sending.",
}
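
%%% A single-process toy of the tuple-space primitives the kernel above
%%% supports (out, rd, in, with None standing in for a Linda formal); the
%%% TupleSpace class is an invented stand-in, with no distribution or
%%% blocking as on the S/Net:
%%%
%%%     class TupleSpace:
%%%         """Toy, single-process Linda-style tuple space."""
%%%
%%%         def __init__(self):
%%%             self.tuples = []
%%%
%%%         def out(self, *tup):              # deposit a tuple
%%%             self.tuples.append(tup)
%%%
%%%         def _match(self, pattern, tup):
%%%             return len(pattern) == len(tup) and all(
%%%                 p is None or p == v for p, v in zip(pattern, tup))
%%%
%%%         def rd(self, *pattern):           # read a matching tuple, leave it
%%%             return next(t for t in self.tuples if self._match(pattern, t))
%%%
%%%         def in_(self, *pattern):          # withdraw a matching tuple
%%%             t = self.rd(*pattern)
%%%             self.tuples.remove(t)
%%%             return t
%%%
%%%     ts = TupleSpace()
%%%     ts.out("task", 7)
%%%     print(ts.in_("task", None))           # ('task', 7)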

@Article{Kronenberg:1986:VCC,
  author =       "Nancy P. Kronenberg and Henry M. Levy and William D.
                 Strecker",
  title =        "{VAXclusters}: a Closely-Coupled Distributed System",
  journal =      j-TOCS,
  volume =       "4",
  number =       "2",
  pages =        "130--146",
  month =        may,
  year =         "1986",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1986-4-2/p130-kronenberg/",
  abstract =     "A VAXcluster is a highly available and extensible
                 configuration of VAX computers that operate as a single
                 system. To achieve performance in a multicomputer
                 environment, a new communications architecture,
                 communications hardware, and distributed software were
                 jointly designed. The software is a distributed version
                 of the VAX\slash VMS operating system that uses a
                 distributed lock manager to synchronize access to
                 shared resources. The communications hardware includes
                 a 70 megabit per second message-oriented interconnect
                 and an interconnect port that performs communications
                 tasks traditionally handled by software. Performance
                 measurements show this structure to be highly
                 efficient, for example, capable of sending and
                 receiving 3000 messages per second on a VAX-11\slash
                 780.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Digital Equipment Corp, Littleton, MA, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "computer software; computer systems, digital; design;
                 Distributed; intersystem communication protocols;
                 network protocols; performance; reliability;
                 VAXclusters",
  subject =      "{\bf D.4.4} Software, OPERATING SYSTEMS,
                 Communications Management. {\bf C.2.5} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS, Local
                 and Wide-Area Networks, Buses. {\bf D.4.3} Software,
                 OPERATING SYSTEMS, File Systems Management, Distributed
                 file systems. {\bf D.4.5} Software, OPERATING SYSTEMS,
                 Reliability, Fault-tolerance. {\bf D.4.8} Software,
                 OPERATING SYSTEMS, Performance.",
}

@Article{Fitzgerald:1986:IVM,
  author =       "Robert Fitzgerald and Richard F. Rashid",
  title =        "The Integration of Virtual Memory Management and
                 Interprocess Communication in {Accent}",
  journal =      j-TOCS,
  volume =       "4",
  number =       "2",
  pages =        "147--177",
  month =        may,
  year =         "1986",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1986-4-2/p147-fitzgerald/",
  abstract =     "The integration of virtual memory management and
                 interprocess communication in the Accent network
                 operating system kernel is examined. The design and
                 implementation of the Accent memory management system
                 is discussed and its performance, both on a series of
                 message-oriented benchmarks and in normal operation,
                 is analyzed in detail.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Carnegie-Mellon Univ, Pittsburgh, PA, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "accent kernel; algorithms; computer operating systems;
                 data transmission; design; interprocess communication;
                 measurement; performance; Storage Allocation; virtual
                 memory management",
  subject =      "{\bf D.4.2} Software, OPERATING SYSTEMS, Storage
                 Management, Virtual memory. {\bf D.4.4} Software,
                 OPERATING SYSTEMS, Communications Management, Message
                 sending. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance, Measurements. {\bf D.4.8} Software,
                 OPERATING SYSTEMS, Performance, Operational analysis.
                 {\bf D.4.7} Software, OPERATING SYSTEMS, Organization
                 and Design, Distributed systems. {\bf B.1.5} Hardware,
                 CONTROL STRUCTURES AND MICROPROGRAMMING, Microcode
                 Applications, Firmware support of operating
                 systems/instruction sets**.",
}

@Article{Hoyme:1986:TSM,
  author =       "K. P. Hoyme and S. C. Bruell and P. V. Afshari and R.
                 Y. Kain",
  title =        "A Tree-Structured Mean Value Analysis Algorithm",
  journal =      j-TOCS,
  volume =       "4",
  number =       "2",
  pages =        "178--185",
  month =        may,
  year =         "1986",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1986-4-2/p178-hoyme/",
  abstract =     "In a recent paper, S. S. Lam and Y. L. Lien described
                 an algorithm called tree-convolution that can reduce
                 the space and computation time required for evaluating
                 sparse multiclass, product-form queueing networks. In
                 this paper, we develop an exact algorithm based on mean
                 value analysis (MVA) that is the counterpart of the
                 tree-convolution algorithm. The order of reduction in
                 storage and computation achieved by our new Tree-MVA
                 algorithm compared to the standard MVA algorithm is the
                 same order of reduction obtained by the tree-convolution
                 algorithm over that of the standard convolution
                 algorithm. Our Tree-MVA algorithm preserves the
                 inherent simplicity of MVA-based algorithms.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Honeywell Systems \& Research Cent, Minneapolis,
                 MN, USA",
  classification = "723; 921",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "Algorithms; algorithms; computer programming; design;
                 mathematical techniques --- Trees; mean value analysis
                 algorithm; performance; tree-structured algorithm",
  subject =      "{\bf D.4.8} Software, OPERATING SYSTEMS, Performance,
                 Operational analysis. {\bf C.2.1} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS, Network
                 Architecture and Design. {\bf C.4} Computer Systems
                 Organization, PERFORMANCE OF SYSTEMS, Modeling
                 techniques. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance, Modeling and prediction. {\bf D.4.8}
                 Software, OPERATING SYSTEMS, Performance, Stochastic
                 analysis.",
}
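
%%% Note: the abstract above compares Tree-MVA against ``the standard MVA
%%% algorithm''.  For reference only (standard background, not taken from
%%% the entry), the single-class exact MVA recursion for a closed
%%% product-form network with $N$ customers, think time $Z$, and service
%%% demand $D_k$ at queueing center $k$ is
%%%
%%%     \begin{align*}
%%%       Q_k(0) &= 0,\\
%%%       R_k(n) &= D_k \bigl(1 + Q_k(n-1)\bigr),\\
%%%       X(n)   &= \frac{n}{Z + \sum_k R_k(n)},\\
%%%       Q_k(n) &= X(n)\,R_k(n), \qquad n = 1, \dots, N.
%%%     \end{align*}
%%%
%%% Tree-structured variants such as the one described above exploit
%%% sparsity in multiclass networks to reduce the storage and time of the
%%% corresponding per-population recursion.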

@Article{Barbara:1986:VVA,
  author =       "Daniel Barbara and H{\'e}ctor Garc{\'\i}a-Molina",
  title =        "The Vulnerability of Vote Assignments",
  journal =      j-TOCS,
  volume =       "4",
  number =       "3",
  pages =        "187--213",
  month =        aug,
  year =         "1986",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1986-4-3/p187-barbara/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; reliability",
  subject =      "{\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Network operating systems. {\bf C.2.4} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems, Distributed applications. {\bf
                 C.4} Computer Systems Organization, PERFORMANCE OF
                 SYSTEMS, Reliability, availability, and serviceability.
                 {\bf D.4.5} Software, OPERATING SYSTEMS, Reliability,
                 Fault-tolerance. {\bf B.1.3} Hardware, CONTROL
                 STRUCTURES AND MICROPROGRAMMING, Control Structure
                 Reliability, Testing, and Fault-Tolerance**,
                 Error-checking**.",
}

@Article{Iyer:1986:MMC,
  author =       "R. K. Iyer and D. J. Rossetti and M. C. Hsueh",
  title =        "Measurement and Modeling of Computer Reliability as
                 Affected by System Activity",
  journal =      j-TOCS,
  volume =       "4",
  number =       "3",
  pages =        "214--237",
  month =        aug,
  year =         "1986",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1986-4-3/p214-iyer/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "measurement; performance; reliability",
  subject =      "{\bf C.4} Computer Systems Organization, PERFORMANCE
                 OF SYSTEMS, Measurement techniques. {\bf C.4} Computer
                 Systems Organization, PERFORMANCE OF SYSTEMS,
                 Reliability, availability, and serviceability.",
}

@Article{Lazowska:1986:FAP,
  author =       "Edward D. Lazowska and John Zahorjan and David R.
                 Cheriton and Willy Zwaenepoel",
  title =        "File Access Performance of Diskless Workstations",
  journal =      j-TOCS,
  volume =       "4",
  number =       "3",
  pages =        "238--268",
  month =        aug,
  year =         "1986",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1986-4-3/p238-lazowska/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; measurement; performance",
  subject =      "{\bf C.4} Computer Systems Organization, PERFORMANCE
                 OF SYSTEMS, Design studies. {\bf D.4.7} Software,
                 OPERATING SYSTEMS, Organization and Design, Distributed
                 systems. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance, Measurements. {\bf D.4.8} Software,
                 OPERATING SYSTEMS, Performance, Modeling and
                 prediction. {\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Network operating systems.",
}

@Article{Archibald:1986:CCP,
  author =       "James Archibald and Jean-Loup Baer",
  title =        "Cache Coherence Protocols: Evaluation Using a
                 Multiprocessor Simulation Model",
  journal =      j-TOCS,
  volume =       "4",
  number =       "4",
  pages =        "273--298",
  month =        nov,
  year =         "1986",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1986-4-4/p273-archibald/",
  abstract =     "Using simulation, we examine the efficiency of several
                 distributed, hardware-based solutions to the cache
                 coherence problem in shared-bus multiprocessors. For
                 each of the approaches, the associated protocol is
                 outlined. The simulation model is described, and
                 results from that model are presented. The magnitude of
                 the potential performance difference between the
                 various approaches indicates that the choice of
                 coherence solution is very important in the design of
                 an efficient shared-bus multiprocessor, since it may
                 limit the number of processors in the system.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Univ of Washington, Seattle, WA, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "cache coherence protocols; computer simulation;
                 computer systems, digital; design; measurement;
                 Multiprocessing; performance; shared-bus
                 multiprocessor",
  subject =      "{\bf B.3.2} Hardware, MEMORY STRUCTURES, Design
                 Styles, Cache memories. {\bf C.1.2} Computer Systems
                 Organization, PROCESSOR ARCHITECTURES, Multiple Data
                 Stream Architectures (Multiprocessors),
                 Multiple-instruction-stream, multiple-data-stream
                 processors (MIMD). {\bf C.4} Computer Systems
                 Organization, PERFORMANCE OF SYSTEMS, Measurement
                 techniques. {\bf C.4} Computer Systems Organization,
                 PERFORMANCE OF SYSTEMS, Modeling techniques. {\bf
                 D.4.2} Software, OPERATING SYSTEMS, Storage Management,
                 Distributed memories.",
}

@Article{Comer:1986:CBM,
  author =       "Douglas E. Comer and Larry L. Peterson",
  title =        "Conversation-Based Mail",
  journal =      j-TOCS,
  volume =       "4",
  number =       "4",
  pages =        "299--319",
  month =        nov,
  year =         "1986",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1986-4-4/p299-comer/",
  abstract =     "A new message communication paradigm based on
                 conversations that provides an alternative to memo- and
                 conference-based mail is described. A
                 conversation-based message system groups messages into
                 conversations, and orders messages within a
                 conversation according to the context in which they
                 were written. The message context relation leads to an
                 efficient implementation of conversations in a
                 distributed environment and supports a natural ordering
                 of messages when viewed by the user. Experience with a
                 prototype demonstrates the workability of
                 conversation-based mail and suggests that conversations
                 provide a powerful tool for message communication.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Purdue Univ, West Lafayette, IN, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "computer systems, digital; conversation-based mail;
                 design; Distributed; electronic mail; human factors;
                 management; message systems",
  subject =      "{\bf H.4.3} Information Systems, INFORMATION SYSTEMS
                 APPLICATIONS, Communications Applications, Electronic
                 mail. {\bf H.4.3} Information Systems, INFORMATION
                 SYSTEMS APPLICATIONS, Communications Applications,
                 Computer conferencing, teleconferencing, and
                 videoconferencing.",
}

@Article{Badal:1986:DDD,
  author =       "D. Z. Badal",
  title =        "The Distributed Deadlock Detection Algorithm",
  journal =      j-TOCS,
  volume =       "4",
  number =       "4",
  pages =        "320--337",
  month =        nov,
  year =         "1986",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1986-4-4/p320-badal/",
  abstract =     "We propose a distributed deadlock detection algorithm
                 for distributed computer systems. We consider two types
                 of resources, depending on whether the remote resource
                 lock granularity and mode can or cannot be determined
                 without access to the remote resource site. We present
                 the algorithm, its performance analysis, and an
                 informal argument about its correctness. The proposed
                 algorithm has a hierarchical design intended to detect
                 the most frequent deadlocks with maximum efficiency.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Hewlett--Packard Lab, Palo Alto, CA, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; computer programming --- Algorithms;
                 computer systems, digital; deadlock detection; design;
                 Distributed; distributed algorithms; message
                 communication systems; performance",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Deadlocks. {\bf H.2.2} Information Systems,
                 DATABASE MANAGEMENT, Physical Design, Deadlock
                 avoidance. {\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Network operating systems.",
}

@Article{Carey:1986:PMC,
  author =       "Michael J. Carey and Waleed A. Muhanna",
  title =        "The Performance of Multiversion Concurrency Control
                 Algorithms",
  journal =      j-TOCS,
  volume =       "4",
  number =       "4",
  pages =        "338--378",
  month =        nov,
  year =         "1986",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1986-4-4/p338-carey/",
  abstract =     "This paper describes a simulation study of the
                 performance of several multiversion concurrency control
                 algorithms, investigating the extent to which they
                 provide increases in the level of concurrency and also
                 the CPU, I/O, and storage costs resulting from the use
                 of multiple versions. The algorithms are compared with
                 regard to performance with their single-version
                 counterparts and with each other. It is shown that each
                 algorithm offers significant performance improvements
                 despite the additional disk accesses involved in
                 accessing old versions of data; the nature of the
                 improvement depends on the algorithm in question. It is
                 also shown that the storage overhead for maintaining
                 old versions that may be required by ongoing
                 transactions is not all that large under most
                 circumstances.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Univ of Wisconsin, Madison, WI, USA",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; computer programming --- Algorithms;
                 database systems; deadlock avoidance; design;
                 Distributed; experimentation; multiversion concurrency
                 control algorithms; performance; transaction
                 processing",
  subject =      "{\bf D.4.8} Software, OPERATING SYSTEMS, Performance,
                 Simulation. {\bf H.2.2} Information Systems, DATABASE
                 MANAGEMENT, Physical Design, Deadlock avoidance. {\bf
                 H.2.2} Information Systems, DATABASE MANAGEMENT,
                 Physical Design, Recovery and restart. {\bf H.2.4}
                 Information Systems, DATABASE MANAGEMENT, Systems,
                 Transaction processing.",
}

@Article{Lamport:1987:FME,
  author =       "Leslie Lamport",
  title =        "A Fast Mutual Exclusion Algorithm",
  journal =      j-TOCS,
  volume =       "5",
  number =       "1",
  pages =        "1--11",
  month =        feb,
  year =         "1987",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1987-5-1/p1-lamport/",
  abstract =     "A new solution to the mutual exclusion problem is
                 presented that, in the absence of contention, requires
                 only seven memory accesses. It assumes atomic reads and
                 atomic writes to shared registers.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Digital Equipment Corp, Palo Alto, CA, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; computer programming --- Algorithms;
                 computer systems, digital; memory accesses;
                 Multiprocessing; mutual exclusion algorithm",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Mutual exclusion. {\bf D.4.1} Software,
                 OPERATING SYSTEMS, Process Management, Deadlocks. {\bf
                 B.3.2} Hardware, MEMORY STRUCTURES, Design Styles,
                 Shared memory.",
}
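
%%% Note: the abstract above states only that the algorithm needs seven
%%% shared-memory accesses in the contention-free case, assuming atomic
%%% reads and writes of shared registers.  As a reading aid, the sketch
%%% below shows the fast-path mutual exclusion algorithm as it is usually
%%% presented (illustrative Python; process ids are 1..N, 0 means free;
%%% busy-waiting and the lack of true atomicity or memory ordering make
%%% this a sketch, not production code).
%%%
%%%     N = 4                    # number of processes (illustrative)
%%%     x = 0                    # shared registers
%%%     y = 0
%%%     b = [False] * (N + 1)    # per-process announcement flags
%%%
%%%     def lock(i):
%%%         global x, y
%%%         while True:
%%%             b[i] = True
%%%             x = i
%%%             if y != 0:                    # lock may be held; retry
%%%                 b[i] = False
%%%                 while y != 0:
%%%                     pass
%%%                 continue
%%%             y = i
%%%             if x != i:                    # contention detected
%%%                 b[i] = False
%%%                 for j in range(1, N + 1):
%%%                     while b[j]:
%%%                         pass
%%%                 if y != i:
%%%                     while y != 0:
%%%                         pass
%%%                     continue
%%%             return                        # fast path; with unlock,
%%%                                           # seven shared accesses
%%%     def unlock(i):
%%%         global y
%%%         y = 0
%%%         b[i] = False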

@Article{Cheriton:1987:UUS,
  author =       "David R. Cheriton",
  title =        "{UIO}: a {Uniform I/O} System Interface for
                 Distributed Systems",
  journal =      j-TOCS,
  volume =       "5",
  number =       "1",
  pages =        "12--46",
  month =        feb,
  year =         "1987",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1987-5-1/p12-cheriton/",
  abstract =     "The UIO (uniform I/O) system interface that has been
                 used for the past five years in the V distributed
                 operating system is described, with the focus on the
                 key design issues. This interface provides several
                 extensions beyond the I/O interface of UNIX, including
                 support for record I/O, locking, atomic transactions,
                 and replication, as well as attributes that indicate
                 whether optional semantics and operations are
                 available. Experience in using and implementing this
                 interface with a variety of different I/O services is
                 described, along with the performance of both local and
                 network I/O. It is concluded that the UIO interface
                 provides a uniform I/O system interface with
                 significant functionality, wide applicability, and no
                 significant performance penalty.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Stanford Univ, Stanford, CA, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "computer interfaces; computer operating systems;
                 computer systems, digital --- Distributed; design;
                 experimentation; files input/output; interprocess
                 communication; performance; remote procedure call;
                 standardization; uniform I/O interface",
  subject =      "{\bf D.4.4} Software, OPERATING SYSTEMS,
                 Communications Management, Input/output. {\bf C.2.4}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Distributed Systems, Network operating
                 systems. {\bf D.4.7} Software, OPERATING SYSTEMS,
                 Organization and Design, Distributed systems. {\bf
                 C.2.0} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, General, Security and
                 protection (e.g., firewalls).",
}

@Article{Birman:1987:RCP,
  author =       "Kenneth P. Birman and Thomas A. Joseph",
  title =        "Reliable Communication in the Presence of Failures",
  journal =      j-TOCS,
  volume =       "5",
  number =       "1",
  pages =        "47--76",
  month =        feb,
  year =         "1987",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1987-5-1/p47-birman/",
  abstract =     "The design and correctness of a communication facility
                 for a distributed computer system are reported on. The
                 facility provides support for fault-tolerant process
                 groups in the form of a family of reliable multicast
                 protocols that can be used in both local- and wide-area
                 networks. These protocols attain high levels of
                 concurrency, while respecting application-specific
                 delivery ordering constraints, and have varying cost
                 and performance that depend on the degree of ordering
                 desired. In particular, a protocol that enforces causal
                 delivery orderings is introduced and shown to be a
                 valuable alternative to conventional asynchronous
                 communication protocols. The facility also ensures that
                 the processes belonging to a fault-tolerant process
                 group will observe consistent orderings of events
                 affecting the group as a whole.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Cornell Univ, Ithaca, NY, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "computer networks --- Protocols; computer systems,
                 digital; Distributed; fault tolerance; multicast
                 protocols; performance; reliability",
  subject =      "{\bf D.4.5} Software, OPERATING SYSTEMS, Reliability,
                 Fault-tolerance. {\bf C.2.4} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems, Distributed applications. {\bf
                 C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Distributed databases. {\bf D.4.1} Software, OPERATING
                 SYSTEMS, Process Management, Concurrency. {\bf D.4.1}
                 Software, OPERATING SYSTEMS, Process Management,
                 Synchronization. {\bf H.2.4} Information Systems,
                 DATABASE MANAGEMENT, Systems, Distributed databases.
                 {\bf H.2.2} Information Systems, DATABASE MANAGEMENT,
                 Physical Design, Recovery and restart. {\bf H.2.4}
                 Information Systems, DATABASE MANAGEMENT, Systems,
                 Concurrency. {\bf D.4.4} Software, OPERATING SYSTEMS,
                 Communications Management, Network communication. {\bf
                 C.2.1} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Architecture
                 and Design, Network communications.",
}

@Article{Geist:1987:CDS,
  author =       "Robert Geist and Stephen Daniel",
  title =        "A Continuum of Disk Scheduling Algorithms",
  journal =      j-TOCS,
  volume =       "5",
  number =       "1",
  pages =        "77--92",
  month =        feb,
  year =         "1987",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1987-5-1/p77-geist/",
  abstract =     "A continuum of disk scheduling algorithms, V(R),
                 having endpoints V(0) equals SSTF and V(1) equals SCAN,
                 is defined. V(R) maintains a current SCAN direction (in
                 or out) and services next the request with the smallest
                 effective distance. The effective distance of a request
                 that lies in the current direction is its physical
                 distance (in cylinders) from the read\slash write head.
                 The effective distance of a request in the opposite
                 direction is its physical distance plus R multiplied by
                 (total number of cylinders on the disk). By use of
                 simulation methods, it is shown that this definitional
                 continuum also provides a continuum in performance,
                 both with respect to the mean and with respect to the
                 standard deviation of request waiting time. For
                 objective functions that are linear combinations of the
                 two measures, $\mu_w + k \sigma_w$, intermediate
                 points of the continuum are seen to provide performance
                 uniformly superior to both SSTF and SCAN. A method of
                 implementing V(R) and the results of its experimental
                 use in a real system are presented.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Clemson Univ, Clemson, SC, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; computer operating systems; computer
                 programming --- Algorithms; computer simulation;
                 computer systems, digital; disk scheduling algorithms;
                 measurement; moving-head disk; performance;
                 Scheduling",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Scheduling. {\bf D.4.4} Software, OPERATING
                 SYSTEMS, Communications Management, Input/output. {\bf
                 D.4.8} Software, OPERATING SYSTEMS, Performance,
                 Measurements. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance, Modeling and prediction. {\bf D.4.8}
                 Software, OPERATING SYSTEMS, Performance, Simulation.",
}
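
%%% Note: the abstract above defines V(R) entirely in prose.  As a reading
%%% aid, here is a minimal Python sketch of the request-selection rule it
%%% describes (cylinder numbers and a +1/-1 scan direction are illustrative
%%% choices; tie-breaking and updating the scan direction are omitted).
%%%
%%%     def effective_distance(req, head, direction, R, cylinders):
%%%         # Requests in the current scan direction cost their physical
%%%         # distance; requests behind the head are penalized by R times
%%%         # the total number of cylinders, as in the abstract.
%%%         dist = abs(req - head)
%%%         ahead = (req - head) * direction >= 0
%%%         return dist if ahead else dist + R * cylinders
%%%
%%%     def next_request(pending, head, direction, R, cylinders):
%%%         # Per the abstract, R = 0 gives SSTF and R = 1 gives SCAN.
%%%         return min(pending,
%%%                    key=lambda req: effective_distance(req, head,
%%%                                                       direction, R,
%%%                                                       cylinders))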

@Article{Smith:1987:RDC,
  author =       "Alan Jay Smith",
  title =        "Remark on {``Disk Cache --- Miss Ratio Analysis and
                 Design Considerations''}",
  journal =      j-TOCS,
  volume =       "5",
  number =       "1",
  pages =        "93--93",
  month =        feb,
  year =         "1987",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1987-5-1/p93-smith/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design",
  subject =      "{\bf D.4.2} Software, OPERATING SYSTEMS, Storage
                 Management, Secondary storage. {\bf B.3.2} Hardware,
                 MEMORY STRUCTURES, Design Styles, Cache memories.",
}

@Article{Watson:1987:GET,
  author =       "Richard W. Watson and Sandy A. Mamrak",
  title =        "Gaining Efficiency in Transport Services by
                 Appropriate Design and Implementation Choices",
  journal =      j-TOCS,
  volume =       "5",
  number =       "2",
  pages =        "97--120",
  month =        may,
  year =         "1987",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1987-5-2/p97-watson/",
  abstract =     "This paper examines transport protocol mechanisms and
                 implementation issues and argues that general-purpose
                 transport protocols can be effective in a wide range of
                 distributed applications because (1) many of the
                 mechanisms used in the special-purpose protocols can
                 also be used in general-purpose protocol designs and
                 implementations, (2) special-purpose designs have
                 hidden costs, and (3) very special operating system
                 environments, overall system loads, application
                 response times, and interaction patterns are required
                 before general-purpose protocols are the main system
                 performance bottlenecks.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Lawrence Livermore Natl Lab, Livermore, CA,
                 USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "computer networks --- Protocols; computer systems,
                 digital; design; Distributed; economics; interprocess
                 communication; performance; standardization; transport
                 layer protocols; transport services",
  subject =      "{\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols,
                 Protocol architecture.",
}

@Article{Joyce:1987:MDS,
  author =       "Jeffrey Joyce and Greg Lomow and Konrad Slind and
                 Brian Unger",
  title =        "Monitoring Distributed Systems",
  journal =      j-TOCS,
  volume =       "5",
  number =       "2",
  pages =        "121--150",
  month =        may,
  year =         "1987",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1987-5-2/p121-joyce/",
  abstract =     "The monitoring of distributed systems involves the
                 collection, interpretation, and display of information
                 concerning the interactions among concurrently
                 executing processes. This information and its display
                 can support the debugging, testing, performance
                 evaluation, and dynamic documentation of distributed
                 systems. General problems associated with monitoring
                 are outlined in this paper, and the architecture of a
                 general purpose, extensible, distributed monitoring
                 system is presented. Three approaches to the display of
                 process interactions are described: textual traces,
                 animated graphical traces, and a combination of aspects
                 of the textual and graphical approaches. The roles that
                 each of these approaches fulfills in monitoring and
                 debugging distributed systems are identified and
                 compared. Monitoring tools for collecting communication
                 statistics, detecting deadlock, controlling the
                 nondeterministic execution of distributed systems, and
                 for using protocol specifications in monitoring are
                 also described.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Univ of Calgary, Calgary, Alberta, Can",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "computer software --- Monitoring; computer systems,
                 digital; concurrent monitoring; deadlock; design;
                 Distributed; distributed monitoring; dynamic
                 documentation; human factors; measurement; protocol
                 specifications",
  subject =      "{\bf D.4.8} Software, OPERATING SYSTEMS, Performance,
                 Monitors. {\bf D.2.2} Software, SOFTWARE ENGINEERING,
                 Design Tools and Techniques, User interfaces. {\bf
                 D.2.5} Software, SOFTWARE ENGINEERING, Testing and
                 Debugging. {\bf D.2.4} Software, SOFTWARE ENGINEERING,
                 Software/Program Verification, Assertion checkers. {\bf
                 D.2.7} Software, SOFTWARE ENGINEERING, Distribution,
                 Maintenance, and Enhancement, Documentation. {\bf
                 D.1.3} Software, PROGRAMMING TECHNIQUES, Concurrent
                 Programming. {\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Distributed applications. {\bf D.2.6} Software,
                 SOFTWARE ENGINEERING, Programming Environments.",
}

@Article{Glasgow:1987:DPF,
  author =       "Janice I. Glasgow and Glenn H. MacEwen",
  title =        "The Development and Proof of a Formal Specification
                 for a Multilevel Secure System",
  journal =      j-TOCS,
  volume =       "5",
  number =       "2",
  pages =        "151--184",
  month =        may,
  year =         "1987",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1987-5-2/p151-glasgow/",
  abstract =     "This paper describes current work on the design and
                 specification of a multilevel secure distributed system
                 called SNet. It discusses security models in general,
                 the various problems of information flows in SNet, and
                 the abstract and concrete security model components for
                 SNet. It also introduces Lucid as a language for
                 specifying distributed systems. The model components
                 are expressed in Lucid; these Lucid partial
                 specifications are shown to be correct with respect to
                 the formal model, and the two model components are
                 shown to be consistent.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Queen's Univ, Kingston, Ont, Can",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "computer operating systems; computer programming
                 languages; computer systems, digital; data processing
                 --- Security of Data; Distributed; formal
                 specification; lucid; multilevel secure system;
                 security; SNet; verification",
  subject =      "{\bf C.2.0} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, General, Security and
                 protection (e.g., firewalls). {\bf D.3.2} Software,
                 PROGRAMMING LANGUAGES, Language Classifications, LUCID.
                 {\bf D.4.6} Software, OPERATING SYSTEMS, Security and
                 Protection, Information flow controls. {\bf C.2.4}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Distributed Systems, SNet.",
}

@Article{Schwan:1987:HPO,
  author =       "Karsten Schwan and Tom Bihari and Bruce W. Weide and
                 Gregor Taulbee",
  title =        "High-Performance Operating System Primitives for
                 Robotics and Real-Time Control Systems",
  journal =      j-TOCS,
  volume =       "5",
  number =       "3",
  pages =        "189--231",
  month =        aug,
  year =         "1987",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1987-5-3/p189-schwan/",
  abstract =     "The Generalized Executive for real-time Multiprocessor
                 applications (GEM) is an operating system that
                 addresses several requirements of operating software.
                 First, when using GEM, programmers can select one of
                 two different types of tasks differing in size, called
                 processes and microprocesses. Second, the scheduling
                 calls offered by GEM permit the implementation of
                 several models of task interaction. Third, GEM supports
                 multiple models of communication with a parameterized
                 communication mechanism. Fourth, GEM is closely coupled
                 to prototype real-time programming environments that
                 provide programming support for the models of
                 computation offered by the operating system. GEM is
                 being used on a multiprocessor with robotics
                 application software of substantial size and
                 complexity.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Ohio State Univ, Columbus, OH, USA",
  classification = "723; 731",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "computer operating systems; computer systems, digital
                 --- Multiprocessing; control systems --- Computer
                 Applications; generalized executive for real-time
                 multiprocessor applications; high-performance operating
                 system primitives; real-time control systems;
                 robotics",
  subject =      "{\bf D.4.7} Software, OPERATING SYSTEMS, Organization
                 and Design, Real-time systems and embedded systems.
                 {\bf C.3} Computer Systems Organization,
                 SPECIAL-PURPOSE AND APPLICATION-BASED SYSTEMS, Process
                 control systems. {\bf C.3} Computer Systems
                 Organization, SPECIAL-PURPOSE AND APPLICATION-BASED
                 SYSTEMS, Real-time and embedded systems. {\bf C.4}
                 Computer Systems Organization, PERFORMANCE OF SYSTEMS,
                 Design studies. {\bf D.4.1} Software, OPERATING
                 SYSTEMS, Process Management,
                 Multiprocessing/multiprogramming/multitasking. {\bf
                 D.4.1} Software, OPERATING SYSTEMS, Process Management,
                 Scheduling. {\bf D.4.4} Software, OPERATING SYSTEMS,
                 Communications Management, Message sending. {\bf D.4.8}
                 Software, OPERATING SYSTEMS, Performance, Measurements.
                 {\bf J.7} Computer Applications, COMPUTERS IN OTHER
                 SYSTEMS, Industrial control. {\bf J.7} Computer
                 Applications, COMPUTERS IN OTHER SYSTEMS, Process
                 control. {\bf J.7} Computer Applications, COMPUTERS IN
                 OTHER SYSTEMS, Real time. {\bf D.4.0} Software,
                 OPERATING SYSTEMS, General.",
}

@Article{Harter:1987:RTL,
  author =       "Paul K. {Harter, Jr.}",
  title =        "Response Times in Level-Structured Systems",
  journal =      j-TOCS,
  volume =       "5",
  number =       "3",
  pages =        "232--248",
  month =        aug,
  year =         "1987",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1987-5-3/p232-harter/",
  abstract =     "Real-time programs are among the most critical
                 programs in use today, yet they are also among the
                 worst understood and the most difficult to verify.
                 Validation of real-time systems is nonetheless
                 extremely important in view of the high costs
                 associated with failure in typical application areas.
                 We present here a method for deriving response-time
                 properties in complex systems with a level structure
                 based on priority. The method involves a level-by-level
                 examination of the system, in which information
                 distilled from each successive level is used to adjust
                 the results for later levels. The results obtained at
                 each level of the system are not affected by later
                 analyses, which obviates having to consider a complex
                 system as a whole.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Univ of Colorado, Boulder, CO, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "computer software --- Reliability; computer systems,
                 digital; design; level-structured systems; performance;
                 Performance; real-time systems; reliability; response
                 times; verification",
  subject =      "{\bf D.4.8} Software, OPERATING SYSTEMS, Performance,
                 Modeling and prediction. {\bf D.4.7} Software,
                 OPERATING SYSTEMS, Organization and Design,
                 Hierarchical design**. {\bf D.4.7} Software, OPERATING
                 SYSTEMS, Organization and Design, Real-time systems and
                 embedded systems. {\bf J.7} Computer Applications,
                 COMPUTERS IN OTHER SYSTEMS, Industrial control. {\bf
                 J.7} Computer Applications, COMPUTERS IN OTHER SYSTEMS,
                 Process control. {\bf J.7} Computer Applications,
                 COMPUTERS IN OTHER SYSTEMS, Real time. {\bf D.2.4}
                 Software, SOFTWARE ENGINEERING, Software/Program
                 Verification, Validation.",
}

@Article{Herlihy:1987:CVA,
  author =       "Maurice Herlihy",
  title =        "Concurrency Versus Availability: Atomicity Mechanisms
                 for Replicated Data",
  journal =      j-TOCS,
  volume =       "5",
  number =       "3",
  pages =        "249--274",
  month =        aug,
  year =         "1987",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1987-5-3/p249-herlihy/",
  abstract =     "A replicated object is a typed data object that is
                 stored redundantly at multiple locations to enhance
                 availability. Most techniques for managing replicated
                 data have a two-level structure: At the higher level, a
                 replica-control protocol reconstructs the object's
                 state from its distributed components, and at the lower
                 level, a standard concurrency-control protocol
                 synchronizes accesses to the individual components.
                 This paper explores an alternative approach to managing
                 replicated data by presenting two replication methods
                 in which concurrency control and replica management are
                 handled by a single integrated protocol. These
                 integrated protocols permit more concurrency than
                 independent protocols, and they allow availability and
                 concurrency to be traded off: Constraints on
                 concurrency may be relaxed if constraints on
                 availability are tightened, and vice versa. In general,
                 constraints on concurrency and availability cannot be
                 minimized simultaneously.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Carnegie-Mellon Univ, Pittsburgh, PA, USA",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "atomicity mechanisms; computer programming ---
                 Algorithms; database systems; replicated data",
  subject =      "{\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language
                 Constructs and Features. {\bf D.4.3} Software,
                 OPERATING SYSTEMS, File Systems Management. {\bf D.4.5}
                 Software, OPERATING SYSTEMS, Reliability. {\bf H.2.4}
                 Information Systems, DATABASE MANAGEMENT, Systems. {\bf
                 H.2.4} Information Systems, DATABASE MANAGEMENT,
                 Systems, Distributed databases. {\bf H.2.4} Information
                 Systems, DATABASE MANAGEMENT, Systems, Transaction
                 processing. {\bf D.4.1} Software, OPERATING SYSTEMS,
                 Process Management, Concurrency.",
}

@Article{Kirkman:1987:OCP,
  author =       "W. Worth Kirkman",
  title =        "An Optimized Contention Protocol for Broadband
                 Networks",
  journal =      j-TOCS,
  volume =       "5",
  number =       "3",
  pages =        "275--283",
  month =        aug,
  year =         "1987",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1987-5-3/p275-kirkman/",
  abstract =     "This paper describes the concepts underlying an
                 alternative link-level protocol for broadband local
                 networks. The protocol uses implicit slotting of the
                 contention channel to support larger networks, improve
                 performance, and provide reliable distributed collision
                 recognition without reinforcement. It is designed such
                 that compatible interfaces to existing CSMA\slash
                 CD-based systems can be provided.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "MITRE Corp, McLean, VA, USA",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; broadband networks; carrier sense multiple
                 access/collision detection network; computer networks;
                 CSMA/CD-based systems; data transmission; local
                 networks; optimized contention protocol; performance;
                 Protocols",
  subject =      "{\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols.
                 {\bf C.2.1} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Architecture
                 and Design, Packet-switching networks.",
}

@Article{Sanders:1987:ISD,
  author =       "Beverly A. Sanders",
  title =        "The Information Structure of Distributed Mutual
                 Exclusion Algorithms",
  journal =      j-TOCS,
  volume =       "5",
  number =       "3",
  pages =        "284--299",
  month =        aug,
  year =         "1987",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1987-5-3/p284-sanders/",
  abstract =     "The concept of an information structure is introduced
                 as a unifying principle behind several of the numerous
                 algorithms that have been proposed for the distributed
                 mutual exclusion problem. This approach allows the
                 development of a generalized mutual exclusion algorithm
                 that accepts a particular information structure at
                 initialization and realizes both known and new
                 algorithms as special cases. Two simple performance
                 metrics of a realized algorithm can be obtained
                 directly from the information structure. A new failure
                 recovery mechanism called local recovery, which
                 requires no coordination between nodes and no
                 additional messages beyond that needed for failure
                 detection, is introduced.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Univ of Maryland, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; computer programming --- Algorithms;
                 computer systems, digital; design; Distributed;
                 distributed mutual exclusion algorithms; failure
                 recovery; local recovery; performance; reliability;
                 theory",
  subject =      "{\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Network operating systems. {\bf C.4} Computer Systems
                 Organization, PERFORMANCE OF SYSTEMS, Reliability,
                 availability, and serviceability. {\bf D.4.1} Software,
                 OPERATING SYSTEMS, Process Management, Mutual
                 exclusion. {\bf D.4.5} Software, OPERATING SYSTEMS,
                 Reliability, Fault-tolerance.",
}

@Article{Thiebaut:1987:FC,
  author =       "Dominique Thiebaut and Harold S. Stone",
  title =        "Footprints in the Cache",
  journal =      j-TOCS,
  volume =       "5",
  number =       "4",
  pages =        "305--329",
  month =        nov,
  year =         "1987",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1987-5-4/p305-thiebaut/",
  abstract =     "This paper develops an analytical model for
                 cache-reload transients and compares the model to
                 observations based on several address traces. The
                 cache-reload transient is the set of cache misses that
                 occur when a process is reinitiated after being
                 suspended temporarily. For example, an interrupt
                 program that runs periodically experiences a reload
                 transient at each initiation. The reload transient
                 depends on the cache size and on the sizes of the
                 footprints in the cache of the competing programs,
                 where a program footprint is defined to be the set of
                 lines in the cache in active use by the program. The
                 model shows that the size of the transient is related
                 to the normal distribution function. A simulation based
                 on program-address traces shows excellent agreement
                 between the model and the observations.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Univ of Massachusetts, MA, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "address traces; cache-reload transients; computer
                 architecture; computer operating systems --- Storage
                 Allocation; data storage units; design;
                 experimentation; memory structures; performance;
                 program footprint; theory; trace-driven simulation",
  subject =      "{\bf B.3.2} Hardware, MEMORY STRUCTURES, Design
                 Styles, Cache memories. {\bf B.3.3} Hardware, MEMORY
                 STRUCTURES, Performance Analysis and Design Aids**,
                 Formal models**. {\bf B.3.3} Hardware, MEMORY
                 STRUCTURES, Performance Analysis and Design Aids**,
                 Simulation**. {\bf C.4} Computer Systems Organization,
                 PERFORMANCE OF SYSTEMS, Modeling techniques. {\bf
                 D.4.1} Software, OPERATING SYSTEMS, Process Management,
                 Multiprocessing/multiprogramming/multitasking. {\bf
                 D.4.2} Software, OPERATING SYSTEMS, Storage Management,
                 Swapping**.",
}

@Article{Falcone:1987:PIL,
  author =       "Joseph R. Falcone",
  title =        "A Programmable Interface Language for Heterogeneous
                 Distributed Systems",
  journal =      j-TOCS,
  volume =       "5",
  number =       "4",
  pages =        "330--351",
  month =        nov,
  year =         "1987",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1987-5-4/p330-falcone/",
  abstract =     "The performance requirements of systems of
                 personal-computer workstations place a strain on
                 traditional approaches to network architecture. The
                 integration of diverse systems into this environment
                 introduces functional compatibility issues that are not
                 present in homogeneous networks. This paper proposes a
                 distributed system architecture in which communication
                 follows a programming paradigm. In this architecture a
                 programming language provides remote service interfaces
                 for the heterogeneous distributed system environment.
                 This language is a flexible and efficient medium for
                 implementing service function protocols. In essence,
                 clients and servers communicate by programming one
                 another.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Digital Equipment Corp",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "computer interfaces; computer networks --- Local
                 Networks; computer programming languages; computer
                 systems, digital --- Distributed; computers, personal;
                 heterogeneous distributed systems; personal computer
                 workstation networks; programmable interface language",
}

@Article{Koch:1987:DFA,
  author =       "Philip D. L. Koch",
  title =        "Disk File Allocation Based on the Buddy System",
  journal =      j-TOCS,
  volume =       "5",
  number =       "4",
  pages =        "352--370",
  month =        nov,
  year =         "1987",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1987-5-4/p352-koch/",
  abstract =     "A variant of the binary buddy system that reduces
                 fragmentation is described. Files are allocated in up
                 to t extents, and suboptimally allocated files are
                 periodically reallocated. The Dartmouth Time-Sharing
                 System (DTSS) uses this method. Several installations,
                 representing different classes of workload, are studied
                 to measure the method's performance. The results
                 indicate that compared to the file layout method used
                 by UNIX, the buddy system results in more efficient
                 access but less efficient utilization of disk space. As
                 disks become larger and less expensive per byte,
                 strategies that achieve efficient I/O throughput at the
                 expense of some storage loss become increasingly
                 attractive.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Dartmouth Coll, USA",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "buddy system; computer operating systems; data
                 processing --- File Organization; disk file allocation;
                 dynamic memory management; dynamic storage allocation;
                 file system design; measurement; performance; Storage
                 Allocation",
  subject =      "{\bf D.4.3} Software, OPERATING SYSTEMS, File Systems
                 Management, File organization. {\bf D.4.3} Software,
                 OPERATING SYSTEMS, File Systems Management, Access
                 methods. {\bf D.4.2} Software, OPERATING SYSTEMS,
                 Storage Management, Allocation/deallocation strategies.
                 {\bf D.4.2} Software, OPERATING SYSTEMS, Storage
                 Management, Secondary storage. {\bf D.4.8} Software,
                 OPERATING SYSTEMS, Performance, Measurements. {\bf
                 H.3.2} Information Systems, INFORMATION STORAGE AND
                 RETRIEVAL, Information Storage, File organization. {\bf
                 E.5} Data, FILES, Organization/structure.",
}

@Article{Herzberg:1987:PPS,
  author =       "Amir Herzberg and Shlomit S. Pinter",
  title =        "Public Protection of Software",
  journal =      j-TOCS,
  volume =       "5",
  number =       "4",
  pages =        "371--393",
  month =        nov,
  year =         "1987",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1987-5-4/p371-herzberg/",
  abstract =     "One of the overwhelming problems that software
                 producers must contend with is the unauthorized use and
                 distribution of their products. Copyright laws
                 concerning software are rarely enforced, thereby
                 causing major losses to the software companies.
                 Technical means of protecting software from illegal
                 duplication are required, but the available means are
                 imperfect. We present protocols that enable software
                 protection, without causing substantial overhead in
                 distribution and maintenance. The protocols may be
                 implemented by a conventional cryptosystem, such as the
                 DES, or by a public key cryptosystem, such as the RSA.
                 Both implementations are proved to satisfy required
                 security criteria.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Technion-Israel Inst of Technology, Isr",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; computer software; cryptographic
                 protocols; cryptography; design; Protection; public key
                 cryptosystems; security; security protocols; single key
                 cryptosystems",
  subject =      "{\bf D.4.6} Software, OPERATING SYSTEMS, Security and
                 Protection, Cryptographic controls. {\bf K.5.1}
                 Computing Milieux, LEGAL ASPECTS OF COMPUTING,
                 Hardware/Software Protection. {\bf E.3} Data, DATA
                 ENCRYPTION, Public key cryptosystems. {\bf D.4.6}
                 Software, OPERATING SYSTEMS, Security and Protection.",
}

@Article{Babaoglu:1987:RCB,
  author =       "{\"O}zalp Babao{\u{g}}lu",
  title =        "On the Reliability of Consensus-Based Fault-Tolerant
                 Distributed Computing Systems",
  journal =      j-TOCS,
  volume =       "5",
  number =       "4",
  pages =        "394--416",
  month =        nov,
  year =         "1987",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1987-5-4/p394-babaoglu/",
  abstract =     "Using a stochastic model of processor failure times,
                 we investigate design choices such as replication
                 level, protocol running time, randomized versus
                 deterministic protocols, fault detection, and
                 authentication. We use the probability with which a
                 system produces the correct output as our evaluation
                 criterion. This contrasts with previous fault-tolerance
                 results that guarantee correctness only if the
                 percentage of faulty processors in the system can be
                 bounded. Our results reveal some subtle and
                 counterintuitive interactions between the design
                 parameters and system reliability.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Cornell Univ, USA",
  classification = "722; 723; 913",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "Byzantine agreement; computer systems, digital;
                 design; deterministic protocols; distributed consensus;
                 Fault Tolerant Capability; fault-tolerant distributed
                 system; performance; randomized protocols;
                 reliability",
  subject =      "{\bf B.1.3} Hardware, CONTROL STRUCTURES AND
                 MICROPROGRAMMING, Control Structure Reliability,
                 Testing, and Fault-Tolerance**, Redundant design**.
                 {\bf B.3.4} Hardware, MEMORY STRUCTURES, Reliability,
                 Testing, and Fault-Tolerance**, Redundant design**.
                 {\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems.
                 {\bf C.4} Computer Systems Organization, PERFORMANCE OF
                 SYSTEMS, Reliability, availability, and
                 serviceability.",
}

@Article{Jones:1988:PSI,
  author =       "Anita K. Jones",
  title =        "Preface: Special Issue on Operating Systems
                 Principles",
  journal =      j-TOCS,
  volume =       "6",
  number =       "1",
  pages =        "1--2",
  month =        feb,
  year =         "1988",
  bibdate =      "Thu Jan 14 11:09:14 1999",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Terry:1988:MSV,
  author =       "Douglas B. Terry and Daniel C. Swinehart",
  title =        "Managing Stored Voice in the {Etherphone} System",
  journal =      j-TOCS,
  volume =       "6",
  number =       "1",
  pages =        "3--27",
  month =        feb,
  year =         "1988",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1988-6-1/p3-terry/",
  abstract =     "The voice manager in the Etherphone system provides
                 facilities for recording, editing, and playing stored
                 voice in a distributed personal-computing environment.
                 To facilitate sharing, the voice manager stores voice
                 on a special voice file server that is accessible via
                 the local internet. Operations for editing a passage of
                 recorded voice simply build persistent data structures
                 to represent the edited voice. These data structures,
                 implementing an abstraction called voice ropes, are
                 stored in a server database and consist of lists of
                 intervals within voice files. Clients refer to voice
                 ropes solely by reference. Interests, additional
                 persistent data structures maintained by the server,
                 provide a sort of directory service for managing the
                 voice ropes that have been created as well as a
                 reliable reference-counting mechanism, permitting the
                 garbage collection of voice ropes that are no longer
                 needed.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "XEROX Palo Alto Research Cent, Palo Alto, CA,
                 USA",
  classification = "718; 723",
  conference =   "1987 ACM\slash SIGOPS Symposium on Operating Systems
                 Principles.",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "computer networks; computer operating systems;
                 computer systems, digital --- Distributed; data
                 processing --- Data Structures; design; digital
                 communication systems; electronic mail; Etherphone
                 system; management; performance; security; voice
                 editing; voice file server; voice manager; Voice/Data
                 Integrated Services",
  sponsor =      "ACM, Special Interest Group on Operating Systems, New
                 York, NY, USA",
  subject =      "{\bf H.4.3} Information Systems, INFORMATION SYSTEMS
                 APPLICATIONS, Communications Applications. {\bf D.4.2}
                 Software, OPERATING SYSTEMS, Storage Management,
                 Allocation/deallocation strategies. {\bf D.4.2}
                 Software, OPERATING SYSTEMS, Storage Management,
                 Storage hierarchies. {\bf D.4.3} Software, OPERATING
                 SYSTEMS, File Systems Management. {\bf D.4.6} Software,
                 OPERATING SYSTEMS, Security and Protection, Access
                 controls. {\bf D.4.6} Software, OPERATING SYSTEMS,
                 Security and Protection, Cryptographic controls. {\bf
                 E.2} Data, DATA STORAGE REPRESENTATIONS. {\bf H.2.8}
                 Information Systems, DATABASE MANAGEMENT, Database
                 Applications. {\bf H.4.3} Information Systems,
                 INFORMATION SYSTEMS APPLICATIONS, Communications
                 Applications, Electronic mail. {\bf C.2.4} Computer
                 Systems Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems.",
}

@Article{Chang:1988:SAP,
  author =       "Albert Chang and Mark F. Mergen",
  title =        "801 Storage: Architecture and Programming",
  journal =      j-TOCS,
  volume =       "6",
  number =       "1",
  pages =        "28--50",
  month =        feb,
  year =         "1988",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1988-6-1/p28-chang/",
  abstract =     "Based on novel architecture, the 801 minicomputer
                 project has developed a low-level storage manager that
                 can significantly simplify storage programming in
                 subsystems and applications. The storage manager
                 embodies three ideas: (1) large virtual storage, to
                 contain all temporary data and permanent files for the
                 active programs; (2) the innovation of database
                 storage, which has implicit properties of access
                 serializability and atomic update, similar to those of
                 database transaction systems; and (3) access to all
                 storage, including files, by the usual operations and
                 types of a high-level programming language. The IBM RT
                 PC implements the hardware architecture necessary for
                 these storage facilities in its storage controller
                 (MMU). The storage manager and language elements
                 required, as well as subsystems and applications that
                 use them, have been implemented and studied in a
                 prototype operating system called CPR that runs on the
                 RT PC. Low cost and good performance are achieved in
                 both hardware and software. The design is intended to
                 be extensible across a wide performance\slash cost
                 spectrum.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "IBM T. J. Watson Research Cent, Yorktown
                 Heights, NY, USA",
  classification = "723",
  conference =   "1987 ACM\slash SIGOPS Symposium on Operating Systems
                 Principles.",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "801 minicomputer project; algorithms; computer
                 architecture; computer operating systems; computers,
                 minicomputer; CPR operating system; design;
                 experimentation; IBM RT PC; low-level storage manager;
                 performance",
  sponsor =      "ACM, Special Interest Group on Operating Systems, New
                 York, NY, USA",
  subject =      "{\bf B.3.2} Hardware, MEMORY STRUCTURES, Design
                 Styles, Virtual memory. {\bf C.1.1} Computer Systems
                 Organization, PROCESSOR ARCHITECTURES, Single Data
                 Stream Architectures, RISC. {\bf D.3.3} Software,
                 PROGRAMMING LANGUAGES, Language Constructs and
                 Features. {\bf D.4.2} Software, OPERATING SYSTEMS,
                 Storage Management. {\bf D.4.3} Software, OPERATING
                 SYSTEMS, File Systems Management. {\bf D.4.7} Software,
                 OPERATING SYSTEMS, Organization and Design.",
}

@Article{Howard:1988:SPD,
  author =       "John H. Howard and Michael L. Kazar and Sherri G.
                 Menees and David A. Nichols and M. Satyanarayanan and
                 Robert N. Sidebotham and Michael J. West",
  title =        "Scale and Performance in a Distributed File System",
  journal =      j-TOCS,
  volume =       "6",
  number =       "1",
  pages =        "51--81",
  month =        feb,
  year =         "1988",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1988-6-1/p51-howard/",
  abstract =     "The Andrew File System is a location-transparent
                 distributed file system that will eventually span more
                 than 5000 workstations at Carnegie Mellon University.
                 Large scale affects performance and complicates system
                 operation. In this paper we present observations of a
                 prototype implementation, motivate changes in the areas
                 of cache validation, server process structure, name
                 translation, and low-level storage representation, and
                 quantitatively demonstrate Andrew's ability to scale
                 gracefully. We establish the importance of whole-file
                 transfer and caching in Andrew by comparing its
                 performance with that of Sun Microsystem's NFS file
                 system. We also show how the aggregation of files into
                 volumes improves the operability of the system.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Carnegie Mellon Univ, Pittsburgh, PA, USA",
  classification = "723",
  conference =   "1987 ACM\slash SIGOPS Symposium on Operating Systems
                 Principles.",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "Andrew File System; computer operating systems;
                 computer systems, digital --- Distributed; design;
                 distributed file system; experimentation; file
                 transfer; measurement; performance",
  sponsor =      "ACM, Special Interest Group on Operating Systems, New
                 York, NY, USA",
  subject =      "{\bf D.4.3} Software, OPERATING SYSTEMS, File Systems
                 Management, Distributed file systems. {\bf D.4.8}
                 Software, OPERATING SYSTEMS, Performance,
                 Measurements.",
}

@Article{Haskin:1988:RMQ,
  author =       "Roger Haskin and Yoni Malachi and Wayne Sawdon and
                 Gregory Chan",
  title =        "Recovery Management in {QuickSilver}",
  journal =      j-TOCS,
  volume =       "6",
  number =       "1",
  pages =        "82--108",
  month =        feb,
  year =         "1988",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1988-6-1/p82-haskin/",
  abstract =     "This paper describes QuickSilver, which uses atomic
                 transactions as a unified failure recovery mechanism
                 for a client-server structured distributed system.
                 Transactions allow failure atomicity for related
                 activities at a single server or at a number of
                 independent servers. Rather than bundling transaction
                 management into a dedicated language or recoverable
                 object manager, QuickSilver exposes the basic commit
                 protocol and log recovery primitives, allowing clients
                 and servers to tailor their recovery techniques to
                 their specific needs. Servers can implement their own
                 log recovery protocols rather than being required to
                 use a system-defined protocol. These decisions allow
                 servers to make their own choices to balance
                 simplicity, efficiency, and recoverability.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "IBM, Almaden Research Cent, San Jose, CA, USA",
  classification = "723",
  conference =   "1987 ACM\slash SIGOPS Symposium on Operating Systems
                 Principles.",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "atomic transactions; computer operating systems;
                 computer systems, digital --- Distributed; design;
                 experimentation; failure atomicity; performance;
                 QuickSilver; recovery management; reliability",
  sponsor =      "ACM, Special Interest Group on Operating Systems, New
                 York, NY, USA",
  subject =      "{\bf D.4.3} Software, OPERATING SYSTEMS, File Systems
                 Management, Distributed file systems. {\bf D.4.3}
                 Software, OPERATING SYSTEMS, File Systems Management,
                 File organization. {\bf D.4.3} Software, OPERATING
                 SYSTEMS, File Systems Management, Maintenance**. {\bf
                 D.4.5} Software, OPERATING SYSTEMS, Reliability,
                 Fault-tolerance. {\bf H.2.4} Information Systems,
                 DATABASE MANAGEMENT, Systems, QuickSilver. {\bf D.4.5}
                 Software, OPERATING SYSTEMS, Reliability,
                 Checkpoint/restart. {\bf H.2.4} Information Systems,
                 DATABASE MANAGEMENT, Systems, Distributed databases.
                 {\bf H.2.4} Information Systems, DATABASE MANAGEMENT,
                 Systems, Transaction processing. {\bf H.2.2}
                 Information Systems, DATABASE MANAGEMENT, Physical
                 Design, Recovery and restart.",
}

@Article{Jul:1988:FGM,
  author =       "Eric Jul and Henry Levy and Norman Hutchinson and
                 Andrew Black",
  title =        "Fine-Grained Mobility in the {Emerald} System",
  journal =      j-TOCS,
  volume =       "6",
  number =       "1",
  pages =        "109--133",
  month =        feb,
  year =         "1988",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1988-6-1/p109-jul/",
  abstract =     "Emerald is an object-based language and system
                 designed for the construction of distributed programs.
                 An explicit goal of Emerald is support for object
                 mobility; objects in Emerald can freely move within the
                 system to take advantage of distribution and
                 dynamically changing environments. We say that Emerald
                 has fine-grained mobility because Emerald objects can
                 be small data objects as well as process objects.
                 Fine-grained mobility allows us to apply mobility in
                 new ways but presents implementation problems as well.
                 This paper discusses the benefits of fine-grained
                 mobility, the Emerald language and run-time mechanisms
                 that support mobility, and techniques for implementing
                 mobility that do not degrade the performance of local
                 operations. Performance measurements of the current
                 implementation are included.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Univ of Washington, Seattle, WA, USA",
  classification = "723",
  conference =   "1987 ACM\slash SIGOPS Symposium on Operating Systems
                 Principles.",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "computer operating systems; computer programming
                 languages; computer systems, digital --- Distributed;
                 design; distributed languages; emerald; languages;
                 measurement; object-oriented languages; performance;
                 process mobility",
  sponsor =      "ACM, Special Interest Group on Operating Systems, New
                 York, NY, USA",
  subject =      "{\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Distributed applications. {\bf D.2.6} Software,
                 SOFTWARE ENGINEERING, Programming Environments. {\bf
                 D.3.3} Software, PROGRAMMING LANGUAGES, Language
                 Constructs and Features, Abstract data types. {\bf
                 D.3.3} Software, PROGRAMMING LANGUAGES, Language
                 Constructs and Features, Control structures. {\bf
                 D.4.7} Software, OPERATING SYSTEMS, Organization and
                 Design, Distributed systems. {\bf D.3.2} Software,
                 PROGRAMMING LANGUAGES, Language Classifications,
                 Emerald.",
}

@Article{Nelson:1988:CSN,
  author =       "Michael N. Nelson and Brent B. Welch and John K.
                 Ousterhout",
  title =        "Caching in the {Sprite} Network File System",
  journal =      j-TOCS,
  volume =       "6",
  number =       "1",
  pages =        "134--154",
  month =        feb,
  year =         "1988",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1988-6-1/p134-nelson/",
  abstract =     "The Sprite network operating system uses large
                 main-memory disk block caches to achieve high
                 performance in its file system. It provides
                 non-write-through file caching on both client and
                 server machines. A simple cache consistency mechanism
                 permits files to be shared by multiple clients without
                 danger of stale data. In order to allow the file cache
                 to occupy as much memory as possible, the file system
                 of each machine negotiates with the virtual memory
                 system over physical memory usage and changes the size
                 of the file cache dynamically. Benchmark programs
                 indicate that client caches allow diskless Sprite
                 workstations to perform within 0-12 percent of
                 workstations with disks. In addition, client caching
                 reduces server loading by 50 percent and network
                 traffic by 90 percent.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Univ of California at Berkeley, Berkeley, CA,
                 USA",
  classification = "723",
  conference =   "1987 ACM\slash SIGOPS Symposium on Operating Systems
                 Principles.",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "cache consistency; computer operating systems;
                 computer systems, digital --- Distributed; design;
                 distributed file caching; distributed file systems;
                 measurement; performance; sprite network",
  sponsor =      "ACM, Special Interest Group on Operating Systems, New
                 York, NY",
  subject =      "{\bf D.4.2} Software, OPERATING SYSTEMS, Storage
                 Management. {\bf D.4.3} Software, OPERATING SYSTEMS,
                 File Systems Management, Distributed file systems. {\bf
                 D.4.7} Software, OPERATING SYSTEMS, Organization and
                 Design, Distributed systems. {\bf D.4.8} Software,
                 OPERATING SYSTEMS, Performance, Measurements. {\bf
                 D.4.2} Software, OPERATING SYSTEMS, Storage Management,
                 Distributed memories. {\bf D.4.2} Software, OPERATING
                 SYSTEMS, Storage Management, Main memory. {\bf D.4.2}
                 Software, OPERATING SYSTEMS, Storage Management,
                 Secondary storage. {\bf D.4.2} Software, OPERATING
                 SYSTEMS, Storage Management, Virtual memory.",
}

@Article{Snodgrass:1988:RAM,
  author =       "Richard Snodgrass",
  title =        "A Relational Approach to Monitoring Complex Systems",
  journal =      j-TOCS,
  volume =       "6",
  number =       "2",
  pages =        "157--196",
  month =        may,
  year =         "1988",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1988-6-2/p157-snodgrass/",
  abstract =     "Traditional monitoring techniques are inadequate when
                 monitoring complex systems such as multiprocessors or
                 distributed systems. A new approach is described in
                 which a historical database forms the conceptual basis
                 for the information processed by the monitor. This
                 approach permits advances in specifying the low-level
                 data collection, specifying the analysis of the
                 collected data, performing the analysis, and displaying
                 the results. Two prototype implementations demonstrate
                 the feasibility of the approach.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Univ of North Carolina, NC, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "computer systems, digital; data processing --- Data
                 Reduction and Analysis; database systems ---
                 Relational; design; distributed systems;
                 experimentation; languages; low-level data collection;
                 measurement; Monitoring; multiprocessors; performance",
  subject =      "{\bf D.4.8} Software, OPERATING SYSTEMS, Performance,
                 Monitors. {\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Distributed applications. {\bf D.2.6} Software,
                 SOFTWARE ENGINEERING, Programming Environments. {\bf
                 D.4.8} Software, OPERATING SYSTEMS, Performance,
                 Measurements. {\bf D.2.5} Software, SOFTWARE
                 ENGINEERING, Testing and Debugging. {\bf H.2.3}
                 Information Systems, DATABASE MANAGEMENT, Languages,
                 Query languages. {\bf H.2.3} Information Systems,
                 DATABASE MANAGEMENT, Languages, QUEL. {\bf H.2.1}
                 Information Systems, DATABASE MANAGEMENT, Logical
                 Design, Data models.",
}

@Article{Sandhu:1988:NTD,
  author =       "Ravinderpal S. Sandhu",
  title =        "The {NTree}: a Two Dimension Partial Order for
                 Protection Groups",
  journal =      j-TOCS,
  volume =       "6",
  number =       "2",
  pages =        "197--222",
  month =        may,
  year =         "1988",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1988-6-2/p197-sandhu/",
  abstract =     "The benefits of providing access control with groups
                 of users rather than with individuals as the unit of
                 granularity are enhanced if the groups are organized in
                 a subgroup partial order. A class of such partial
                 orders, called ntrees, is defined by using a forest of
                 rooted trees or inverted rooted trees as basic partial
                 orders and combining these by refinement. Refinement
                 explodes an existing group into a partially ordered
                 ntree of new groups while maintaining the same
                 relationship between each new group and the nonexploded
                 groups that the exploded group had. Examples are
                 discussed to show the practical significance of ntrees
                 and the refinement operation.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Ohio State Univ, OH, USA",
  classification = "722; 723; 921",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "access control; computer systems, digital; data
                 processing --- Security of Data; design; management;
                 mathematical techniques --- Trees; ntree; protection
                 groups; security; theory; two-dimensional partial
                 order",
  subject =      "{\bf H.2.0} Information Systems, DATABASE MANAGEMENT,
                 General, Security, integrity, and protection**. {\bf
                 D.4.6} Software, OPERATING SYSTEMS, Security and
                 Protection. {\bf K.6.m} Computing Milieux, MANAGEMENT
                 OF COMPUTING AND INFORMATION SYSTEMS, Miscellaneous,
                 Security*. {\bf H.3.3} Information Systems, INFORMATION
                 STORAGE AND RETRIEVAL, Information Search and
                 Retrieval, Search process. {\bf I.2.8} Computing
                 Methodologies, ARTIFICIAL INTELLIGENCE, Problem
                 Solving, Control Methods, and Search, Graph and tree
                 search strategies.",
}

@Article{Gross:1988:MEM,
  author =       "Thomas R. Gross and John L. Hennessy and Steven A.
                 Przybylski and Christopher Rowen",
  title =        "Measurement and Evaluation of the {MIPS} Architecture
                 and Processor",
  journal =      j-TOCS,
  volume =       "6",
  number =       "3",
  pages =        "229--257",
  month =        aug,
  year =         "1988",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1988-6-3/p229-gross/",
  abstract =     "MIPS is a 32-bit processor architecture that has been
                 implemented as an nMOS VLSI chip. The instruction set
                 architecture is RISC-based. Close coupling with
                 compilers and efficient use of the instruction set by
                 compiled programs were goals of the architecture. The
                 MIPS architecture requires that the software implement
                 some constraints in the design that are normally
                 considered part of the hardware implementation. This
                 power presents experimental results on the
                 effectiveness of this processor as a program host.
                 Using sets of large and small benchmarks, the
                 instruction and operand usage patterns are examined
                 both for optimized and unoptimized code.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Stanford Univ, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "32-bit microprocessor; benchmarks; computer
                 architecture --- Performance; computers, microcomputer;
                 design; Evaluation; experimentation; measurement; MIPS;
                 performance",
  subject =      "{\bf C.1.1} Computer Systems Organization, PROCESSOR
                 ARCHITECTURES, Single Data Stream Architectures,
                 Pipeline processors**. {\bf C.0} Computer Systems
                 Organization, GENERAL, Instruction set design. {\bf
                 C.4} Computer Systems Organization, PERFORMANCE OF
                 SYSTEMS, Design studies. {\bf C.5.4} Computer Systems
                 Organization, COMPUTER SYSTEM IMPLEMENTATION, VLSI
                 Systems.",
}

@Article{Gifford:1988:RPP,
  author =       "David K. Gifford and Nathan Glasser",
  title =        "Remote Pipes and Procedures for Efficient Distributed
                 Communication",
  journal =      j-TOCS,
  volume =       "6",
  number =       "3",
  pages =        "258--283",
  month =        aug,
  year =         "1988",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1988-6-3/p258-gifford/",
  abstract =     "We describe a new communications model for distributed
                 systems that combines the advantages of remote
                 procedure call with the efficient transfer of bulk
                 data. Three ideas form the basis of this model. First,
                 remote procedures are first-class values which can be
                 freely exchanged among nodes, thus enabling a greater
                 variety of protocols to be directly implemented in a
                 remote procedure call framework. Second, a new type of
                 abstract object, called a pipe, allows bulk data and
                 incremental results to be efficiently transported in a
                 type-safe manner. Third, the relative sequencing of
                 pipes and procedures can be controlled by combining
                 them into channel groups. Calls on the members of a
                 channel group are guaranteed to be processed in order.
                 Application experience with this model, which we call
                 the Channel Model, is reported. Derived performance
                 bounds and experimental measures are presented.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "MIT, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "bulk data transfer; channel model; computer systems,
                 digital; data transmission; design; Distributed;
                 performance; performance bounds; remote procedure
                 call",
  subject =      "{\bf C.2.1} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Architecture
                 and Design, Network communications. {\bf C.2.4}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Distributed Systems, Distributed
                 applications. {\bf C.4} Computer Systems Organization,
                 PERFORMANCE OF SYSTEMS, Performance attributes.",
}

@Article{Johnson:1988:SSR,
  author =       "Dale M. Johnson and F. Javier Thayer",
  title =        "Stating Security Requirements with Tolerable Sets",
  journal =      j-TOCS,
  volume =       "6",
  number =       "3",
  pages =        "284--295",
  month =        aug,
  year =         "1988",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1988-6-3/p284-johnson/",
  abstract =     "This paper introduces and develops the concept of
                 tolerable sets for analyzing general security
                 requirements. Tolerable sets, and corresponding purging
                 functions and invisibility based on the sets, are used
                 to state and test such requirements. Some particular
                 applications are described, and some critical remarks
                 about purging functions are included.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "MITRE Corp, USA",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "computer security requirements; computer systems,
                 digital; data processing; purging functions; security;
                 Security of Data; tolerable sets; verification",
  subject =      "{\bf D.4.6} Software, OPERATING SYSTEMS, Security and
                 Protection, Information flow controls. {\bf D.4.6}
                 Software, OPERATING SYSTEMS, Security and Protection,
                 Security kernels**. {\bf D.4.6} Software, OPERATING
                 SYSTEMS, Security and Protection, Verification**. {\bf
                 F.3.1} Theory of Computation, LOGICS AND MEANINGS OF
                 PROGRAMS, Specifying and Verifying and Reasoning about
                 Programs, Specification techniques.",
}

@Article{Colwell:1988:PEA,
  author =       "Robert P. Colwell and Edward F. Gehringer and E.
                 Douglas Jensen",
  title =        "Performance Effects of Architectural Complexity in the
                 {Intel 432}",
  journal =      j-TOCS,
  volume =       "6",
  number =       "3",
  pages =        "296--339",
  month =        aug,
  year =         "1988",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1988-6-3/p296-colwell/",
  abstract =     "The Intel 432 is noteworthy as an architecture
                 incorporating a large amount of functionality that most
                 other systems perform in software. This paper examines
                 the performance impact of the incorporation of several
                 kinds of functionality. Among these are the addressing
                 structure, the caches, instruction alignment, the
                 buses, and the way that garbage collection is handled.
                 A set of several benchmarks is used to quantify the
                 performance effect of each of these decisions. The
                 results indicate that the 432 could have been speeded
                 up very significantly if a small number of
                 implementation decisions had been made differently, and
                 if incrementally better technology had been used in its
                 construction.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "Multiflow Computer Inc",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "architectural complexity; computer programming;
                 computer systems, digital --- Parallel Processing;
                 design; Intel 432; measurement; object-based
                 programming environment; performance; Performance;
                 security",
  subject =      "{\bf C.4} Computer Systems Organization, PERFORMANCE
                 OF SYSTEMS, Design studies. {\bf B.5.m} Hardware,
                 REGISTER-TRANSFER-LEVEL IMPLEMENTATION, Miscellaneous.
                 {\bf C.1.1} Computer Systems Organization, PROCESSOR
                 ARCHITECTURES, Single Data Stream Architectures,
                 Single-instruction-stream, single-data-stream
                 processors (SISD)**. {\bf C.1.2} Computer Systems
                 Organization, PROCESSOR ARCHITECTURES, Multiple Data
                 Stream Architectures (Multiprocessors),
                 Multiple-instruction-stream, multiple-data-stream
                 processors (MIMD). {\bf C.1.2} Computer Systems
                 Organization, PROCESSOR ARCHITECTURES, Multiple Data
                 Stream Architectures (Multiprocessors), Parallel
                 processors**. {\bf C.1.3} Computer Systems
                 Organization, PROCESSOR ARCHITECTURES, Other
                 Architecture Styles, Capability architectures**. {\bf
                 C.1.3} Computer Systems Organization, PROCESSOR
                 ARCHITECTURES, Other Architecture Styles, High-level
                 language architectures**. {\bf C.1.3} Computer Systems
                 Organization, PROCESSOR ARCHITECTURES, Other
                 Architecture Styles, Stack-oriented processors**. {\bf
                 D.3.4} Software, PROGRAMMING LANGUAGES, Processors,
                 Compilers.",
}

@Article{Peterson:1988:PNS,
  author =       "Larry L. Peterson",
  title =        "The {Profile} Naming Service",
  journal =      j-TOCS,
  volume =       "6",
  number =       "4",
  pages =        "341--364",
  month =        nov,
  year =         "1988",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1988-6-4/p341-peterson/",
  abstract =     "Profile is a descriptive naming service used to
                 identify users and organizations. This paper presents a
                 structural overview of Profile's three major
                 components: a confederation of attribute-based name
                 servers, a name space abstraction that unifies the name
                 servers, and a user interface that integrates the name
                 space with existing naming systems. Each name server is
                 an independent authority that allows clients to
                 describe users and organizations with a multiplicity of
                 attributes; the name space abstraction is a client
                 program that implements a discipline for searching a
                 sequence of name servers; and the interface provides a
                 tool with which users build customized commands.
                 Experience with an implementation in the DARPA\slash
                 NSF Internet demonstrates that Profile is a feasible
                 and effective mechanism for naming users and
                 organizations in a large internet.",
  acknowledgement = ack-nhfb,
  affiliation =  "Univ of Arizona",
  affiliationaddress = "Tucson, AZ, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "Attribute-Based Name Servers; Computer Networks;
                 Computer Programming --- Algorithms; Computer Systems,
                 Digital; DARPA-NSF Internet; Database Systems ---
                 Distributed; design; Distributed; human factors; Name
                 Space Abstraction; Naming Service; Profile; User
                 Interface",
  subject =      "{\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Distributed databases. {\bf H.2.4} Information Systems,
                 DATABASE MANAGEMENT, Systems, Distributed databases.
                 {\bf H.3.4} Information Systems, INFORMATION STORAGE
                 AND RETRIEVAL, Systems and Software, Question-answering
                 (fact retrieval) systems**. {\bf H.3.3} Information
                 Systems, INFORMATION STORAGE AND RETRIEVAL, Information
                 Search and Retrieval, Search process.",
}

@Article{Atkins:1988:ESD,
  author =       "M. Stella Atkins",
  title =        "Experiments in {SR} with Different Upcall Program
                 Structures",
  journal =      j-TOCS,
  volume =       "6",
  number =       "4",
  pages =        "365--392",
  month =        nov,
  year =         "1988",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1988-6-4/p365-atkins/",
  abstract =     "This paper explores program designs for layered
                 systems such as communication protocols and
                 server\slash client systems that do not exhibit a
                 strict hierarchy in their control flow. D. D. Clark
                 (1985) has proposed structuring such systems, where
                 both upward and downward control flow are required, to
                 use efficient synchronous procedure calls between the
                 layers whenever possible. The term upcall is used by
                 Clark to describe this synchronous upward communication
                 from server to client. Several techniques are possible
                 for structuring such programs using upcalls.
                 Comparisons are made by implementing a communication
                 protocol described by Clark in three different ways.
                 The first method implements all the protocol routines
                 in a single large module. The second method structures
                 the routines into modules occupying vertical slices of
                 the protocol layers, and the third method structures
                 the routines into modules corresponding to the protocol
                 layers. It is concluded that the vertically layered
                 protocol design is to be preferred unless there are
                 many shared variables between the send-side and
                 receive-side, as it is very efficient and provides the
                 best protection of clients from each other. The
                 horizontally layered design is the least efficient, but
                 it is the easiest to program.",
  acknowledgement = ack-nhfb,
  affiliation =  "Simon Fraser Univ",
  affiliationaddress = "Burnaby, BC, Can",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "Computer Networks --- Protocols; Computer Programming
                 --- Algorithms; Computer Software; design; Design;
                 languages; Layered Systems; performance; Server/Client
                 Systems; Upcall Program Structures",
  subject =      "{\bf D.1.3} Software, PROGRAMMING TECHNIQUES,
                 Concurrent Programming. {\bf D.3.3} Software,
                 PROGRAMMING LANGUAGES, Language Constructs and
                 Features, Concurrent programming structures. {\bf
                 D.3.2} Software, PROGRAMMING LANGUAGES, Language
                 Classifications, SR. {\bf C.2.2} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS, Network
                 Protocols. {\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems.
                 {\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management. {\bf D.4.4} Software, OPERATING SYSTEMS,
                 Communications Management, Buffering. {\bf D.4.7}
                 Software, OPERATING SYSTEMS, Organization and Design,
                 Hierarchical design**. {\bf D.4.8} Software, OPERATING
                 SYSTEMS, Performance, Measurements. {\bf D.4.8}
                 Software, OPERATING SYSTEMS, Performance, Simulation.",
}
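
%%% Atkins:1988:ESD compares program structures built around Clark's
%%% upcalls.  As a minimal illustration of the upcall idea itself (not of
%%% SR, and not of the three structures compared in the paper), the
%%% sketch below has a transport layer synchronously invoke a handler
%%% registered by the client layer, instead of queuing data for the
%%% client to pick up later; the class names are ours:
%%%
%%%     class TransportLayer:
%%%         def __init__(self):
%%%             self._receive_upcall = None
%%%
%%%         def register_receive(self, handler):
%%%             # The client hands a procedure "down"; the transport layer
%%%             # later calls it "up", synchronously -- hence "upcall".
%%%             self._receive_upcall = handler
%%%
%%%         def packet_arrived(self, payload):
%%%             if self._receive_upcall is not None:
%%%                 self._receive_upcall(payload)   # synchronous upward call
%%%
%%%     class ClientLayer:
%%%         def __init__(self, transport):
%%%             transport.register_receive(self.on_data)
%%%
%%%         def on_data(self, payload):
%%%             print("client received:", payload)
%%%
%%%     t = TransportLayer()
%%%     ClientLayer(t)
%%%     t.packet_arrived(b"hello")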

@Article{Agarwal:1988:CPO,
  author =       "Anant Agarwal and John Hennessy and Mark Horowitz",
  title =        "Cache Performance of Operating System and
                 Multiprogramming Workloads",
  journal =      j-TOCS,
  volume =       "6",
  number =       "4",
  pages =        "393--431",
  month =        nov,
  year =         "1988",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1988-6-4/p393-agarwal/",
  abstract =     "Large caches are necessary in current high-performance
                 computer systems to provide the required high memory
                 bandwidth. Because a small decrease in cache
                 performance can result in significant system
                 performance degradation, accurately characterizing the
                 performance of large caches is important. Although
                 measurements on actual systems have shown that
                 operating systems and multiprogramming can affect cache
                 performance, previous studies have not focused on these
                 effects. We have developed a program tracing technique
                 called ATUM (Address Tracing Using Microcode) that
                 captures realistic traces of multitasking workloads
                 including the operating system. Examining cache
                 behavior using these traces from a VAX processor shows
                 that both the operating system and multiprogramming
                 activity significantly degrade cache performance, with
                 an even greater proportional impact on large caches.
                 From a careful analysis of the causes of this
                 degradation, we explore various techniques to reduce
                 this loss. While seemingly little can be done to
                 mitigate the effect of system references, multitasking
                 cache miss activity can be substantially reduced with
                 small hardware additions.",
  acknowledgement = ack-nhfb,
  affiliation =  "Stanford Univ",
  affiliationaddress = "Stanford, CA, USA",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "Address Tracing Using Microcode; ATUM; Cache
                 Performance; Computer Operating Systems; Computer
                 Systems Programming --- Multiprogramming; design;
                 measurement; Multiprogramming Workloads; performance;
                 Performance; Program Tracing",
  subject =      "{\bf B.3.2} Hardware, MEMORY STRUCTURES, Design
                 Styles, Cache memories. {\bf B.3.2} Hardware, MEMORY
                 STRUCTURES, Design Styles, Associative memories. {\bf
                 B.3.2} Hardware, MEMORY STRUCTURES, Design Styles,
                 Virtual memory. {\bf B.3.3} Hardware, MEMORY
                 STRUCTURES, Performance Analysis and Design Aids**,
                 Formal models**. {\bf B.3.3} Hardware, MEMORY
                 STRUCTURES, Performance Analysis and Design Aids**,
                 Simulation**. {\bf C.4} Computer Systems Organization,
                 PERFORMANCE OF SYSTEMS, Design studies. {\bf C.4}
                 Computer Systems Organization, PERFORMANCE OF SYSTEMS,
                 Measurement techniques. {\bf C.4} Computer Systems
                 Organization, PERFORMANCE OF SYSTEMS, Modeling
                 techniques. {\bf D.4.1} Software, OPERATING SYSTEMS,
                 Process Management,
                 Multiprocessing/multiprogramming/multitasking. {\bf
                 D.4.8} Software, OPERATING SYSTEMS, Performance,
                 Measurements.",
}

@Article{Okamoto:1988:DMS,
  author =       "Tatsuaki Okamoto",
  title =        "A Digital Multisignature Scheme using Bijective
                 Public-Key Cryptosystems",
  journal =      j-TOCS,
  volume =       "6",
  number =       "4",
  pages =        "432--441",
  month =        nov,
  year =         "1988",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1988-6-4/p432-okamoto/",
  abstract =     "A new digital multisignature scheme using bijective
                 public-key cryptosystems that overcomes the problems of
                 previous signature schemes used for multisignatures is
                 proposed. The principal features of this scheme are (1)
                 the length of a multisignature message is nearly
                 equivalent to that for a single signature message; (2)
                 by using a one-way hash function, multisignature
                 generation and verification are processed in an
                 efficient manner; (3) the order of signing is not
                 restricted; and (4) this scheme can be constructed on
                 any bijective public-key cryptosystem as well as the
                 RSA scheme. In addition, it is shown that the new
                 scheme is considered as safe as the public-key
                 cryptosystem used in this new scheme. Some variations
                 based on the scheme are also presented.",
  acknowledgement = ack-nhfb,
  affiliation =  "NTT",
  affiliationaddress = "Yokosuka, Jpn",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "Bijective Public-Key Cryptosystems; Computer-Based
                 Message Systems; Cryptography; Digital Multisignature
                 Scheme; Electronic Mail; One-Way Hash Function;
                 security",
  subject =      "{\bf E.3} Data, DATA ENCRYPTION, Public key
                 cryptosystems.",
}

@Article{Borg:1989:FTU,
  author =       "Anita Borg and Wolfgang Blau and Wolfgang Graetsch and
                 Ferdinand Herrmann and Wolfgang Oberle",
  title =        "Fault Tolerance under {UNIX}",
  journal =      j-TOCS,
  volume =       "7",
  number =       "1",
  pages =        "1--24",
  month =        feb,
  year =         "1989",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1989-7-1/p1-borg/",
  abstract =     "The initial design for a distributed, fault-tolerant
                 version of UNIX based on three-way atomic message
                 transmission was presented in an earlier paper. This
                 paper describes the working system, now known as the
                 TARGON\slash 32. The original design left open
                 questions in at least two areas: fault tolerance for
                 server processes and recovery after a crash were
                 briefly and inaccurately sketched; rebackup after
                 recovery was not discussed at all. The fundamental
                 design involving three-way message transmission has
                 remained unchanged. However, server backup has been
                 redesigned and is now more consistent with that of
                 normal user processes. Recovery and rebackup have been
                 completed in a less centralized and thus more efficient
                 manner. We review important aspects of the original
                 design and note how the implementation differs from our
                 original ideas. We then focus on the backup and
                 recovery for server processes and the changes and
                 additions in the design and implementation of recovery
                 and rebackup.",
  acknowledgement = ack-nhfb,
  affiliation =  "Nixdorf Computer GmbH",
  affiliationaddress = "Paderborn, West Ger",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; Computer Architecture; Computer Operating
                 Systems; Computer Systems, Digital; Crash Handling;
                 Fault Tolerant Capability; Multiway Message
                 Transmission; reliability; Roll Forward Recovery;
                 Server Architecture; TARGON/32; UNIX",
  subject =      "{\bf D.4.0} Software, OPERATING SYSTEMS, General,
                 UNIX. {\bf D.4.5} Software, OPERATING SYSTEMS,
                 Reliability, Fault-tolerance. {\bf D.4.5} Software,
                 OPERATING SYSTEMS, Reliability, Backup procedures. {\bf
                 D.4.5} Software, OPERATING SYSTEMS, Reliability,
                 Checkpoint/restart. {\bf C.1.2} Computer Systems
                 Organization, PROCESSOR ARCHITECTURES, Multiple Data
                 Stream Architectures (Multiprocessors), Associative
                 processors. {\bf D.4.3} Software, OPERATING SYSTEMS,
                 File Systems Management. {\bf D.4.4} Software,
                 OPERATING SYSTEMS, Communications Management, Message
                 sending.",
}

@Article{Pittelli:1989:RST,
  author =       "Frank M. Pittelli and H{\'e}ctor Garc{\'\i}a-Molina",
  title =        "Reliable Scheduling in a {TMR} Database System",
  journal =      j-TOCS,
  volume =       "7",
  number =       "1",
  pages =        "25--60",
  month =        feb,
  year =         "1989",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1989-7-1/p25-pittelli/",
  abstract =     "A Triple Modular Redundant (TMR) system achieves high
                 reliability by replicating data and all processing at
                 three independent nodes. When TMR is used for database
                 processing all nonfaulty computers must execute the
                 same sequence of transactions, and this is ensured by a
                 collection of processes known as schedulers. In this
                 paper we study the implementation of efficient
                 schedulers through analysis of various enhancements
                 such as null transactions and message batching. The
                 schedulers have been implemented in an experimental TMR
                 system and the evaluation results are presented here.",
  acknowledgement = ack-nhfb,
  affiliation =  "US Naval Acad",
  affiliationaddress = "USA",
  classification = "723; 913",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; Database Systems; design; Distributed;
                 Message Batching; Null Transactions; performance;
                 reliability; Reliability; Reliable Scheduling;
                 Scheduling; Transaction Processing; Triple Modular
                 Redundancy",
  subject =      "{\bf C.4} Computer Systems Organization, PERFORMANCE
                 OF SYSTEMS, Reliability, availability, and
                 serviceability. {\bf H.2.0} Information Systems,
                 DATABASE MANAGEMENT, General. {\bf D.4.1} Software,
                 OPERATING SYSTEMS, Process Management, Scheduling.",
}

@Article{Raymond:1989:TBA,
  author =       "Kerry Raymond",
  title =        "A Tree-Based Algorithm for Distributed Mutual
                 Exclusion",
  journal =      j-TOCS,
  volume =       "7",
  number =       "1",
  pages =        "61--77",
  month =        feb,
  year =         "1989",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1989-7-1/p61-raymond/",
  abstract =     "We present an algorithm for distributed mutual
                 exclusion in a computer network of N nodes that
                 communicate by messages rather than shared memory. The
                 algorithm uses a spanning tree of the computer network,
                 and the number of messages exchanged per critical
                 section depends on the topology of this tree. However,
                 typically the number of messages exchanged is O(log N)
                 under light demand, and reduces to approximately four
                 messages under saturated demand. Each node holds
                 information only about its immediate neighbors in the
                 spanning tree rather than information about all nodes,
                 and failed nodes can recover necessary information from
                 their neighbors. The algorithm does not require
                 sequence numbers as it operates correctly despite
                 message overtaking.",
  acknowledgement = ack-nhfb,
  affiliation =  "Univ of Queensland",
  affiliationaddress = "St. Lucia, Aust",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; Computer Networks; Computer
                 Programming--Algorithms; Computer Systems, Digital;
                 design; Distributed; Distributed Mutual Exclusion;
                 Mathematical Techniques--Trees; Message Passing; Tree
                 Based Algorithms",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Mutual exclusion. {\bf D.4.1} Software,
                 OPERATING SYSTEMS, Process Management, Synchronization.
                 {\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems.
                 {\bf D.4.4} Software, OPERATING SYSTEMS, Communications
                 Management, Message sending.",
}
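
%%% Raymond:1989:TBA passes a single token over a spanning tree; each
%%% node keeps only a pointer toward the token and a FIFO of pending
%%% requests, so a request costs about twice the tree distance in
%%% messages.  The sketch below is a minimal single-process illustration
%%% of that idea under simplifying assumptions (reliable, in-order
%%% delivery, no failures); class and message names are ours, and the
%%% code is not the published algorithm in full:
%%%
%%%     from collections import deque
%%%
%%%     class Node:
%%%         def __init__(self, ident, holder, net):
%%%             self.id = ident
%%%             self.holder = holder    # neighbour toward token, or self.id
%%%             self.queue = deque()    # pending requesters
%%%             self.asked = False      # request already sent toward holder?
%%%             self.using = False      # inside the critical section?
%%%             self.net = net
%%%
%%%         def request_cs(self):
%%%             self.queue.append(self.id)
%%%             self._assign(); self._ask()
%%%
%%%         def release_cs(self):
%%%             self.using = False
%%%             self._assign(); self._ask()
%%%
%%%         def on_request(self, from_id):
%%%             self.queue.append(from_id)
%%%             self._assign(); self._ask()
%%%
%%%         def on_token(self):
%%%             self.holder = self.id
%%%             self._assign(); self._ask()
%%%
%%%         def _assign(self):
%%%             # Holding the token and idle: serve the head of the queue,
%%%             # either by entering the CS or by passing the token on.
%%%             if self.holder == self.id and not self.using and self.queue:
%%%                 head = self.queue.popleft()
%%%                 if head == self.id:
%%%                     self.using = True
%%%                 else:
%%%                     self.holder, self.asked = head, False
%%%                     self.net.send(self.id, head, "TOKEN")
%%%
%%%         def _ask(self):
%%%             # Token elsewhere and waiters queued: keep exactly one
%%%             # REQUEST outstanding toward the current holder.
%%%             if self.holder != self.id and self.queue and not self.asked:
%%%                 self.asked = True
%%%                 self.net.send(self.id, self.holder, "REQUEST")
%%%
%%%     class Network:
%%%         def __init__(self):
%%%             self.nodes, self.pending, self.count = {}, deque(), 0
%%%
%%%         def send(self, src, dst, kind):
%%%             self.count += 1
%%%             self.pending.append((src, dst, kind))
%%%
%%%         def run(self):
%%%             while self.pending:
%%%                 src, dst, kind = self.pending.popleft()
%%%                 if kind == "REQUEST":
%%%                     self.nodes[dst].on_request(src)
%%%                 else:
%%%                     self.nodes[dst].on_token()
%%%
%%%     # A path 0-1-2-3; node 0 starts with the token, node 3 wants the CS.
%%%     net = Network()
%%%     net.nodes = {i: Node(i, holder=max(i - 1, 0), net=net)
%%%                  for i in range(4)}
%%%     net.nodes[3].request_cs()
%%%     net.run()
%%%     print("messages per entry:", net.count)   # 3 REQUESTs + 3 TOKENs = 6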

@Article{Thompson:1989:ESA,
  author =       "James G. Thompson and Alan Jay Smith",
  title =        "Efficient (Stack) Algorithms for Analysis of
                 Write-Back and Sector Memories",
  journal =      j-TOCS,
  volume =       "7",
  number =       "1",
  pages =        "78--117",
  month =        feb,
  year =         "1989",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1989-7-1/p78-thompson/",
  abstract =     "For the class of replacement algorithms known as stack
                 algorithms, existing analysis techniques permit the
                 computation of memory miss ratios for all memory sizes
                 simultaneously in one pass over a memory reference
                 string. We extend the class of computations possible by
                 this methodology in two ways. First, we show how to
                 compute the effects of copy-backs in write-back caches.
                 The key observation here is that a given block is clean
                 for all memory sizes less than or equal to C blocks and
                 is dirty for all larger memory sizes. Our technique
                 permits efficient computations for algorithms or
                 systems using periodic write-back and\slash or block
                 deletion. The second extension permits stack analysis
                 simulation for sector (or subblock) caches in which a
                 sector (associated with an address tag) consists of
                 subsectors (or subblocks) that can be loaded
                 independently. The key observation here is that a
                 subsector is present only in caches of size C or
                 greater. Load forward prefetching in a sector cache is
                 shown to be a stack algorithm and is easily simulated
                 using our technique. Running times for our methods are
                 only slightly higher than for a simulation of a single
                 memory size using nonstack techniques.",
  acknowledgement = ack-nhfb,
  affiliation =  "US Air Force",
  affiliationaddress = "USA",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; Cache Memories; Computer
                 Programming--Algorithms; Data Storage, Digital; design;
                 experimentation; measurement; Memory System
                 Performance; performance; Performance; Replacement
                 Algorithms; Sector Memories; Stack Algorithms; theory;
                 Write Back Memories",
  subject =      "{\bf B.3.2} Hardware, MEMORY STRUCTURES, Design
                 Styles. {\bf B.6.1} Hardware, LOGIC DESIGN, Design
                 Styles, Memory control and access**. {\bf B.3.3}
                 Hardware, MEMORY STRUCTURES, Performance Analysis and
                 Design Aids**, Simulation**. {\bf D.4.8} Software,
                 OPERATING SYSTEMS, Performance.",
}
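
%%% The starting point of Thompson:1989:ESA is the classical property of
%%% stack algorithms: one pass over a reference trace, recording each
%%% reference's LRU stack distance, yields miss ratios for every cache
%%% size at once.  The sketch below shows only that baseline idea for a
%%% fully associative LRU cache with unit-size blocks (our simplifying
%%% assumptions); the paper's write-back and sector-cache extensions are
%%% not reproduced:
%%%
%%%     from collections import Counter
%%%
%%%     def stack_distances(trace):
%%%         """LRU stack distance of each reference (inf on first touch)."""
%%%         stack = []                      # most recently used block first
%%%         for block in trace:
%%%             if block in stack:
%%%                 depth = stack.index(block) + 1   # 1-based distance
%%%                 stack.remove(block)
%%%             else:
%%%                 depth = float("inf")             # cold miss
%%%             stack.insert(0, block)
%%%             yield depth
%%%
%%%     def miss_ratios(trace, sizes):
%%%         """Miss ratio for every cache size in `sizes`, from one pass."""
%%%         hist = Counter(stack_distances(trace))
%%%         n = len(trace)
%%%         # A reference misses in a C-block cache iff its distance > C.
%%%         return {c: sum(k for d, k in hist.items() if d > c) / n
%%%                 for c in sizes}
%%%
%%%     trace = ["a", "b", "c", "a", "b", "d", "a", "c"]
%%%     print(miss_ratios(trace, sizes=[1, 2, 3, 4]))
%%%     # {1: 1.0, 2: 1.0, 3: 0.625, 4: 0.5}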

@Article{Gupta:1989:HSI,
  author =       "Anoop Gupta and Charles Forgy and Allen Newell",
  title =        "High-speed Implementations of Rule-Based Systems",
  journal =      j-TOCS,
  volume =       "7",
  number =       "2",
  pages =        "119--146",
  month =        may,
  year =         "1989",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1989-7-2/p119-gupta/",
  abstract =     "We explore various methods for speeding up the
                 execution of rule-based systems. In particular, we
                 examine the role of parallelism in the high-speed
                 execution of rule-based systems and study the
                 architectural issues in the design of computers for
                 rule-based systems. Our results show that contrary to
                 initial expectations, the speedup that can be obtained
                 from parallelism is quite limited, only about tenfold.
                 The reasons for the small speed-up are: (1) the small
                 number of rules relevant to each change to data memory;
                 (2) the large variation in the processing requirements
                 of relevant rules; and (3) the small number of changes
                 made to data memory between synchronization steps.
                 Furthermore, we observe that to obtain this limited
                 factor of tenfold speed-up, it is necessary to exploit
                 parallelism at a very fine granularity. We propose that
                 a suitable architecture to exploit such fine-grain
                 parallelism is a shared-memory multiprocessor with
                 32-64 processors.",
  acknowledgement = ack-nhfb,
  affiliation =  "Stanford Univ",
  affiliationaddress = "Stanford, CA, USA",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; Artificial Intelligence; Computer
                 Architecture; Computer Systems, Digital--Parallel
                 Processing; design; languages; performance; Production
                 Systems; Rule Based Systems; Shared Memory
                 Multiprocessors; Speedup",
  subject =      "{\bf I.2.5} Computing Methodologies, ARTIFICIAL
                 INTELLIGENCE, Programming Languages and Software. {\bf
                 I.2.5} Computing Methodologies, ARTIFICIAL
                 INTELLIGENCE, Programming Languages and Software, OPS5.
                 {\bf D.1.3} Software, PROGRAMMING TECHNIQUES,
                 Concurrent Programming. {\bf C.1.2} Computer Systems
                 Organization, PROCESSOR ARCHITECTURES, Multiple Data
                 Stream Architectures (Multiprocessors),
                 Multiple-instruction-stream, multiple-data-stream
                 processors (MIMD). {\bf G.1.0} Mathematics of
                 Computing, NUMERICAL ANALYSIS, General, Parallel
                 algorithms.",
}
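
%%% A toy calculation (ours, not the paper's data) of why rule-level
%%% parallelism saturates, per reason (2) in the Gupta:1989:HSI abstract:
%%% when only a few rules are affected per recognize-act cycle and their
%%% match costs vary widely, the cycle time is bounded by the costliest
%%% affected rule, no matter how many processors are available:
%%%
%%%     # Hypothetical match costs for the rules affected in one cycle.
%%%     affected_rule_costs = [1.0, 1.2, 0.8, 9.5, 1.1]
%%%     sequential = sum(affected_rule_costs)
%%%     parallel = max(affected_rule_costs)   # one processor per rule
%%%     print(f"best-case speedup this cycle: {sequential / parallel:.1f}x")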

@Article{Cheriton:1989:DGN,
  author =       "David R. Cheriton and Timothy P. Mann",
  title =        "Decentralizing a Global Naming Service for Improved
                 Performance and Fault Tolerance",
  journal =      j-TOCS,
  volume =       "7",
  number =       "2",
  pages =        "147--183",
  month =        may,
  year =         "1989",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1989-7-2/p147-cheriton/",
  abstract =     "We address the problem of a global naming system,
                 proposing a three-level naming architecture that
                 consists of global, administrational, and managerial
                 naming mechanisms, each optimized to meet the
                 performance, reliability, and security requirements at
                 its own level. We focus in particular on a
                 decentralized approach to the lower levels, in which
                 naming is handled directly by the managers of the named
                 objects. Client-name caching and multicast are
                 exploited to implement name mapping with almost optimum
                 performance and fault tolerance. We also show how the
                 naming system can be made secure. Our conclusions are
                 bolstered by experience with an implementation in the V
                 distributed operating system.",
  acknowledgement = ack-nhfb,
  affiliation =  "Stanford Univ",
  affiliationaddress = "Stanford, CA, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "Computer Fault Tolerance; Computer Operating Systems;
                 Computer Systems, Digital; design; Distributed;
                 Distributed File Systems; experimentation; Global
                 Naming Service; measurement; performance; reliability",
  subject =      "{\bf D.4.3} Software, OPERATING SYSTEMS, File Systems
                 Management, Distributed file systems. {\bf C.2.4}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Distributed Systems. {\bf D.4.6} Software,
                 OPERATING SYSTEMS, Security and Protection.",
}
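
%%% Cheriton:1989:DGN pushes name mapping down to the managers of the
%%% named objects and relies on client caching with multicast as the
%%% fallback.  The sketch below illustrates that lookup path only; the
%%% class names and the cache-then-multicast policy as written are our
%%% assumptions, not the V system's interfaces:
%%%
%%%     class Manager:
%%%         def __init__(self, names):
%%%             self.names = set(names)
%%%
%%%         def implements(self, name):
%%%             return name in self.names
%%%
%%%     class NameClient:
%%%         def __init__(self, managers):
%%%             self.cache = {}            # name -> manager last seen serving it
%%%             self.managers = managers   # stand-in for a multicast group
%%%
%%%         def resolve(self, name):
%%%             mgr = self.cache.get(name)
%%%             if mgr is not None and mgr.implements(name):
%%%                 return mgr             # cache hit: no network traffic
%%%             for mgr in self.managers:  # cache miss: "multicast" the query
%%%                 if mgr.implements(name):
%%%                     self.cache[name] = mgr
%%%                     return mgr
%%%             raise KeyError(name)
%%%
%%%     client = NameClient([Manager({"alpha"}), Manager({"beta"})])
%%%     print(client.resolve("beta") is client.resolve("beta"))   # True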

@Article{Agarwal:1989:ACM,
  author =       "Anant Agarwal and Mark Horowitz and John Hennessy",
  title =        "An Analytical Cache Model",
  journal =      j-TOCS,
  volume =       "7",
  number =       "2",
  pages =        "184--215",
  month =        may,
  year =         "1989",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1989-7-2/p184-agarwal/",
  abstract =     "Trace-driven simulation and hardware measurement are
                 the techniques most often used to obtain accurate
                 performance figures for caches. The former requires a
                 large amount of simulation time to evaluate each cache
                 configuration while the latter is restricted to
                 measurements of existing caches. An analytical cache
                 model that uses parameters extracted from address
                 traces of programs can efficiently provide estimates of
                 cache performance and show the effects of varying cache
                 parameters. By representing the factors that affect
                 cache performance, we develop an analytical model that
                 gives miss rates for a given trace as a function of
                 cache size, degree of associativity, block size,
                 subblock size, multiprogramming level, task switch
                 interval, and observation interval. The predicted
                 values closely approximate the results of trace-driven
                 simulations, while requiring only a small fraction of
                 the computation cost.",
  acknowledgement = ack-nhfb,
  affiliation =  "Stanford Univ",
  affiliationaddress = "Stanford, CA, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "Cache Miss Rate; Cache Models; Computer Architecture;
                 Data Storage Units; design; measurement; Memory
                 Structures; performance; theory; Trace Driven
                 Simulation",
  subject =      "{\bf B.3.2} Hardware, MEMORY STRUCTURES, Design
                 Styles, Cache memories. {\bf B.3.3} Hardware, MEMORY
                 STRUCTURES, Performance Analysis and Design Aids**,
                 Formal models**. {\bf B.3.3} Hardware, MEMORY
                 STRUCTURES, Performance Analysis and Design Aids**,
                 Simulation**. {\bf D.4.1} Software, OPERATING SYSTEMS,
                 Process Management,
                 Multiprocessing/multiprogramming/multitasking.",
}

@Article{Peterson:1989:PUC,
  author =       "Larry L. Peterson and Nick C. Buchholz and Richard D.
                 Schlichting",
  title =        "Preserving and Using Context Information in
                 Interprocess Communication",
  journal =      j-TOCS,
  volume =       "7",
  number =       "3",
  pages =        "217--246",
  month =        aug,
  year =         "1989",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1989-7-3/p217-peterson/",
  abstract =     "When processes in a network communicate, the messages
                 they exchange define a partial ordering of externally
                 visible events. While the significance of this partial
                 order in distributed computing is well understood, it
                 has not been made an explicit part of the communication
                 substrate upon which distributed programs are
                 implemented. This paper describes a new interprocess
                 communication mechanism, called Psync, that explicitly
                 encodes this partial ordering with each message. The
                 paper shows how Psync can be efficiently implemented on
                 an unreliable communications network, and it
                 demonstrates how conversations serve as an elegant
                 foundation for ordering messages exchanged in a
                 distributed computation and for recovering from
                 processor failures.",
  acknowledgement = ack-nhfb,
  affiliation =  "Univ of Arizona",
  affiliationaddress = "Tucson, AZ, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "Computer Fault Tolerance; Computer
                 Programming--Algorithms; Computer Systems, Digital;
                 Context Information; Database Systems--Distributed;
                 design; Distributed; Interprocess Communication;
                 Partial Ordering; performance; Psync Protocol;
                 reliability",
  subject =      "{\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols,
                 Psync. {\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems.
                 {\bf D.4.4} Software, OPERATING SYSTEMS, Communications
                 Management, Network communication. {\bf C.4} Computer
                 Systems Organization, PERFORMANCE OF SYSTEMS, Design
                 studies.",
}
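
%%% The central idea in Peterson:1989:PUC is that every message carries
%%% its context -- the messages it was sent in response to -- so all
%%% participants can rebuild the same partial order.  The sketch below is
%%% a toy, centralized illustration of such a context graph (names and
%%% interfaces are ours; Psync's wire format and conversation API are not
%%% reproduced):
%%%
%%%     class View:
%%%         def __init__(self):
%%%             self.deps = {}      # message id -> ids it directly follows
%%%             self._next = 0
%%%
%%%         def send(self, sender):
%%%             # Context = current leaves: messages nothing else follows yet.
%%%             followed = {d for ds in self.deps.values() for d in ds}
%%%             leaves = frozenset(self.deps) - followed
%%%             mid = f"{sender}:{self._next}"
%%%             self._next += 1
%%%             self.deps[mid] = leaves
%%%             return mid
%%%
%%%         def precedes(self, a, b):
%%%             # a precedes b iff a is reachable from b via context links.
%%%             frontier, seen = set(self.deps[b]), set()
%%%             while frontier:
%%%                 m = frontier.pop()
%%%                 if m == a:
%%%                     return True
%%%                 if m not in seen:
%%%                     seen.add(m)
%%%                     frontier |= set(self.deps[m])
%%%             return False
%%%
%%%     v = View()
%%%     m1 = v.send("p")            # first message: empty context
%%%     m2 = v.send("q")            # follows m1
%%%     m3 = v.send("r")            # follows m2, hence (transitively) m1
%%%     print(v.precedes(m1, m3), v.precedes(m3, m1))   # True False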

@Article{Satyanarayanan:1989:ISL,
  author =       "M. Satyanarayanan",
  title =        "Integrating Security in a Large Distributed System",
  journal =      j-TOCS,
  volume =       "7",
  number =       "3",
  pages =        "247--280",
  month =        aug,
  year =         "1989",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1989-7-3/p247-satyanarayanan/",
  abstract =     "Andrew is a distributed computing environment that is
                 a synthesis of the personal computing and timesharing
                 paradigms. When mature, it is expected to encompass
                 over 5,000 workstations spanning the Carnegie Mellon
                 University campus. This paper examines the security
                 issues that arise in such an environment and describes
                 the mechanisms that have been developed to address
                 them. These mechanisms include the logical and physical
                 separation of servers and clients, support for secure
                 communication at the remote procedure call level, a
                 distributed authentication service, a file-protection
                 scheme that combines access lists with UNIX mode bits,
                 and the use of encryption as a basic building block.
                 The paper also discusses the assumptions underlying
                 security in Andrew and analyzes the vulnerability of
                 the system. Usage experience reveals that resource
                 control, particularly of workstation CPU cycles, is
                 more important than originally anticipated and that the
                 mechanisms available to address this issue are
                 rudimentary.",
  acknowledgement = ack-nhfb,
  affiliation =  "Carnegie Mellon Univ",
  affiliationaddress = "Pittsburgh, PA, USA",
  classification = "722; 723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "algorithms; Andrew Distributed Computing Environment;
                 Computer Security; Computer Systems, Digital;
                 Computers, Personal; Cryptography; design; Distributed;
                 security; Time Sharing",
  subject =      "{\bf D.4.6} Software, OPERATING SYSTEMS, Security and
                 Protection. {\bf C.0} Computer Systems Organization,
                 GENERAL, Andrew. {\bf D.4.3} Software, OPERATING
                 SYSTEMS, File Systems Management, Distributed file
                 systems. {\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems.
                 {\bf E.3} Data, DATA ENCRYPTION, Data encryption
                 standard (DES)**.",
}

@Article{Shankar:1989:VDT,
  author =       "A. Udaya Shankar",
  title =        "Verified Data Transfer Protocols with Variable Flow
                 Control",
  journal =      j-TOCS,
  volume =       "7",
  number =       "3",
  pages =        "281--316",
  month =        aug,
  year =         "1989",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1989-7-3/p281-shankar/",
  abstract =     "We present and verify a sliding window protocol which
                 uses modulo-N sequence numbers to achieve reliable
                 flow-controlled data transfer between a producer and a
                 consumer connected by unreliable channels. The
                 consumer's data needs are represented by a receive
                 window whose size can vary with time. The producer
                 entity sends segments of data words that lie within the
                 consumer's receive window. The consumer entity sends
                 acknowledgement, selective acknowledgement, and
                 selective reject messages that inform the producer
                 entity of the current receive window size, the data
                 word next expected, and the reception (or lack of
                 reception) of out-of-sequence data segments. Our
                 protocol is, therefore, a proper extension of existing
                 transport and data link protocol standards such as TCP,
                 ISO TP, HDLC, ADCCP, and so forth.",
  acknowledgement = ack-nhfb,
  affiliation =  "Univ of Maryland",
  affiliationaddress = "College Park, MD, USA",
  classification = "723",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  journalabr =   "ACM Trans Comput Syst",
  keywords =     "Computer Networks--Protocols; Data Transfer Protocols;
                 Data Transmission; design; Reliability; Sliding Window
                 Protocol; theory; Variable Flow Control; verification",
  subject =      "{\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols,
                 Protocol verification. {\bf C.3} Computer Systems
                 Organization, SPECIAL-PURPOSE AND APPLICATION-BASED
                 SYSTEMS, Real-time and embedded systems. {\bf D.4.8}
                 Software, OPERATING SYSTEMS, Performance, Modeling and
                 prediction. {\bf F.3.1} Theory of Computation, LOGICS
                 AND MEANINGS OF PROGRAMS, Specifying and Verifying and
                 Reasoning about Programs. {\bf D.4.4} Software,
                 OPERATING SYSTEMS, Communications Management.",
}
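
%%% The mechanism in Shankar:1989:VDT rests on modulo-N sequence numbers
%%% and a receive window whose size can change over time.  The sketch
%%% below shows only the modulo-N window-membership test such a protocol
%%% needs (the full protocol, its selective acknowledgements, and its
%%% verification are not reproduced; real protocols also constrain the
%%% window size relative to N to avoid ambiguity):
%%%
%%%     N = 8                       # sequence numbers 0..7 on the wire
%%%
%%%     def in_window(s, next_expected, w):
%%%         """True iff s lies in [next_expected, next_expected + w) mod N."""
%%%         assert 0 <= w <= N      # the receive window may grow or shrink
%%%         return (s - next_expected) % N < w
%%%
%%%     # Receive window currently starts at 6 with size 3: accepts 6, 7, 0.
%%%     print([s for s in range(N) if in_window(s, next_expected=6, w=3)])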

@Article{Li:1989:MCS,
  author =       "Kai Li and Paul Hudak",
  title =        "Memory Coherence in Shared Virtual Memory Systems",
  journal =      j-TOCS,
  volume =       "7",
  number =       "4",
  pages =        "321--359",
  month =        nov,
  year =         "1989",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1989-7-4/p321-li/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; experimentation; measurement;
                 performance",
  subject =      "{\bf B.3.2} Hardware, MEMORY STRUCTURES, Design
                 Styles, Shared memory. {\bf C.1.2} Computer Systems
                 Organization, PROCESSOR ARCHITECTURES, Multiple Data
                 Stream Architectures (Multiprocessors), Interconnection
                 architectures. {\bf B.3.2} Hardware, MEMORY STRUCTURES,
                 Design Styles, Virtual memory. {\bf C.2.4} Computer
                 Systems Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems, Distributed applications.",
}

@Article{Ng:1989:UHI,
  author =       "Tony P. Ng",
  title =        "Using Histories to Implement Atomic Objects",
  journal =      j-TOCS,
  volume =       "7",
  number =       "4",
  pages =        "360--393",
  month =        nov,
  year =         "1989",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1989-7-4/p360-ng/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design",
  subject =      "{\bf H.2.4} Information Systems, DATABASE MANAGEMENT,
                 Systems, Concurrency. {\bf H.2.4} Information Systems,
                 DATABASE MANAGEMENT, Systems, Distributed databases.
                 {\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols.
                 {\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Synchronization. {\bf D.4.1} Software,
                 OPERATING SYSTEMS, Process Management, Concurrency.
                 {\bf D.4.7} Software, OPERATING SYSTEMS, Organization
                 and Design, Distributed systems.",
}

@Article{Barbara:1989:IAU,
  author =       "Daniel Barbara and H{\'e}ctor Garc{\'\i}a-Molina and
                 Annemarie Spauster",
  title =        "Increasing Availability under Mutual Exclusion
                 Constraints with Dynamic Vote Reassignment",
  journal =      j-TOCS,
  volume =       "7",
  number =       "4",
  pages =        "394--426",
  month =        nov,
  year =         "1989",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1989-7-4/p394-barbara/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; performance; reliability",
  subject =      "{\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Network operating systems. {\bf D.4.1} Software,
                 OPERATING SYSTEMS, Process Management, Mutual
                 exclusion. {\bf D.4.5} Software, OPERATING SYSTEMS,
                 Reliability.",
}

@Article{Schroeder:1990:PFR,
  author =       "Michael D. Schroeder and Michael Burrows",
  title =        "Performance of the {Firefly RPC}",
  journal =      j-TOCS,
  volume =       "8",
  number =       "1",
  pages =        "1--17",
  month =        feb,
  year =         "1990",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1990-8-1/p1-schroeder/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "measurement; performance",
  subject =      "{\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Network operating systems. {\bf C.1.2} Computer Systems
                 Organization, PROCESSOR ARCHITECTURES, Multiple Data
                 Stream Architectures (Multiprocessors). {\bf C.4}
                 Computer Systems Organization, PERFORMANCE OF SYSTEMS,
                 Measurement techniques. {\bf D.4.8} Software, OPERATING
                 SYSTEMS, Performance, Measurements.",
}

@Article{Burrows:1990:LA,
  author =       "Michael Burrows and Martin Abadi and Roger Needham",
  title =        "A Logic of Authentication",
  journal =      j-TOCS,
  volume =       "8",
  number =       "1",
  pages =        "18--36",
  month =        feb,
  year =         "1990",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1990-8-1/p18-burrows/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "security; theory; verification",
  subject =      "{\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols,
                 Protocol verification. {\bf C.2.0} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS, General,
                 Security and protection (e.g., firewalls). {\bf D.4.6}
                 Software, OPERATING SYSTEMS, Security and Protection,
                 Authentication. {\bf F.3.1} Theory of Computation,
                 LOGICS AND MEANINGS OF PROGRAMS, Specifying and
                 Verifying and Reasoning about Programs. {\bf D.4.6}
                 Software, OPERATING SYSTEMS, Security and Protection,
                 Cryptographic controls.",
}

@Article{Bershad:1990:LRP,
  author =       "Brian N. Bershad and Thomas E. Anderson and Edward D.
                 Lazowska and Henry M. Levy",
  title =        "Lightweight Remote Procedure Call",
  journal =      j-TOCS,
  volume =       "8",
  number =       "1",
  pages =        "37--55",
  month =        feb,
  year =         "1990",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1990-8-1/p37-bershad/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; measurement; performance; security",
  subject =      "{\bf D.4.4} Software, OPERATING SYSTEMS,
                 Communications Management. {\bf D.4.7} Software,
                 OPERATING SYSTEMS, Organization and Design, Distributed
                 systems. {\bf C.1.3} Computer Systems Organization,
                 PROCESSOR ARCHITECTURES, Other Architecture Styles,
                 Capability architectures**. {\bf D.4.6} Software,
                 OPERATING SYSTEMS, Security and Protection, Security
                 kernels**. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance, Measurements.",
}

@Article{Anderson:1990:SCM,
  author =       "David P. Anderson and Ron Kuivila",
  title =        "A System for Computer Music Performance",
  journal =      j-TOCS,
  volume =       "8",
  number =       "1",
  pages =        "56--82",
  month =        feb,
  year =         "1990",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1990-8-1/p56-anderson/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; experimentation; human factors;
                 languages; performance",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Scheduling. {\bf D.4.7} Software, OPERATING
                 SYSTEMS, Organization and Design, Interactive systems.
                 {\bf D.4.7} Software, OPERATING SYSTEMS, Organization
                 and Design, Real-time systems and embedded systems.
                 {\bf D.4.4} Software, OPERATING SYSTEMS, Communications
                 Management, Input/output.",
}

@Article{Deering:1990:MRD,
  author =       "Stephen E. Deering and David R. Cheriton",
  title =        "Multicast Routing in Datagram Internetworks and
                 Extended {LANs}",
  journal =      j-TOCS,
  volume =       "8",
  number =       "2",
  pages =        "85--110",
  month =        may,
  year =         "1990",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1990-8-2/p85-deering/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; performance",
  subject =      "{\bf C.2.1} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Architecture
                 and Design, Network communications. {\bf C.2.5}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Local and Wide-Area Networks. {\bf C.2.2}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Network Protocols, Protocol architecture.
                 {\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems.",
}

@Article{Schwan:1990:TDO,
  author =       "Karsten Schwan and Win Bo",
  title =        "``Topologies'' --- Distributed Objects on
                 Multicomputers",
  journal =      j-TOCS,
  volume =       "8",
  number =       "2",
  pages =        "111--157",
  month =        may,
  year =         "1990",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1990-8-2/p111-schwan/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; measurement; performance",
  subject =      "{\bf D.1.3} Software, PROGRAMMING TECHNIQUES,
                 Concurrent Programming, Parallel programming. {\bf
                 G.1.0} Mathematics of Computing, NUMERICAL ANALYSIS,
                 General, Parallel algorithms. {\bf C.1.2} Computer
                 Systems Organization, PROCESSOR ARCHITECTURES, Multiple
                 Data Stream Architectures (Multiprocessors), Parallel
                 processors**. {\bf D.4.4} Software, OPERATING SYSTEMS,
                 Communications Management, Message sending.",
}

@Article{Ramakrishnan:1990:BFS,
  author =       "K. K. Ramakrishnan and R. Jain",
  title =        "A Binary Feedback Scheme for Congestion Avoidance in
                 Computer Networks",
  journal =      j-TOCS,
  volume =       "8",
  number =       "2",
  pages =        "158--181",
  month =        may,
  year =         "1990",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1990-8-2/p158-ramakrishnan/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; performance",
  subject =      "{\bf C.2.1} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Architecture
                 and Design, Network communications. {\bf C.2.3}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Network Operations, Network monitoring. {\bf
                 C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols.",
}

@Article{Benson:1990:FPM,
  author =       "Glenn S. Benson and Ian F. Akyildiz and William F.
                 Aelbe",
  title =        "A Formal Protection Model of Security in Centralized,
                 Parallel, and Distributed Systems",
  journal =      j-TOCS,
  volume =       "8",
  number =       "3",
  pages =        "183--213",
  month =        aug,
  year =         "1990",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1990-8-3/p183-benson/",
  abstract =     "One way to show that a system is not secure is to
                 demonstrate that a malicious or mistake-prone user or
                 program can break security by causing the system to
                 reach a nonsecure state. A fundamental aspect of a
                 security model is a proof that validates that every
                 state reachable from a secure initial state is secure.
                 A sequential security model assumes that every command
                 that acts as a state transition executes sequentially,
                 while a concurrent security model assumes that multiple
                 commands execute concurrently. This paper presents a
                 security model called the
                 Centralized-Parallel-Distributed model (CPD model) that
                  defines security for logically or physically
                 centralized, parallel, and distributed systems. The
                 purpose of the CPD model is to define concurrency
                 conditions that guarantee that a concurrent system
                 cannot reach a state in which privileges are configured
                 in a nonsecure manner. As an example, the conditions
                 are used to construct a representation of a distributed
                 system.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; security; theory; verification",
  subject =      "{\bf C.2.0} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, General, Security and
                 protection (e.g., firewalls). {\bf C.1.2} Computer
                 Systems Organization, PROCESSOR ARCHITECTURES, Multiple
                 Data Stream Architectures (Multiprocessors), Parallel
                 processors**. {\bf C.2.4} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems. {\bf D.4.1} Software, OPERATING
                 SYSTEMS, Process Management, Concurrency. {\bf D.4.1}
                 Software, OPERATING SYSTEMS, Process Management,
                 Scheduling. {\bf F.3.1} Theory of Computation, LOGICS
                 AND MEANINGS OF PROGRAMS, Specifying and Verifying and
                 Reasoning about Programs. {\bf D.4.6} Software,
                 OPERATING SYSTEMS, Security and Protection, Access
                 controls.",
}

@Article{King:1990:DAM,
  author =       "Richard P. King",
  title =        "Disk Arm Movement in Anticipation of Future Requests",
  journal =      j-TOCS,
  volume =       "8",
  number =       "3",
  pages =        "214--229",
  month =        aug,
  year =         "1990",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1990-8-3/p214-king/",
  abstract =     "When a disk drive's access arm is idle, it may not be
                 at the ideal location. In anticipation of future
                 requests, movement to some other location may be
                 advantageous. The effectiveness of anticipatory disk
                 arm movement is explored. Various operating conditions
                 are considered, and the reduction in seek distances and
                 request response times is determined for them. Suppose
                 that successive requests are independent and uniformly
                 distributed. By bringing the arm to the middle of its
                 range of motion when it is idle, the expected seek
                 distance can be reduced by 25 percent. Nonlinearity in
                 time versus distance can whittle that 25 percent
                 reduction down to a 13 percent reduction in seek time.
                  Nonuniformity in request location, non-Poisson arrival
                 processes, and high arrival rates can whittle the
                 reduction down to nothing. However, techniques are
                 discussed that maximize those savings that are still
                 possible under those circumstances. Various systems
                 with multiple arms are analyzed. Usually, it is best to
                  spread out the arms over the disk area. In some cases,
                  though, both arms should be brought to the middle.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; performance",
  subject =      "{\bf D.4.2} Software, OPERATING SYSTEMS, Storage
                 Management, Secondary storage. {\bf D.4.8} Software,
                 OPERATING SYSTEMS, Performance, Modeling and
                 prediction. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance, Simulation.",
}
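
%%% Editorial sketch (not part of the ACM record): the 25 percent figure in
%%% the abstract above follows from a short calculation under its stated
%%% assumption of independent, uniformly distributed requests.  Normalize
%%% the arm's range of motion to [0,1].  Leaving the idle arm wherever the
%%% previous (uniform) request left it gives an expected seek distance of
%%%
%%%     E|X - Y| = \int_0^1 \int_0^1 |x - y| \, dx \, dy = 1/3 ,
%%%
%%% whereas parking the idle arm at the middle of its range gives
%%%
%%%     E|X - 1/2| = \int_0^1 |x - 1/2| \, dx = 1/4 ,
%%%
%%% a relative reduction of (1/3 - 1/4) / (1/3) = 25 percent in expected
%%% seek distance; as the abstract notes, nonlinear seek times shrink this
%%% to roughly 13 percent in seek time.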

@Article{Mitchell:1990:EPA,
  author =       "Chad L. Mitchell and Michael J. Flynn",
  title =        "The Effects of Processor Architecture on Instruction
                 Memory Traffic",
  journal =      j-TOCS,
  volume =       "8",
  number =       "3",
  pages =        "230--250",
  month =        aug,
  year =         "1990",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1990-8-3/p230-mitchell/",
  abstract =     "The relative amount of instruction traffic for two
                 architectures is about the same in the presence of a
                 large cache as with no cache. Furthermore, the presence
                 of an intermediate-sized cache probably substantially
                 favors the denser architecture. Encoding techniques
                 have a much greater impact on instruction traffic than
                 do the differences between instruction set families
                 such as stack and register set. However, register set
                 architectures have somewhat lower instruction traffic
                  than directly comparable stack architectures if some
                 local variables are allocated in registers. This study
                 has clearly indicated that cache factors should be
                 taken into consideration when making architectural
                 tradeoffs. The differences in memory traffic between
                 two architectures may be greatly amplified in the
                 presence of a cache.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; performance",
  subject =      "{\bf B.3.2} Hardware, MEMORY STRUCTURES, Design
                 Styles, Cache memories. {\bf C.0} Computer Systems
                 Organization, GENERAL, Instruction set design. {\bf
                 C.4} Computer Systems Organization, PERFORMANCE OF
                 SYSTEMS, Performance attributes. {\bf B.3.3} Hardware,
                 MEMORY STRUCTURES, Performance Analysis and Design
                 Aids**, Simulation**.",
}

@Article{Gotzhein:1990:DPS,
  author =       "Reinhard Gotzhein and Gregor von Bochmann",
  title =        "Deriving Protocol Specifications from Service
                 Specifications Including Parameters",
  journal =      j-TOCS,
  volume =       "8",
  number =       "4",
  pages =        "255--283",
  month =        nov,
  year =         "1990",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1990-8-4/p255-gotzhein/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; verification",
  subject =      "{\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols,
                 Protocol architecture. {\bf C.2.4} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems.",
}

@Article{Marzullo:1990:TFC,
  author =       "Keith Marzullo",
  title =        "Tolerating Failures of Continuous-Valued Sensors",
  journal =      j-TOCS,
  volume =       "8",
  number =       "4",
  pages =        "284--304",
  month =        nov,
  year =         "1990",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1990-8-4/p284-marzullo/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; performance",
  subject =      "{\bf C.3} Computer Systems Organization,
                 SPECIAL-PURPOSE AND APPLICATION-BASED SYSTEMS, Process
                 control systems. {\bf F.3.1} Theory of Computation,
                 LOGICS AND MEANINGS OF PROGRAMS, Specifying and
                 Verifying and Reasoning about Programs.",
}

@Article{Lamport:1990:CRW,
  author =       "Leslie Lamport",
  title =        "Concurrent Reading and Writing of Clocks",
  journal =      j-TOCS,
  volume =       "8",
  number =       "4",
  pages =        "305--310",
  month =        nov,
  year =         "1990",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1990-8-4/p305-lamport/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; verification",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Concurrency. {\bf D.1.3} Software,
                 PROGRAMMING TECHNIQUES, Concurrent Programming. {\bf
                 D.2.4} Software, SOFTWARE ENGINEERING, Software/Program
                 Verification. {\bf D.4.1} Software, OPERATING SYSTEMS,
                 Process Management.",
}

@Article{Goldszmidt:1990:HLL,
  author =       "German S. Goldszmidt and Shaula Yemini",
  title =        "High-level Language Debugging for Concurrent
                 Programs",
  journal =      j-TOCS,
  volume =       "8",
  number =       "4",
  pages =        "311--336",
  month =        nov,
  year =         "1990",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1990-8-4/p311-goldszmidt/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "experimentation; verification",
  subject =      "{\bf D.2.5} Software, SOFTWARE ENGINEERING, Testing
                 and Debugging, Debugging aids. {\bf D.1.3} Software,
                 PROGRAMMING TECHNIQUES, Concurrent Programming.",
}

@Article{Agrawal:1991:EFT,
  author =       "Divyakant Agrawal and Amr {El Abbadi}",
  title =        "An Efficient and Fault-Tolerant Solution for
                 Distributed Mutual Exclusion",
  journal =      j-TOCS,
  volume =       "9",
  number =       "1",
  pages =        "1--20",
  month =        feb,
  year =         "1991",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1991-9-1/p1-agrawal/",
  abstract =     "In this paper, we present an efficient and
                 fault-tolerant algorithm for generating quorums to
                 solve the distributed mutual exclusion problem. The
                 algorithm uses a logical tree organization of the
                 network to generate tree quorums, which are logarithmic
                 in the size of the network in the best case. Our
                 approach is resilient to both site and communication
                 failures, even when such failures lead to network
                 partitioning. Furthermore, the algorithm exhibits a
                 property of graceful degradation, i.e., it requires
                  more messages only as the number of failures increases
                  in the network. We describe how tree quorums can be
                  used for various distributed applications such as providing
                 mutually exclusive access to a distributed resource,
                 managing replicated objects, and atomically committing
                 a distributed transaction.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; reliability",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Mutual exclusion. {\bf D.4.5} Software,
                 OPERATING SYSTEMS, Reliability, Fault-tolerance. {\bf
                 D.4.7} Software, OPERATING SYSTEMS, Organization and
                 Design, Distributed systems. {\bf C.2.4} Computer
                 Systems Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems.",
}
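
%%% Editorial sketch (not part of the ACM record): a minimal C rendering of
%%% the recursive tree-quorum construction the abstract above describes.
%%% The names (node, get_quorum, is_up, add_site) are hypothetical, and a
%%% real implementation would also roll back sites added along a branch
%%% that ultimately fails; consult the paper for the actual protocol.
%%%
%%%     #include <stdbool.h>
%%%     #include <stddef.h>
%%%
%%%     typedef struct node {               /* logical binary tree of sites */
%%%         int site_id;
%%%         struct node *left, *right;
%%%     } node;
%%%
%%%     bool is_up(int site_id);                    /* liveness test        */
%%%     void add_site(int *quorum, int *n, int id); /* quorum accumulator   */
%%%
%%%     /* Returns true iff a quorum was collected for subtree t.  With no
%%%      * failures this yields one root-to-leaf path, i.e. O(log N) sites;
%%%      * if a subtree root is down, quorums from BOTH of its children are
%%%      * required, which is what preserves pairwise quorum intersection.  */
%%%     bool get_quorum(const node *t, int *quorum, int *n) {
%%%         if (t == NULL)
%%%             return false;                       /* empty subtree: fail  */
%%%         if (is_up(t->site_id)) {
%%%             add_site(quorum, n, t->site_id);
%%%             if (t->left == NULL && t->right == NULL)
%%%                 return true;                    /* live leaf reached    */
%%%             return get_quorum(t->left, quorum, n) ||
%%%                    get_quorum(t->right, quorum, n);
%%%         }
%%%         return get_quorum(t->left, quorum, n) &&
%%%                get_quorum(t->right, quorum, n);
%%%     }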

@Article{Mellor-Crummey:1991:ASS,
  author =       "John M. Mellor-Crummey and Michael L. Scott",
  title =        "Algorithms for Scalable Synchronization on
                 Shared-Memory Multiprocessors",
  journal =      j-TOCS,
  volume =       "9",
  number =       "1",
  pages =        "21--65",
  month =        feb,
  year =         "1991",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1991-9-1/p21-mellor-crummey/",
  abstract =     "Busy-wait techniques are heavily used for mutual
                 exclusion and barrier synchronization in shared-memory
                 parallel programs. Unfortunately, typical
                 implementations of busy-waiting tend to produce large
                 amounts of memory and interconnect contention,
                 introducing performance bottlenecks that become
                 markedly more pronounced as applications scale. We
                 argue that this problem is not fundamental, and that
                 one can in fact construct busy-wait synchronization
                 algorithms that induce no memory or interconnect
                 contention. The key to these algorithms is for every
                 processor to spin on separate locally-accessible flag
                 variables, and for some other processor to terminate
                 the spin with a single remote write operation at an
                 appropriate time. Flag variables may be
                 locally-accessible as a result of coherent caching, or
                 by virtue of allocation in the local portion of
                 physically distributed shared memory. We present a new
                  scalable algorithm for spin locks that generates O(1)
                 remote references per lock acquisition, independent of
                 the number of processors attempting to acquire the
                 lock. Our algorithm provides reasonable latency in the
                 absence of contention, requires only a constant amount
                 of space per lock, and requires no hardware support
                 other than a swap-with-memory instruction. We also
                 present a new scalable barrier algorithm that generates
                  O(1) remote references per processor reaching the
                 barrier, and observe that two previously-known barriers
                 can likewise be cast in a form that spins only on
                 locally-accessible flag variables. None of these
                 barrier algorithms requires hardware support beyond the
                 usual atomicity of memory reads and writes. We compare
                 the performance of our scalable algorithms with other
                 software approaches to busy-wait synchronization on
                 both a Sequent Symmetry and a BBN Butterfly. Our
                 principal conclusion is that contention due to
                 synchronization need not be a problem in large-scale
                 shared-memory multiprocessors. The existence of
                 scalable algorithms greatly weakens the case for costly
                 special-purpose hardware support for synchronization,
                 and provides a case against so-called ``dance hall''
                 architectures, in which shared memory locations are
                 equally far from all processors. ---From the Authors'
                 Abstract",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; measurement; performance",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Synchronization. {\bf B.3.2} Hardware,
                 MEMORY STRUCTURES, Design Styles, Shared memory. {\bf
                 D.4.1} Software, OPERATING SYSTEMS, Process Management,
                 Mutual exclusion. {\bf C.1.2} Computer Systems
                 Organization, PROCESSOR ARCHITECTURES, Multiple Data
                 Stream Architectures (Multiprocessors), Interconnection
                 architectures. {\bf D.4.2} Software, OPERATING SYSTEMS,
                 Storage Management, Storage hierarchies. {\bf D.4.8}
                 Software, OPERATING SYSTEMS, Performance, Measurements.
                 {\bf C.4} Computer Systems Organization, PERFORMANCE OF
                 SYSTEMS, Design studies.",
}
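
%%% Editorial sketch (not part of the ACM record): the list-based spin lock
%%% described in the abstract above is now widely known as the MCS queue
%%% lock.  The C11 rendering below spins only on a per-waiter flag; for
%%% brevity its release path uses compare-and-swap, whereas the paper also
%%% gives a variant that needs nothing beyond swap-with-memory.
%%%
%%%     #include <stdatomic.h>
%%%     #include <stdbool.h>
%%%     #include <stddef.h>
%%%
%%%     typedef struct mcs_node {
%%%         _Atomic(struct mcs_node *) next;
%%%         atomic_bool locked;              /* each waiter spins only here */
%%%     } mcs_node;
%%%
%%%     typedef struct { _Atomic(mcs_node *) tail; } mcs_lock; /* NULL=free */
%%%
%%%     void mcs_acquire(mcs_lock *l, mcs_node *me) {
%%%         atomic_store(&me->next, NULL);
%%%         atomic_store(&me->locked, true);
%%%         mcs_node *pred = atomic_exchange(&l->tail, me); /* swap in      */
%%%         if (pred != NULL) {
%%%             atomic_store(&pred->next, me);   /* enqueue behind pred     */
%%%             while (atomic_load(&me->locked))
%%%                 ;                            /* spin on local flag only */
%%%         }
%%%     }
%%%
%%%     void mcs_release(mcs_lock *l, mcs_node *me) {
%%%         mcs_node *succ = atomic_load(&me->next);
%%%         if (succ == NULL) {
%%%             mcs_node *expected = me;
%%%             if (atomic_compare_exchange_strong(&l->tail, &expected, NULL))
%%%                 return;                      /* nobody was waiting      */
%%%             while ((succ = atomic_load(&me->next)) == NULL)
%%%                 ;                            /* waiter is mid-enqueue   */
%%%         }
%%%         atomic_store(&succ->locked, false);  /* hand the lock over      */
%%%     }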

@Article{Huguet:1991:ASR,
  author =       "Miquel Huguet and Tom{\'a}s Lang",
  title =        "Architectural Support for Reduced Register
                 Saving\slash Restoring in Single-Window Register
                 Files",
  journal =      j-TOCS,
  volume =       "9",
  number =       "1",
  pages =        "66--97",
  month =        feb,
  year =         "1991",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1991-9-1/p66-huguet/",
  abstract =     "The use of registers in a processor reduces the data
                 and instruction memory traffic. Since this reduction is
                 a significant factor in the improvement of the program
                 execution time, recent VLSI processors have a large
                 number of registers which can be used efficiently
                 because of the advances in compiler technology.
                 However, since registers have to be saved/restored
                 across function calls, the corresponding register
                 saving and restoring (RSR) memory traffic can almost
                 eliminate the overall reduction. This traffic has been
                 reduced by compiler optimizations and by providing
                 multiple-window register files. Although these
                 multiple-window architectures produce a large reduction
                 in the RSR traffic, they have several drawbacks which
                 make the single-window file preferable. We consider a
                 combination of {\em hardware support\/} and {\em
                 compiler optimizations\/} to reduce the RSR traffic for
                 a single-window register file, beyond the reductions
                 achieved by compiler optimizations alone. Basically,
                 this hardware keeps track of the registers that are
                 written during execution, so that the number of
                 registers saved is minimized. Moreover, hardware is
                 added so that a register is saved in the activation
                 record of the function that uses it (instead of in the
                 record of the current function); in this way a register
                 is restored only when it is needed, rather than
                 wholesale on procedure return. We present a register
                 saving and restoring policy that makes use of this
                 hardware, discuss its implementation, and evaluate the
                 traffic reduction when the policy is combined with
                 intraprocedural and interprocedural compiler
                 optimizations. We show that, on the average for the
                 four general-purpose programs measured, the RSR traffic
                 is reduced by about 90 percent for a small register
                 file (i.e., 32 registers), which results in an overall
                 data memory traffic reduction of about 15 percent.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; languages; performance",
  subject =      "{\bf B.5.2} Hardware, REGISTER-TRANSFER-LEVEL
                 IMPLEMENTATION, Design Aids, Optimization. {\bf B.5.1}
                 Hardware, REGISTER-TRANSFER-LEVEL IMPLEMENTATION,
                 Design, Data-path design. {\bf B.7.1} Hardware,
                 INTEGRATED CIRCUITS, Types and Design Styles, VLSI
                 (very large scale integration). {\bf B.1.4} Hardware,
                 CONTROL STRUCTURES AND MICROPROGRAMMING, Microprogram
                 Design Aids, Languages and compilers.",
}

@Article{Zhang:1991:VNT,
  author =       "Lixia Zhang",
  title =        "{VirtualClock}: a New Traffic Control Algorithm for
                 Packet-Switched Networks",
  journal =      j-TOCS,
  volume =       "9",
  number =       "2",
  pages =        "101--124",
  month =        may,
  year =         "1991",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1991-9-2/p101-zhang/",
  abstract =     "One of the challenging research issues in building
                 high-speed packet-switched networks is how to control
                 the transmission rate of statistical data flows. This
                 paper describes a new traffic control algorithm, {\em
                 VirtualClock}, for high-speed network applications.
                 VirtualClock monitors the average transmission rate of
                 statistical data flows and provides every flow with
                 guaranteed throughput and low queueing delay. It
                 provides firewall protection among individual flows, as
                 in a TDM system, while retaining the statistical
                 multiplexing advantages of packet switching. Simulation
                 results show that the VirtualClock algorithm meets all
                 its design goals.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; performance",
  subject =      "{\bf C.2.1} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Architecture
                 and Design, Packet-switching networks. {\bf C.2.2}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Network Protocols, Protocol architecture.
                 {\bf C.4} Computer Systems Organization, PERFORMANCE OF
                 SYSTEMS, Performance attributes.",
}
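
%%% Editorial sketch (not part of the ACM record): the core per-packet
%%% stamping step of a VirtualClock-style scheduler, as described in the
%%% abstract above.  Field names are illustrative, and the paper's full
%%% algorithm also maintains an auxiliary clock and per-interval monitoring
%%% of each flow against its stated average rate.
%%%
%%%     /* Per-flow state. */
%%%     typedef struct {
%%%         double vclock;  /* the flow's virtual clock                     */
%%%         double vtick;   /* seconds per packet at the flow's reserved rate */
%%%     } flow;
%%%
%%%     /* Stamp an arriving packet; the switch then transmits queued packets
%%%      * in increasing stamp order, so a flow sending faster than its
%%%      * reserved rate pushes its own stamps (and only its own) into the
%%%      * future and is served correspondingly later.                      */
%%%     double virtualclock_stamp(flow *f, double real_time) {
%%%         if (f->vclock < real_time)
%%%             f->vclock = real_time;  /* idle flow: resync to real time   */
%%%         f->vclock += f->vtick;      /* advance by one reserved slot     */
%%%         return f->vclock;
%%%     }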

@Article{Liskov:1991:EMO,
  author =       "Barbara Liskov and Liuba Shrira and John Wroclawski",
  title =        "Efficient At-Most-Once Messages Based on Synchronized
                 Clocks",
  journal =      j-TOCS,
  volume =       "9",
  number =       "2",
  pages =        "125--142",
  month =        may,
  year =         "1991",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1991-9-2/p125-liskov/",
  abstract =     "This paper describes a new at-most-once message
                 passing protocol that provides guaranteed detection of
                 duplicate messages even when the receiver has no state
                 stored for the sender. It also discusses how to use
                 at-most-once messages to implement higher-level
                  primitives such as at-most-once remote procedure calls and
                 sequenced bytestream protocols. Our performance
                 measurements indicate that at-most-once RPCs can
                  be provided at the same cost as less desirable forms of
                 RPCs that do not guarantee at-most-once execution. Our
                 method is based on the assumption that clocks
                 throughout the system are loosely synchronized. Modern
                 clock synchronization protocols provide good bounds on
                 clock skew with high probability; our method depends on
                 the bound for performance but not for correctness.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design",
  subject =      "{\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols,
                 Protocol architecture. {\bf D.4.4} Software, OPERATING
                 SYSTEMS, Communications Management, Message sending.
                 {\bf C.4} Computer Systems Organization, PERFORMANCE OF
                 SYSTEMS, Performance attributes. {\bf C.4} Computer
                 Systems Organization, PERFORMANCE OF SYSTEMS,
                 Measurement techniques.",
}
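
%%% Editorial sketch (not part of the ACM record): the acceptance test at
%%% the heart of the duplicate-detection idea in the abstract above, under
%%% its loose-clock-synchronization assumption.  All names are hypothetical,
%%% and the real protocol also covers recreating connection state after a
%%% crash and maintaining the bounds that make rejection safe.
%%%
%%%     #include <stdbool.h>
%%%     #include <stdint.h>
%%%
%%%     #define LIFETIME 600.0 /* seconds for which per-sender state is kept */
%%%
%%%     typedef struct { uint64_t conn_id; double ts; } msg;
%%%
%%%     /* Environment-supplied: last timestamp accepted on a still-live
%%%      * connection record, or a negative value if no record exists.      */
%%%     double lookup_last_ts(uint64_t conn_id);
%%%     void   record_ts(uint64_t conn_id, double ts);
%%%
%%%     bool accept_message(const msg *m, double local_clock) {
%%%         double last = lookup_last_ts(m->conn_id);
%%%         if (last >= 0.0 && m->ts <= last)
%%%             return false;           /* duplicate or out-of-date         */
%%%         if (last < 0.0 && m->ts <= local_clock - LIFETIME)
%%%             return false;           /* no state and too old to prove new */
%%%         record_ts(m->conn_id, m->ts);
%%%         return true;                /* provably not a duplicate         */
%%%     }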

@Article{Bihari:1991:DAR,
  author =       "Thomas E. Bihari and Karsten Schwan",
  title =        "Dynamic Adaptation of Real-Time Software",
  journal =      j-TOCS,
  volume =       "9",
  number =       "2",
  pages =        "143--174",
  month =        may,
  year =         "1991",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1991-9-2/p143-bihari/",
  abstract =     "In large, dynamic, real-time computer systems, it is
                 frequently most cost effective to employ different
                 software performance and reliability techniques at
                 different levels of granularity, at different times, or
                 within different subsystems. These techniques may
                 include regulation of redundancy and resource
                 allocation, multiversion and multipath execution,
                 adjustments of program attributes such as time-out
                 periods and others. The management of software in such
                 systems is a difficult task. Software that may be
                 adapted to meet varying performance and reliability
                 requirements offers a solution. A REal-time Software
                 Adaptation System (RESAS) includes a uniform model of
                  adaptable software and provides the tools necessary for
                 programmers to implement algorithms that choose and
                 enact adaptations in real time. RESAS has been
                 implemented on a testbed consisting of a multiprocessor
                 and an attached workstation, and adaptation algorithms
                 have been developed that address the problem of
                 adapting software to achieve two goals: software
                 execution within specified time constraints and
                 software resiliency with respect to computer hardware
                 failures.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; performance; reliability",
  subject =      "{\bf D.4.7} Software, OPERATING SYSTEMS, Organization
                 and Design, Real-time systems and embedded systems.
                 {\bf C.3} Computer Systems Organization,
                 SPECIAL-PURPOSE AND APPLICATION-BASED SYSTEMS,
                 Real-time and embedded systems. {\bf D.4.8} Software,
                 OPERATING SYSTEMS, Performance, Measurements. {\bf
                 D.4.6} Software, OPERATING SYSTEMS, Security and
                 Protection. {\bf D.4.1} Software, OPERATING SYSTEMS,
                 Process Management.",
}

@Article{Bershad:1991:ULI,
  author =       "Brian N. Bershad and Thomas E. Anderson and Edward D.
                 Lazowska and Henry M. Levy",
  title =        "User-level Interprocess Communication for Shared
                 Memory Multiprocessors",
  journal =      j-TOCS,
  volume =       "9",
  number =       "2",
  pages =        "175--198",
  month =        may,
  year =         "1991",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1991-9-2/p175-bershad/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; performance",
  subject =      "{\bf D.4.4} Software, OPERATING SYSTEMS,
                 Communications Management. {\bf D.4.1} Software,
                 OPERATING SYSTEMS, Process Management,
                 Multiprocessing/multiprogramming/multitasking. {\bf
                 D.4.2} Software, OPERATING SYSTEMS, Storage Management.
                 {\bf C.1.2} Computer Systems Organization, PROCESSOR
                 ARCHITECTURES, Multiple Data Stream Architectures
                 (Multiprocessors). {\bf B.3.2} Hardware, MEMORY
                 STRUCTURES, Design Styles, Shared memory.",
}

@Article{Greenberg:1991:AUP,
  author =       "Albert G. Greenberg and Boris D. Lubachevsky and Isi
                 Mitrani",
  title =        "Algorithms for Unboundedly Parallel Simulations",
  journal =      j-TOCS,
  volume =       "9",
  number =       "3",
  pages =        "201--221",
  month =        aug,
  year =         "1991",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1991-9-3/p201-greenberg/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; performance",
  subject =      "{\bf I.6.8} Computing Methodologies, SIMULATION AND
                 MODELING, Types of Simulation, Parallel. {\bf C.1.2}
                 Computer Systems Organization, PROCESSOR ARCHITECTURES,
                 Multiple Data Stream Architectures (Multiprocessors).
                 {\bf F.1.2} Theory of Computation, COMPUTATION BY
                 ABSTRACT DEVICES, Modes of Computation. {\bf I.6.8}
                 Computing Methodologies, SIMULATION AND MODELING, Types
                 of Simulation.",
}

@Article{Wang:1991:ETD,
  author =       "Wen-Hann Wang and Jean-Loup Baer",
  title =        "Efficient Trace-Driven Simulation Methods for Cache
                 Performance Analysis",
  journal =      j-TOCS,
  volume =       "9",
  number =       "3",
  pages =        "222--241",
  month =        aug,
  year =         "1991",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1991-9-3/p222-wang/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; measurement; performance",
  subject =      "{\bf B.3.3} Hardware, MEMORY STRUCTURES, Performance
                 Analysis and Design Aids**, Simulation**. {\bf B.3.2}
                 Hardware, MEMORY STRUCTURES, Design Styles. {\bf B.3.3}
                 Hardware, MEMORY STRUCTURES, Performance Analysis and
                 Design Aids**.",
}

@Article{Garcia-Molina:1991:ORM,
  author =       "H{\'e}ctor Garc{\'\i}a-Molina and Annemarie Spauster",
  title =        "Ordered and Reliable Multicast Communication",
  journal =      j-TOCS,
  volume =       "9",
  number =       "3",
  pages =        "242--271",
  month =        aug,
  year =         "1991",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1991-9-3/p242-garcia-molina/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; reliability",
  subject =      "{\bf C.2.1} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Architecture
                 and Design, Network communications. {\bf C.2.1}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Network Architecture and Design. {\bf C.2.2}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Network Protocols. {\bf C.2.4} Computer
                 Systems Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems. {\bf D.4.1} Software, OPERATING
                 SYSTEMS, Process Management. {\bf D.4.4} Software,
                 OPERATING SYSTEMS, Communications Management. {\bf
                 H.2.4} Information Systems, DATABASE MANAGEMENT,
                 Systems.",
}

@Article{Schiper:1991:LCA,
  author =       "Andr{\'e} Schiper and Kenneth Birman and Pat
                 Stephenson",
  title =        "Lightweight Causal and Atomic Group Multicast",
  journal =      j-TOCS,
  volume =       "9",
  number =       "3",
  pages =        "272--314",
  month =        aug,
  year =         "1991",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1991-9-3/p272-schiper/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; reliability",
  subject =      "{\bf C.2.1} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Architecture
                 and Design, Network communications. {\bf C.2.1}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Network Architecture and Design. {\bf C.2.2}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Network Protocols. {\bf C.2.4} Computer
                 Systems Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems. {\bf D.4.1} Software, OPERATING
                 SYSTEMS, Process Management. {\bf D.4.4} Software,
                 OPERATING SYSTEMS, Communications Management. {\bf
                 D.4.7} Software, OPERATING SYSTEMS, Organization and
                 Design.",
}

@Article{Larowe:1991:ECM,
  author =       "Richard P. {Larowe, Jr.} and Carla Schlatter Ellis",
  title =        "Experimental Comparison of Memory Management Policies
                 for {NUMA} Multiprocessors",
  journal =      j-TOCS,
  volume =       "9",
  number =       "4",
  pages =        "319--363",
  month =        nov,
  year =         "1991",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Oct 31 06:27:19 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1991-9-4/p319-larowe/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "experimentation; management; measurement;
                 performance",
  subject =      "{\bf D.4.2} Software, OPERATING SYSTEMS, Storage
                 Management. {\bf B.3.2} Hardware, MEMORY STRUCTURES,
                 Design Styles, Shared memory. {\bf C.1.2} Computer
                 Systems Organization, PROCESSOR ARCHITECTURES, Multiple
                 Data Stream Architectures (Multiprocessors),
                 Multiple-instruction-stream, multiple-data-stream
                 processors (MIMD). {\bf D.4.8} Software, OPERATING
                 SYSTEMS, Performance.",
}

@Article{Karn:1991:IRT,
  author =       "Phil Karn and Craig Partridge",
  title =        "Improving Round-Trip Time Estimates in Reliable
                 Transport Protocols",
  journal =      j-TOCS,
  volume =       "9",
  number =       "4",
  pages =        "364--373",
  month =        nov,
  year =         "1991",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1991-9-4/p364-karn/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; performance; reliability",
  subject =      "{\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols,
                 Protocol verification. {\bf C.2.1} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS, Network
                 Architecture and Design, Packet-switching networks.
                 {\bf C.2.1} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Architecture
                 and Design, Store and forward networks. {\bf D.4.4}
                 Software, OPERATING SYSTEMS, Communications Management,
                 Message sending. {\bf D.4.4} Software, OPERATING
                 SYSTEMS, Communications Management, Network
                 communication.",
}

@Article{Kandlur:1991:RBA,
  author =       "Dilip D. Kandlur and Kang G. Shin",
  title =        "Reliable Broadcast Algorithms for {HARTS}",
  journal =      j-TOCS,
  volume =       "9",
  number =       "4",
  pages =        "374--398",
  month =        nov,
  year =         "1991",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1991-9-4/p374-kandlur/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; performance; reliability",
  subject =      "{\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 HARTS. {\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols.",
}

@Article{Ahamad:1991:MV,
  author =       "Mustaque Ahamad and Mostafa H. Ammar and Shun Yan
                 Cheung",
  title =        "Multidimensional Voting",
  journal =      j-TOCS,
  volume =       "9",
  number =       "4",
  pages =        "399--431",
  month =        nov,
  year =         "1991",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1991-9-4/p399-ahamad/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; reliability; theory",
  subject =      "{\bf B.4.5} Hardware, INPUT/OUTPUT AND DATA
                 COMMUNICATIONS, Reliability, Testing, and
                 Fault-Tolerance**, Redundant design**. {\bf C.2.2}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Network Protocols. {\bf C.2.4} Computer
                 Systems Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems.",
}

@Article{Kistler:1992:DOC,
  author =       "James J. Kistler and M. Satyanarayanan",
  title =        "Disconnected Operation in the {Coda File System}",
  journal =      j-TOCS,
  volume =       "10",
  number =       "1",
  pages =        "3--25",
  month =        feb,
  year =         "1992",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1992-10-1/p3-kistler/",
  abstract =     "{\em Disconnected operation\/} is a mode of operation
                 that enables a client to continue accessing critical
                 data during temporary failures of a shared data
                 repository. An important, though not exclusive,
                 application of disconnected operation is in supporting
                 portable computers. In this paper, we show that
                 disconnected operation is feasible, efficient and
                 usable by describing its design and implementation in
                 the Coda File System. The central idea behind our work
                 is that {\em caching of data}, now widely used for
                 performance, can also be exploited to improve {\em
                 availability.\/}",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; measurement; performance;
                 reliability",
  subject =      "{\bf D.4.5} Software, OPERATING SYSTEMS, Reliability,
                 Fault-tolerance. {\bf D.4.4} Software, OPERATING
                 SYSTEMS, Communications Management. {\bf D.4.3}
                 Software, OPERATING SYSTEMS, File Systems Management,
                 Distributed file systems. {\bf D.4.8} Software,
                 OPERATING SYSTEMS, Performance, Measurements.",
}

@Article{Rosenblum:1992:DIL,
  author =       "Mendel Rosenblum and John K. Ousterhout",
  title =        "The Design and Implementation of a Log-Structured File
                 System",
  journal =      j-TOCS,
  volume =       "10",
  number =       "1",
  pages =        "26--52",
  month =        feb,
  year =         "1992",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1992-10-1/p26-rosenblum/",
  abstract =     "This paper presents a new technique for disk storage
                 management called a {\em log-structured file system}. A
                 log-structured file system writes all modifications to
                 disk sequentially in a log-like structure, thereby
                 speeding up both file writing and crash recovery. The
                 log is the only structure on disk; it contains indexing
                 information so that files can be read back from the log
                 efficiently. In order to maintain large free areas on
                  disk for fast writing, we divide the log into {\em
                  segments\/} and use a {\em segment cleaner\/} to
                 compress the live information from heavily fragmented
                 segments. We present a series of simulations that
                 demonstrate the efficiency of a simple cleaning policy
                 based on cost and benefit. We have implemented a
                 prototype log-structured file system called Sprite LFS;
                 it outperforms current Unix file systems by an order of
                 magnitude for small-file writes while matching or
                 exceeding Unix performance for reads and large writes.
                 Even when the overhead for cleaning is included, Sprite
                 LFS can use 70\% of the disk bandwidth for writing,
                 whereas Unix file systems typically can use only
                  5--10\%.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; measurement; performance",
  subject =      "{\bf D.4.2} Software, OPERATING SYSTEMS, Storage
                 Management, Secondary storage. {\bf D.4.2} Software,
                 OPERATING SYSTEMS, Storage Management,
                 Allocation/deallocation strategies. {\bf D.4.5}
                 Software, OPERATING SYSTEMS, Reliability,
                 Checkpoint/restart. {\bf D.4.8} Software, OPERATING
                 SYSTEMS, Performance, Measurements. {\bf D.4.8}
                 Software, OPERATING SYSTEMS, Performance, Simulation.
                 {\bf D.4.8} Software, OPERATING SYSTEMS, Performance,
                 Operational analysis. {\bf H.2.2} Information Systems,
                 DATABASE MANAGEMENT, Physical Design, Recovery and
                 restart. {\bf H.3.2} Information Systems, INFORMATION
                 STORAGE AND RETRIEVAL, Information Storage, File
                 organization.",
}
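
%%% Editorial sketch (not part of the ACM record): the "simple cleaning
%%% policy based on cost and benefit" mentioned in the abstract above ranks
%%% candidate segments by a benefit-to-cost ratio, preferring segments with
%%% much free space and old ("cold") live data.  Field names are
%%% illustrative.
%%%
%%%     typedef struct {
%%%         double utilization;  /* u: fraction of the segment still live   */
%%%         double age;          /* age of the youngest live data in it     */
%%%     } segment;
%%%
%%%     /* benefit = space freed (1 - u), weighted by how long that space is
%%%      * likely to stay free (age); cost = 1 to read the whole segment plus
%%%      * u to write its live data back out.  The cleaner picks segments
%%%      * with the highest ratio.                                          */
%%%     double benefit_over_cost(const segment *s) {
%%%         return (1.0 - s->utilization) * s->age / (1.0 + s->utilization);
%%%     }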

@Article{Anderson:1992:SAE,
  author =       "Thomas E. Anderson and Brian N. Bershad and Edward D.
                 Lazowska and Henry M. Levy",
  title =        "Scheduler Activations: Effective Kernel Support for
                 the User-Level Management of Parallelism",
  journal =      j-TOCS,
  volume =       "10",
  number =       "1",
  pages =        "53--79",
  month =        feb,
  year =         "1992",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1992-10-1/p53-anderson/",
  abstract =     "{\em Threads\/} are the vehicle for concurrency in
                 many approaches to parallel programming. Threads can be
                 supported either by the operating system kernel or by
                 user-level library code in the application address
                 space, but neither approach has been fully
                 satisfactory. This paper addresses this dilemma. First,
                 we argue that the performance of kernel threads is {\em
                 inherently\/} worse than that of user-level threads,
                 rather than this being an artifact of existing
                 implementations; managing parallelism at the user level
                 is essential to high-performance parallel computing.
                 Next, we argue that the problems encountered in
                 integrating user-level threads with other system
                 services is a consequence of the lack of kernel support
                 for user-level threads provided by contemporary
                 multiprocessor operating systems; kernel threads are
                 the {\em wrong abstraction\/} on which to support
                 user-level management of parallelism. Finally, we
                 describe the design, implementation, and performance of
                 a new kernel interface and user-level thread package
                 that together provide the same functionality as kernel
                 threads without compromising the performance and
                 flexibility advantages of user-level management of
                 parallelism.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; measurement; performance",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Scheduling. {\bf D.4.4} Software, OPERATING
                 SYSTEMS, Communications Management, Input/output. {\bf
                 D.4.7} Software, OPERATING SYSTEMS, Organization and
                 Design. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance.",
}

@Article{Mogul:1992:NLS,
  author =       "Jeffrey C. Mogul",
  title =        "Network Locality at the Scale of Processes",
  journal =      j-TOCS,
  volume =       "10",
  number =       "2",
  pages =        "81--109",
  month =        may,
  year =         "1992",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1992-10-2/p81-mogul/",
  abstract =     "Packets on a LAN can be viewed as a series of
                 references to and from the objects they address. The
                 amount of locality in this reference stream may be
                 critical to the efficiency of network implementations,
                 if the locality can be exploited through caching or
                 scheduling mechanisms. Most previous studies have
                 treated network locality with an addressing granularity
                 of networks or individual hosts. This paper describes
                 some experiments tracing locality at a finer grain,
                 looking at references to individual processes, and with
                 fine-grained time resolution. Observations of typical
                 LANs show high per-process locality; that is, packets
                 to a host usually arrive for the process that most
                 recently sent a packet, and often with little
                 intervening delay.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; measurement; performance",
  subject =      "{\bf C.2.1} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Architecture
                 and Design, Packet-switching networks. {\bf C.4}
                 Computer Systems Organization, PERFORMANCE OF SYSTEMS,
                 Measurement techniques. {\bf C.4} Computer Systems
                 Organization, PERFORMANCE OF SYSTEMS, Performance
                 attributes. {\bf C.2.5} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Local and Wide-Area
                 Networks. {\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols,
                 TCP/IP.",
}

@Article{OMalley:1992:DNA,
  author =       "Sean W. O'Malley and Larry L. Peterson",
  title =        "A Dynamic Network Architecture",
  journal =      j-TOCS,
  volume =       "10",
  number =       "2",
  pages =        "110--143",
  month =        may,
  year =         "1992",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1992-10-2/p110-o_malley/",
  abstract =     "Network software is a critical component of any
                 distributed system. Because of its complexity, network
                 software is commonly layered into a hierarchy of
                 protocols, or more generally, into a {\em protocol
                 graph}. Typical protocol graphs---including those
                 standardized in the ISO and TCP/IP network
                  architectures---share three important properties: the
                 protocol graph is simple, the nodes of the graph
                 (protocols) encapsulate complex functionality, and the
                 topology of the graph is relatively static. This paper
                 describes a new way to organize network software that
                 differs from conventional architectures in all three of
                 these properties. In our approach, the protocol graph
                 is complex, individual protocols encapsulate a single
                 function, and the topology of the graph is dynamic. The
                 main contribution of this paper is to describe the
                 ideas behind our new architecture, illustrate the
                 advantages of using the architecture, and demonstrate
                 that the architecture results in efficient network
                 software.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; performance",
  subject =      "{\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols,
                 Protocol architecture. {\bf C.2.1} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS, Network
                 Architecture and Design, Network communications.",
}

@Article{Ramanathan:1992:DTC,
  author =       "Parameswaran Ramanathan and Kang G. Shin",
  title =        "Delivery of Time-Critical Messages using a Multiple
                 Copy Approach",
  journal =      j-TOCS,
  volume =       "10",
  number =       "2",
  pages =        "144--166",
  month =        may,
  year =         "1992",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1992-10-2/p144-ramanathan/",
  abstract =     "Reliable and timely delivery of messages between
                 processing nodes is essential in distributed real-time
                 systems. Failure to deliver a message within its
                 deadline usually forces the system to undertake a
                 recovery action, which introduces some cost (or
                 overhead) to the system. This recovery cost can be very
                 high, especially when the recovery action fails due to
                 lack of time or resources. Proposed in this paper is a
                 scheme to minimize the expected cost incurred as a
                 result of messages failing to meet their deadlines. The
                 scheme is intended for distributed real-time systems,
                 especially with a point-to-point interconnection
                 topology. The goal of minimizing the expected cost is
                 achieved by sending multiple copies of a message
                 through disjoint routes and thus increasing the
                 probability of successful message delivery within the
                 deadline. However, as the number of copies increases,
                 the message traffic on the network increases, thereby
                 increasing the delivery time for each of the copies.
                 There is therefore a tradeoff between the number of
                 copies of each message and the expected cost incurred
                 as a result of messages missing their deadlines. The
                 number of copies of each message to be sent is
                 determined by optimizing this tradeoff. Simulation
                 results for a hexagonal mesh and a hypercube topology
                 indicate that the expected cost can be lowered
                 substantially by the proposed scheme.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; measurement; performance; reliability",
  subject =      "{\bf C.4} Computer Systems Organization, PERFORMANCE
                 OF SYSTEMS, Reliability, availability, and
                 serviceability. {\bf C.2.4} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems, Distributed applications. {\bf
                 C.3} Computer Systems Organization, SPECIAL-PURPOSE AND
                 APPLICATION-BASED SYSTEMS, Real-time and embedded
                 systems.",
}
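
The copy-count tradeoff sketched in the abstract above can be made concrete
with a toy cost model: more copies over disjoint routes raise the chance
that at least one arrives on time, while the added traffic lowers each
copy's own on-time probability. The base probability, the linear load
penalty, and the recovery cost below are invented for illustration; they
are not taken from the paper.

    # Toy model of the tradeoff: pick the number of copies that minimizes
    # the expected recovery cost of a missed deadline.
    def expected_miss_cost(copies, base_p=0.90, load_penalty=0.12,
                           miss_cost=100.0):
        # Per-copy on-time probability degrades as extra copies add load
        # (assumed linear penalty, purely illustrative).
        p = max(0.0, base_p - load_penalty * (copies - 1))
        # Disjoint routes are assumed to fail independently.
        return miss_cost * (1.0 - p) ** copies

    if __name__ == "__main__":
        for c in range(1, 7):
            print(c, round(expected_miss_cost(c), 3))
        print("best copy count:", min(range(1, 7), key=expected_miss_cost))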

@Article{Hsu:1992:ESN,
  author =       "William Tsun-Yuk Hsu and Pen-Chung Yew",
  title =        "An Effective Synchronization Network for Hot-Spot
                 Accesses",
  journal =      j-TOCS,
  volume =       "10",
  number =       "3",
  pages =        "167--189",
  month =        aug,
  year =         "1992",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1992-10-3/p167-hsu/",
  abstract =     "In large multiprocessor systems, fast synchronization
                 is crucial for high performance. However,
                 synchronization traffic tends to create ``hot-spots''
                 in shared memory and cause network congestion.
                 Multistage shuffle-exchange networks have been proposed
                 and built to handle synchronization traffic. Software
                 combining schemes have also been proposed to relieve
                 network congestion caused by hot-spots. However,
                 multistage combining networks could be very expensive
                 and software combining could be very slow. In this
                 paper, we propose a single-stage combining network to
                 handle synchronization traffic, which is separated from
                 the regular memory traffic. A single-stage combining
                 network has several advantages: (1) it is attractive
                 from an implementation perspective because only one
                  stage is needed (instead of log {\em N\/} stages); (2)
                  only one network is needed to handle both forward and
                 returning requests; (3) combined requests are
                 distributed evenly through the network---the wait
                 buffer size is reduced; and (4) fast-finishing
                 algorithms [30] can be used to shorten the network
                 delay. Because of all these advantages, we show that a
                 single-stage combining network gives good performance
                 at a lower cost than a multistage combining network.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; performance",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Synchronization. {\bf C.1.2} Computer
                 Systems Organization, PROCESSOR ARCHITECTURES, Multiple
                 Data Stream Architectures (Multiprocessors),
                 Interconnection architectures. {\bf D.4.7} Software,
                 OPERATING SYSTEMS, Organization and Design, Distributed
                 systems.",
}
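
The combining idea above can be illustrated in a few lines of Python:
several fetch-and-add requests aimed at one hot-spot location are merged
into a single memory request, and the single reply is then decombined so
that each requester observes the value it would have seen under some
serialization. This is only a software cartoon of what the proposed network
does in hardware; the address and increments are arbitrary.

    # Toy software model of request combining for a hot-spot fetch-and-add.
    def combine_fetch_and_add(memory, addr, increments):
        old = memory[addr]
        memory[addr] = old + sum(increments)     # one combined memory access
        # Decombine the reply: requester k sees the old value plus the
        # increments of the requests combined ahead of it (the role played
        # by the wait buffer in the network).
        replies, running = [], old
        for inc in increments:
            replies.append(running)
            running += inc
        return replies

    mem = {0x40: 7}
    print(combine_fetch_and_add(mem, 0x40, [1, 1, 1]))   # [7, 8, 9]
    print(mem[0x40])                                      # 10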

@Article{Atkins:1992:ACC,
  author =       "M. S. Atkins and M. Y. Coady",
  title =        "Adaptable Concurrency Control for Atomic Data Types",
  journal =      j-TOCS,
  volume =       "10",
  number =       "3",
  pages =        "190--225",
  month =        aug,
  year =         "1992",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1992-10-3/p190-atkins/",
  abstract =     "In many distributed systems concurrent access is
                 required to a shared object, where abstract object
                 servers may incorporate type-specific properties to
                 define consistency requirements. Each operation and its
                 outcome is treated as an event, and conflicts may occur
                 between different event types. Hence concurrency
                 control and synchronization are required at the
                 granularity of conflicting event types. With such a
                 fine granularity of locking, the occurrence of
                 conflicts is likely to be lower than with whole-object
                 locking, so optimistic techniques become more
                 attractive. This work describes the design,
                 implementation, and performance of servers for a shared
                 atomic object, a semiqueue, where each server employs
                 either pessimistic or optimistic locking techniques on
                 each conflicting event type. We compare the performance
                 of a purely optimistic server, a purely pessimistic
                 server, and a hybrid server which treats certain event
                 types optimistically and others pessimistically, to
                 demonstrate the most appropriate environment for using
                 pessimistic, optimistic, or hybrid control. We show
                 that the advantages of low overhead on optimistic
                 locking at low conflict levels is offset at higher
                 conflict levels by the wasted work done by aborted
                 transactions. To achieve optimum performance over the
                 whole range of conflict levels, an adaptable server is
                 required, whereby the treatment of conflicting event
                 types can be changed dynamically between optimistic and
                 pessimistic, according to various criteria depending on
                 the expected frequency of conflict. We describe our
                 implementations of adaptable servers which may allocate
                 concurrency control strategy on the basis of state
                 information, the history of conflicts encountered, or
                 by using preset transaction priorities. We show that
                 the adaptable servers perform almost as well as the
                 best of the purely optimistic, pessimistic, or hybrid
                 servers under the whole range of conflict levels,
                 showing the versatility and efficiency of the dynamic
                 servers. Finally we outline a general design
                 methodology for implementing adaptable concurrency
                 control in servers for atomic objects, illustrated
                 using an atomic shared B-tree.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; performance",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Concurrency. {\bf D.1.3} Software,
                 PROGRAMMING TECHNIQUES, Concurrent Programming,
                 Distributed programming. {\bf D.3.3} Software,
                 PROGRAMMING LANGUAGES, Language Constructs and
                 Features, Abstract data types. {\bf D.4.1} Software,
                 OPERATING SYSTEMS, Process Management, Concurrency.
                 {\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Deadlocks. {\bf D.4.1} Software, OPERATING
                 SYSTEMS, Process Management, Mutual exclusion. {\bf
                 D.4.1} Software, OPERATING SYSTEMS, Process Management,
                 Synchronization. {\bf D.4.8} Software, OPERATING
                 SYSTEMS, Performance, Measurements. {\bf D.4.8}
                 Software, OPERATING SYSTEMS, Performance, Simulation.
                 {\bf H.2.4} Information Systems, DATABASE MANAGEMENT,
                 Systems, Concurrency. {\bf H.2.4} Information Systems,
                 DATABASE MANAGEMENT, Systems, Transaction processing.",
}
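
A minimal sketch of the adaptation policy described above: per conflicting
event type, the server tracks how often conflicts actually occur and flips
between pessimistic locking and optimistic validation accordingly. The
window size, threshold, and criterion are illustrative stand-ins for the
state-, history-, and priority-based criteria the paper describes.

    # Illustrative per-event-type controller that picks a concurrency
    # control mode from recent conflict history.
    class AdaptiveEventControl:
        def __init__(self, window=100, threshold=0.2):
            self.window, self.threshold = window, threshold
            self.history = []                 # True = a conflict was observed
            self.mode = "optimistic"

        def record(self, conflicted):
            self.history = (self.history + [conflicted])[-self.window:]
            rate = sum(self.history) / len(self.history)
            # Frequent conflicts -> lock up front; rare -> validate at commit.
            self.mode = "pessimistic" if rate > self.threshold else "optimistic"

    ctl = AdaptiveEventControl()
    for outcome in [False] * 10 + [True] * 5:
        ctl.record(outcome)
    print(ctl.mode)    # "pessimistic" once conflicts become frequent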

@Article{Glasgow:1992:LRA,
  author =       "Janice Glasgow and Glenn Macewen and Prakash
                 Panangaden",
  title =        "A Logic for Reasoning about Security",
  journal =      j-TOCS,
  volume =       "10",
  number =       "3",
  pages =        "226--264",
  month =        aug,
  year =         "1992",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1992-10-3/p226-glasgow/",
  abstract =     "A formal framework called {\em Security Logic\/} ({\em
                 SL\/}) is developed for specifying and reasoning about
                 security policies and for verifying that system designs
                 adhere to such policies. Included in this modal logic
                 framework are definitions of {\em knowledge}, {\em
                 permission}, and {\em obligation}. Permission is used
                 to specify secrecy policies and obligation to specify
                 integrity policies. The combination of policies is
                 addressed and examples based on policies from the
                 current literature are given.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "security; theory",
  subject =      "{\bf F.4.1} Theory of Computation, MATHEMATICAL LOGIC
                 AND FORMAL LANGUAGES, Mathematical Logic. {\bf H.2.0}
                 Information Systems, DATABASE MANAGEMENT, General. {\bf
                 K.6.5} Computing Milieux, MANAGEMENT OF COMPUTING AND
                 INFORMATION SYSTEMS, Security and Protection. {\bf
                 D.4.6} Software, OPERATING SYSTEMS, Security and
                 Protection.",
}

@Article{Lampson:1992:ADS,
  author =       "Butler Lampson and Mart{\'\i}n Abadi and Michael
                 Burrows and Edward Wobber",
  title =        "Authentication in Distributed Systems: Theory and
                 Practice",
  journal =      j-TOCS,
  volume =       "10",
  number =       "4",
  pages =        "265--310",
  month =        nov,
  year =         "1992",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1992-10-4/p265-lampson/",
  abstract =     "We describe a theory of authentication and a system
                 that implements it. Our theory is based on the notion
                 of principal and a ``speaks for'' relation between
                 principals. A simple principal either has a name or is
                 a communication channel; a compound principal can
                 express an adopted role or delegated authority. The
                 theory shows how to reason about a principal's
                 authority by deducing the other principals that it can
                 speak for; authenticating a channel is one important
                 application. We use the theory to explain many existing
                 and proposed security mechanisms. In particular, we
                 describe the system we have built. It passes principals
                 efficiently as arguments or results of remote procedure
                 calls, and it handles public and shared key encryption,
                 name lookup in a large name space, groups of
                 principals, program loading, delegation, access
                 control, and revocation.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "security; theory; verification",
  subject =      "{\bf D.4.6} Software, OPERATING SYSTEMS, Security and
                 Protection, Authentication. {\bf D.4.6} Software,
                 OPERATING SYSTEMS, Security and Protection, Access
                 controls. {\bf D.4.6} Software, OPERATING SYSTEMS,
                 Security and Protection, Cryptographic controls. {\bf
                 C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems.
                 {\bf E.3} Data, DATA ENCRYPTION. {\bf K.6.5} Computing
                 Milieux, MANAGEMENT OF COMPUTING AND INFORMATION
                 SYSTEMS, Security and Protection, Authentication.",
}
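
The ``speaks for'' relation lends itself to a small worked example: if a
channel speaks for a workstation and the workstation speaks for a user,
then the channel speaks for the user, so a request arriving on that channel
can be granted when the user is on the ACL. The principals and edges below
are invented; the real system derives such facts from certificates and
delegations rather than from an explicit edge set.

    # Transitive closure of a toy "speaks for" relation, then an ACL check.
    def speaks_for_closure(edges):
        closure, changed = set(edges), True
        while changed:
            changed = False
            for (a, b) in list(closure):
                for (c, d) in list(closure):
                    if b == c and (a, d) not in closure:
                        closure.add((a, d))
                        changed = True
        return closure

    edges = {("channel-7", "ws-42"), ("ws-42", "alice")}
    closure = speaks_for_closure(edges)
    acl = {"alice"}
    print(any(("channel-7", p) in closure for p in acl))   # True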

@Article{Anderson:1992:FSC,
  author =       "David P. Anderson and Yoshitomo Osawa and Ramesh
                 Govindan",
  title =        "A File System for Continuous Media",
  journal =      j-TOCS,
  volume =       "10",
  number =       "4",
  pages =        "311--337",
  month =        nov,
  year =         "1992",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1992-10-4/p311-anderson/",
  abstract =     "The Continuous Media File System, CMFS, supports
                 real-time storage and retrieval of continuous media
                 data (digital audio and video) on disk. CMFS clients
                 read or write files in ``sessions,'' each with a
                 guaranteed minimum data rate. Multiple sessions,
                 perhaps with different rates, and non-real-time access
                 can proceed concurrently. CMFS addresses several
                  interrelated design issues: real-time semantics for
                 sessions, disk layout, an acceptance test for new
                 sessions, and disk scheduling policy. We use simulation
                 to compare different design choices.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; performance",
  subject =      "{\bf D.4.3} Software, OPERATING SYSTEMS, File Systems
                 Management, File organization. {\bf D.4.3} Software,
                 OPERATING SYSTEMS, File Systems Management, Access
                 methods. {\bf D.4.7} Software, OPERATING SYSTEMS,
                 Organization and Design, Real-time systems and embedded
                 systems. {\bf H.5.1} Information Systems, INFORMATION
                 INTERFACES AND PRESENTATION, Multimedia Information
                 Systems.",
}
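
The acceptance test mentioned above can be caricatured as a bandwidth
budget: a new session is admitted only if the guaranteed rates of all
sessions still fit under the disk's sustainable rate, with a slice held
back for non-real-time access. The real CMFS test also accounts for seek
and buffering behavior; the constants here are invented.

    # Simplistic admission check in the spirit of a rate-based acceptance test.
    DISK_BANDWIDTH = 4_000_000        # sustainable bytes/s (assumed)
    NON_RT_RESERVE = 0.2              # fraction reserved for non-real-time I/O

    def admit(existing_rates, new_rate):
        budget = DISK_BANDWIDTH * (1.0 - NON_RT_RESERVE)
        return sum(existing_rates) + new_rate <= budget

    sessions = [1_400_000, 1_200_000]     # two active continuous-media sessions
    print(admit(sessions, 500_000))       # True: still within the budget
    print(admit(sessions, 800_000))       # False: would overcommit the disk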

@Article{Kessler:1992:PPA,
  author =       "R. E. Kessler and Mark D. Hill",
  title =        "Page Placement Algorithms for Large Real-Indexed
                 Caches",
  journal =      j-TOCS,
  volume =       "10",
  number =       "4",
  pages =        "338--359",
  month =        nov,
  year =         "1992",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1992-10-4/p338-kessler/",
  abstract =     "When a computer system supports both paged virtual
                 memory and large real-indexed caches, cache performance
                 depends in part on the main memory page placement. To
                 date, most operating systems place pages by selecting
                 an arbitrary page frame from a pool of page frames that
                 have been made available by the page replacement
                 algorithm. We give a simple model that shows that this
                 naive (arbitrary) page placement leads to up to 30\%
                 unnecessary cache conflicts. We develop several page
                 placement algorithms, called {\em careful-mapping
                 algorithms}, that try to select a page frame (from the
                 pool of available page frames) that is likely to reduce
                 cache contention. Using trace-driven simulation, we
                  find that careful mapping results in 10--20\% fewer
                 (dynamic) cache misses than naive mapping (for a
                 direct-mapped real-indexed multimegabyte cache). Thus,
                 our results suggest that careful mapping by the
                 operating system can get about half the cache miss
                 reduction that a cache size (or associativity) doubling
                 can.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; measurement; performance",
  subject =      "{\bf D.4.2} Software, OPERATING SYSTEMS, Storage
                 Management, Allocation/deallocation strategies. {\bf
                 B.3.2} Hardware, MEMORY STRUCTURES, Design Styles,
                 Cache memories. {\bf B.3.2} Hardware, MEMORY
                 STRUCTURES, Design Styles, Virtual memory. {\bf B.3.3}
                 Hardware, MEMORY STRUCTURES, Performance Analysis and
                 Design Aids**, Simulation**. {\bf C.4} Computer Systems
                 Organization, PERFORMANCE OF SYSTEMS. {\bf D.4.1}
                 Software, OPERATING SYSTEMS, Process Management. {\bf
                 E.2} Data, DATA STORAGE REPRESENTATIONS. {\bf D.4.2}
                 Software, OPERATING SYSTEMS, Storage Management, Main
                 memory.",
}
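
One way to picture careful mapping is page coloring: with a direct-mapped,
real-indexed cache, frames whose numbers are congruent modulo the number of
cache-sized bins collide in the cache, so the allocator prefers a free
frame whose ``color'' is currently least used rather than an arbitrary one.
The heuristic and parameters below are an illustrative stand-in for the
careful-mapping algorithms studied in the paper.

    # Pick a free page frame from the least-loaded cache color.
    CACHE_BYTES = 4 * 1024 * 1024
    PAGE_BYTES = 4 * 1024
    NUM_COLORS = CACHE_BYTES // PAGE_BYTES       # bins in a direct-mapped cache

    def color(frame):
        return frame % NUM_COLORS

    def pick_frame(free_frames, pages_per_color):
        return min(free_frames, key=lambda f: pages_per_color[color(f)])

    usage = {c: 0 for c in range(NUM_COLORS)}
    usage[0] = 5                                  # color 0 already crowded
    print(pick_frame([0, 3, 1024], usage))        # 3: frames 0 and 1024 share color 0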

@Article{Ladin:1992:PHA,
  author =       "Rivka Ladin and Barbara Liskov and Liuba Shrira and
                 Sanjay Ghemawat",
  title =        "Providing High Availability Using Lazy Replication",
  journal =      j-TOCS,
  volume =       "10",
  number =       "4",
  pages =        "360--391",
  month =        nov,
  year =         "1992",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1992-10-4/p360-ladin/",
  abstract =     "To provide high availability for services such as mail
                 or bulletin boards, data must be replicated. One way to
                 guarantee consistency of replicated data is to force
                 service operations to occur in the same order at all
                 sites, but this approach is expensive. For some
                 applications a weaker causal operation order can
                 preserve consistency while providing better
                 performance. This paper describes a new way of
                 implementing causal operations. Our technique also
                 supports two other kinds of operations: operations that
                 are totally ordered with respect to one another and
                 operations that are totally ordered with respect to all
                 other operations. The method performs well in terms of
                 response time, operation-processing capacity, amount of
                 stored state, and number and size of messages; it does
                 better than replication methods based on reliable
                 multicast techniques.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; performance; reliability",
  subject =      "{\bf D.4.7} Software, OPERATING SYSTEMS, Organization
                 and Design, Distributed systems. {\bf C.2.4} Computer
                 Systems Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems, Distributed applications. {\bf
                 C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Distributed databases. {\bf C.4} Computer Systems
                 Organization, PERFORMANCE OF SYSTEMS, Reliability,
                 availability, and serviceability. {\bf D.4.5} Software,
                 OPERATING SYSTEMS, Reliability, Fault-tolerance. {\bf
                 H.2.2} Information Systems, DATABASE MANAGEMENT,
                 Physical Design, Recovery and restart. {\bf H.2.4}
                 Information Systems, DATABASE MANAGEMENT, Systems,
                 Concurrency. {\bf H.2.4} Information Systems, DATABASE
                 MANAGEMENT, Systems, Distributed databases.",
}
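
The causal-ordering machinery can be suggested with a vector-timestamp
sketch: an update carries a multipart timestamp describing the state it
depends on, and a replica applies it only once its own timestamp dominates
that dependency, buffering it otherwise. Gossip between replicas and the
stronger (forced and immediate) operation classes are omitted; the update
format below is invented.

    # Apply updates only when their causal dependencies are satisfied.
    def dominates(a, b):
        return all(x >= y for x, y in zip(a, b))

    class Replica:
        def __init__(self, n_replicas):
            self.clock = [0] * n_replicas
            self.pending = []

        def receive(self, update):
            self.pending.append(update)
            progress = True
            while progress:
                progress = False
                for upd in list(self.pending):
                    if dominates(self.clock, upd["deps"]):
                        self.clock[upd["origin"]] += 1
                        self.pending.remove(upd)
                        progress = True

    r = Replica(3)
    r.receive({"origin": 1, "deps": [0, 1, 0]})   # waits: dependency unseen
    r.receive({"origin": 1, "deps": [0, 0, 0]})   # applies, then unblocks the first
    print(r.clock, len(r.pending))                # [0, 2, 0] 0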

@Article{Eager:1993:CER,
  author =       "Derek L. Eager and John Jahorjan",
  title =        "Chores: Enhanced Run-Time Support for Shared-Memory
                 Parallel Computing",
  journal =      j-TOCS,
  volume =       "11",
  number =       "1",
  pages =        "1--32",
  month =        feb,
  year =         "1993",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-1/p1-eager/",
  abstract =     "Parallel computing is increasingly important in the
                 solution of large-scale numerical problems. The
                 difficulty of efficiently hand-coding parallelism, and
                 the limitations of parallelizing compilers, have
                 nonetheless restricted its use by scientific
                 programmers. In this paper we propose a new paradigm,
                 {\em chores}, for the run-time support of parallel
                 computing on shared-memory multiprocessors. We consider
                 specifically uniform memory access shared-memory
                 environments, although the chore paradigm should also
                 be appropriate for use within the clusters of a
                 large-scale nonuniform memory access machine. We argue
                 that chore systems attain both the high efficiency of
                 compiler approaches for the common case of data
                 parallelism, and the flexibility and performance of
                 user-level thread approaches for functional
                 parallelism. These benefits are achieved within a
                 single, simple conceptual model that almost entirely
                 relieves the programmer and compiler from concerns of
                 granularity, scheduling, and enforcement of
                 synchronization constraints. Measurements of a
                 prototype implementation demonstrate that the chore
                 model can be supported more efficiently than can
                 traditional approaches to either data or functional
                 parallelism alone.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; measurement; performance",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management. {\bf D.4.9} Software, OPERATING SYSTEMS,
                 Systems Programs and Utilities. {\bf D.4.7} Software,
                 OPERATING SYSTEMS, Organization and Design, Distributed
                 systems. {\bf C.3} Computer Systems Organization,
                 SPECIAL-PURPOSE AND APPLICATION-BASED SYSTEMS. {\bf
                 C.4} Computer Systems Organization, PERFORMANCE OF
                 SYSTEMS.",
}

@Article{Gheith:1993:CKS,
  author =       "Ahmed Gheith and Karsten Schwan",
  title =        "{CHAOS$^{\rm arc}$}: Kernel Support for Multiweight
                 Objects, Invocations, and Atomicity in Real-Time
                 Multiprocessor Applications",
  journal =      j-TOCS,
  volume =       "11",
  number =       "1",
  pages =        "33--72",
  month =        feb,
  year =         "1993",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-1/p33-gheith/",
  abstract =     "CHAOSarc is an object-based multiprocessor operating
                 system kernel that provides primitives with which
                 programmers may easily construct objects of differing
                 types and object invocations of differing semantics,
                 targeting multiprocessor systems, and real-time
                  applications. CHAOSarc can {\em guarantee\/}
                 desired performance and functionality levels of
                 selected computations in real-time applications. Such
                 guarantees can be made despite possible uncertainty in
                 execution environments by allowing programs to {\em
                 adapt\/} in performance and functionality to varying
                 operating conditions. This paper reviews the primitives
                 offered by CHAOSarc and demonstrates how the required
                 elements of the CHAOSarc real-time kernel are
                 constructed with those primitives.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; measurement; performance",
  subject =      "{\bf D.4.7} Software, OPERATING SYSTEMS, Organization
                 and Design, Real-time systems and embedded systems.
                 {\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management. {\bf J.7} Computer Applications, COMPUTERS
                 IN OTHER SYSTEMS, Real time. {\bf D.3.3} Software,
                 PROGRAMMING LANGUAGES, Language Constructs and
                 Features, Concurrent programming structures. {\bf C.3}
                 Computer Systems Organization, SPECIAL-PURPOSE AND
                 APPLICATION-BASED SYSTEMS, Real-time and embedded
                 systems. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance, Measurements.",
}

@Article{Kaashoek:1993:FIP,
  author =       "M. Frans Kaashoek and Robbert van Renesse and Hans van
                 Staveren and Andrew S. Tanenbaum",
  title =        "{FLIP}: An Internetwork Protocol for Supporting
                 Distributed Systems",
  journal =      j-TOCS,
  volume =       "11",
  number =       "1",
  pages =        "73--106",
  month =        feb,
  year =         "1993",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-1/p73-kaashoek/",
  abstract =     "Most modern network protocols give adequate support
                 for traditional applications such as file transfer and
                 remote login. Distributed applications, however, have
                 different requirements (e.g., efficient at-most-once
                 remote procedure call even in the face of processor
                 failures). Instead of using ad hoc protocols to meet
                 each of the new requirements, we have designed a new
                 protocol, called the Fast Local Internet Protocol
                 (FLIP), that provides a clean and simple integrated
                 approach to these new requirements. FLIP is an
                 unreliable message protocol that provides both
                 point-to-point communication and multicast
                 communication, and requires almost no network
                 management. Furthermore, by using FLIP we have
                 simplified higher-level protocols such as remote
                 procedure call and group communication, and enhanced
                 support for process migration and security. A prototype
                 implementation of FLIP has been built as part of the
                 new kernel for the Amoeba distributed operating system,
                 and is in daily use. Measurements of its performance
                 are presented.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; management; measurement; performance",
  subject =      "{\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols.
                 {\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems.
                 {\bf D.4.7} Software, OPERATING SYSTEMS, Organization
                 and Design, Distributed systems. {\bf D.4.0} Software,
                 OPERATING SYSTEMS, General, Amoeba. {\bf D.4.4}
                 Software, OPERATING SYSTEMS, Communications
                 Management.",
}

@Article{Gopalakrishnan:1993:DVR,
  author =       "Ganesh Gopalakrishnan and Richard Fujimoto",
  title =        "Design and Verification of the {Rollback Chip} using
                 {HOP}: a Case Study of Formal Methods Applied to
                 Hardware Design",
  journal =      j-TOCS,
  volume =       "11",
  number =       "2",
  pages =        "109--145",
  month =        may,
  year =         "1993",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-2/p109-gopalakrishnan/",
  abstract =     "The use of formal methods in hardware design improves
                 the quality of designs in many ways: it promotes better
                 understanding of the design; it permits systematic
                 design refinement through the discovery of invariants;
                 and it allows design verification (informal or formal).
                 In this paper we illustrate the use of formal methods
                 in the design of a custom hardware system called the
                 ``Rollback Chip'' (RBC), conducted using a simple
                 hardware design description language called ``HOP''. An
                 informal specification of the requirements of the RBC
                 is first given, followed by a {\em behavioral
                 description\/} of the RBC stating its {\em desired
                 behavior}. The behavioral description is refined into
                 progressively more efficient designs, terminating in a
                 {\em structural description}. Key refinement steps are
                 based on system invariants that are discovered during
                 the design, and proved correct during design
                 verification. The first step in design verification is
                 to apply a program called PARCOMP to {\em derive\/} a
                 behavioral description from the structural description
                 of the RBC. The derived behavior is then compared
                 against the desired behavior using equational
                 verification techniques. This work demonstrates that
                 formal methods can be fruitfully applied to a
                 nontrivial hardware design. It also illustrates the
                 particular advantages of our approach based on HOP and
                  PARCOMP. Last, but not least, it formally verifies
                 the RBC mechanism itself.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; languages; theory; verification",
  subject =      "{\bf B.7.2} Hardware, INTEGRATED CIRCUITS, Design
                 Aids, Verification. {\bf B.6.3} Hardware, LOGIC DESIGN,
                 Design Aids, Hardware description languages. {\bf
                 B.7.1} Hardware, INTEGRATED CIRCUITS, Types and Design
                 Styles. {\bf B.7.2} Hardware, INTEGRATED CIRCUITS,
                 Design Aids, Simulation.",
}

@Article{McCann:1993:DPA,
  author =       "Cathy McCann and Raj Vaswani and John Zahorjan",
  title =        "A Dynamic Processor Allocation Policy for
                 Multiprogrammed Shared-Memory Multiprocessors",
  journal =      j-TOCS,
  volume =       "11",
  number =       "2",
  pages =        "146--178",
  month =        may,
  year =         "1993",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-2/p146-mccann/",
  abstract =     "We propose and evaluate empirically the performance of
                 a dynamic processor-scheduling policy for
                 multiprogrammed shared-memory multiprocessors. The
                 policy is dynamic in that it reallocates processors
                 from one parallel job to another based on the currently
                 realized parallelism of those jobs. The policy is
                 suitable for implementation in production systems in
                 that: ---It interacts well with very efficient
                 user-level thread packages, leaving to them many
                 low-level thread operations that do not require kernel
                 intervention. ---It deals with thread blocking due to
                 user I/O and page faults. ---It ensures fairness in
                 delivering resources to jobs. ---Its performance,
                 measured in terms of average job response time, is
                 superior to that of previously proposed schedulers,
                 including those implemented in existing systems. It
                 provides good performance to very short, sequential
                 (e.g., interactive) requests. We have evaluated our
                 scheduler and compared it to alternatives using a set
                 of prototype implementations running on a Sequent
                 Symmetry multiprocessor. Using a number of parallel
                 applications with distinct qualitative behaviors, we
                 have both evaluated the policies according to the major
                 criterion of overall performance and examined a number
                 of more general policy issues, including the advantage
                 of ``space sharing'' over ``time sharing'' the
                 processors of a multiprocessor, and the importance of
                 cooperation between the kernel and the application in
                 reallocating processors between jobs. We have also
                  compared the policies according to other criteria
                  important in real implementations, in particular,
                  fairness and response time to short, sequential
                 requests. We conclude that a combination of performance
                 and implementation considerations makes a compelling
                 case for our dynamic scheduling policy.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; measurement; performance",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Scheduling. {\bf D.4.1} Software, OPERATING
                 SYSTEMS, Process Management,
                 Multiprocessing/multiprogramming/multitasking. {\bf
                 C.1.2} Computer Systems Organization, PROCESSOR
                 ARCHITECTURES, Multiple Data Stream Architectures
                 (Multiprocessors).",
}
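
The policy's central move, reallocating processors according to each job's
currently realized parallelism, can be sketched as a demand-capped
equipartition: processors are handed out one at a time to the most
under-served job that still has runnable threads. This is an illustrative
policy in the same spirit, not the paper's exact algorithm.

    # Demand-capped, roughly equal processor allocation across jobs.
    def allocate(total_procs, demands):
        alloc = {job: 0 for job in demands}
        for _ in range(total_procs):
            candidates = [j for j in demands if alloc[j] < demands[j]]
            if not candidates:
                break
            job = min(candidates, key=lambda j: alloc[j])
            alloc[job] += 1
        return alloc

    # Job B can only use 2 processors, so the surplus flows to A and C.
    print(allocate(8, {"A": 6, "B": 2, "C": 10}))   # {'A': 3, 'B': 2, 'C': 3}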

@Article{Thekkath:1993:LLL,
  author =       "Chandramohan A. Thekkath and Henry M. Levy",
  title =        "Limits to Low-Latency Communication on High-Speed
                 Networks",
  journal =      j-TOCS,
  volume =       "11",
  number =       "2",
  pages =        "179--203",
  month =        may,
  year =         "1993",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-2/p179-thekkath/",
  abstract =     "The throughput of local area networks is rapidly
                 increasing. For example, the bandwidth of new ATM
                 networks and FDDI token rings is an order of magnitude
                 greater than that of Ethernets. Other network
                 technologies promise a bandwidth increase of yet
                 another order of magnitude in several years. However,
                 in distributed systems, lowered latency rather than
                 increased throughput is often of primary concern. This
                 paper examines the system-level effects of newer
                 high-speed network technologies on low-latency,
                 cross-machine communications. To evaluate a number of
                 influences, both hardware and software, we designed and
                 implemented a new remote procedure call system targeted
                 at providing low latency. We then ported this system to
                 several hardware platforms (DECstation and
                 SPARCstation) with several different networks and
                 controllers (ATM, FDDI, and Ethernet). Comparing these
                 systems allows us to explore the performance impact of
                 alternative designs in the communication system with
                 respect to achieving low latency, e.g., the network,
                  the network controller, the host architecture and cache
                 system, and the kernel and user-level runtime software.
                 Our RPC system, which achieves substantially reduced
                  call times (170 $\mu$seconds on an ATM network using
                 DECstation 5000/200 hosts), allows us to isolate those
                 components of next-generation networks and controllers
                 that still stand in the way of low-latency
                 communication. We demonstrate that new-generation
                 processor technology and software design can reduce
                 small-packet RPC times to near network-imposed limits,
                 making network and controller design more crucial than
                 ever to achieving truly low-latency communication.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; measurement; performance",
  subject =      "{\bf D.4.4} Software, OPERATING SYSTEMS,
                 Communications Management, Network communication. {\bf
                 D.4.4} Software, OPERATING SYSTEMS, Communications
                 Management, Message sending. {\bf D.4.7} Software,
                 OPERATING SYSTEMS, Organization and Design, Distributed
                 systems. {\bf B.4.2} Hardware, INPUT/OUTPUT AND DATA
                 COMMUNICATIONS, Input/Output Devices, Channels and
                 controllers. {\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols,
                 Protocol architecture. {\bf C.2.1} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS, Network
                 Architecture and Design. {\bf C.2.4} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems, Distributed applications. {\bf
                 C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Network operating systems.",
}

@Article{Ammann:1993:DTG,
  author =       "Paul Ammann and Sushil Jajodia",
  title =        "Distributed Timestamp Generation in Planar Lattice
                 Networks",
  journal =      j-TOCS,
  volume =       "11",
  number =       "3",
  pages =        "205--225",
  month =        aug,
  year =         "1993",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-3/p205-ammann/",
  abstract =     "Timestamps are considered for distributed environments
                 in which information flow is restricted to one
                 direction through a planar lattice imposed on a
                 network. For applications in such networks, existing
                 timestamping algorithms require extension and
                 modification. For example, in secure environments,
                 typical timestamps provide a potential signaling
                 channel between incomparable levels. In hierarchical
                 databases, typical timestamps cause peripheral sites to
                 unnecessarily affect the behavior at main sites.
                 Algorithms are presented by which a network node may
                 generate and compare timestamps using timestamp
                 components maintained at dominated nodes in the
                 network. The comparison relation is shown to be acyclic
                 for timestamps produced by the generation algorithm. We
                 discuss ways to safely relax the requirement that the
                 network be a lattice. By example, we show how to modify
                 a simple nonplanar lattice so that the generation
                 algorithm can be applied. Uses of the timestamp
                 generation algorithm in the motivating applications are
                 outlined.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; security",
  subject =      "{\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Distributed applications. {\bf C.2.0} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS, General,
                 Security and protection (e.g., firewalls). {\bf G.2.m}
                 Mathematics of Computing, DISCRETE MATHEMATICS,
                 Miscellaneous. {\bf D.4.6} Software, OPERATING SYSTEMS,
                 Security and Protection, Information flow controls.
                 {\bf H.2.4} Information Systems, DATABASE MANAGEMENT,
                 Systems, Concurrency. {\bf H.2.4} Information Systems,
                 DATABASE MANAGEMENT, Systems, Distributed databases.",
}

@Article{Anderson:1993:MCM,
  author =       "David P. Anderson",
  title =        "Metascheduling for Continuous Media",
  journal =      j-TOCS,
  volume =       "11",
  number =       "3",
  pages =        "226--252",
  month =        aug,
  year =         "1993",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-3/p226-anderson/",
  abstract =     "Next-generation distributed systems will support {\em
                 continuous media\/} (digital audio and video) in the
                 same framework as other data. Many applications that
                 use continuous media need guaranteed end-to-end
                 performance (bounds on throughput and delay). To
                 reliably support these requirements, system components
                 such as CPU schedulers, networks, and file systems must
                 offer performance guarantees. A {\em metascheduler\/}
                 coordinates these components, negotiating end-to-end
                 guarantees on behalf of clients. The {\em CM-resource
                 model}, described in this paper, provides a basis for
                 such a metascheduler. It defines a workload
                 parameterization, an abstract interface to resources,
                 and an algorithm for reserving multiple resources. The
                 model uses an economic approach to dividing end-to-end
                 delay, and it allows system components to ``work
                 ahead,'' improving the performance of nonreal-time
                 workload.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; economics; performance",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Scheduling. {\bf C.2.4} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems, Distributed applications. {\bf
                 C.4} Computer Systems Organization, PERFORMANCE OF
                 SYSTEMS, Performance attributes. {\bf D.4.4} Software,
                 OPERATING SYSTEMS, Communications Management,
                 Buffering. {\bf D.4.4} Software, OPERATING SYSTEMS,
                 Communications Management, Network communication. {\bf
                 D.4.7} Software, OPERATING SYSTEMS, Organization and
                 Design, Real-time systems and embedded systems. {\bf
                 H.5.1} Information Systems, INFORMATION INTERFACES AND
                 PRESENTATION, Multimedia Information Systems, Audio
                 input/output. {\bf H.5.1} Information Systems,
                 INFORMATION INTERFACES AND PRESENTATION, Multimedia
                 Information Systems, Video (e.g., tape, disk, DVI).",
}

@Article{Lim:1993:WAS,
  author =       "Beng-Hong Lim and Anant Agarwal",
  title =        "Waiting Algorithms for Synchronization in Large-Scale
                 Multiprocessors",
  journal =      j-TOCS,
  volume =       "11",
  number =       "3",
  pages =        "253--294",
  month =        aug,
  year =         "1993",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-3/p253-lim/",
  abstract =     "Through analysis and experiments, this paper
                 investigates two-phase waiting algorithms to minimize
                 the cost of waiting for synchronization in large-scale
                 multiprocessors. In a two-phase algorithm, a thread
                 first waits by polling a synchronization variable. If
                 the cost of polling reaches a limit {\em Lpoll\/} and
                 further waiting is necessary, the thread is blocked,
                 incurring an additional fixed cost, {\em B}. The choice
                 of {\em Lpoll\/} is a critical determinant of the
                 performance of two-phase algorithms. We focus on
                 methods for statically determining {\em Lpoll\/}
                 because the run-time overhead of dynamically
                 determining {\em Lpoll\/} can be comparable to the cost
                 of blocking in large-scale multiprocessor systems with
                 lightweight threads. Our experiments show that {\em
                 always-block\/} ({\em Lpoll\/} = 0) is a good waiting
                 algorithm with performance that is usually close to the
                 best of the algorithms compared. We show that even
                 better performance can be achieved with a static choice
                 of {\em Lpoll\/} based on knowledge of likely wait-time
                 distributions. Motivated by the observation that
                 different synchronization types exhibit different
                 wait-time distributions, we prove that a static choice
                 of {\em Lpoll\/} can yield close to optimal on-line
                 performance against an adversary that is restricted to
                 choosing wait times from a fixed family of probability
                 distributions. This result allows us to make an optimal
                 static choice of {\em Lpoll\/} based on synchronization
                 type. For exponentially distributed wait times, we
                  prove that setting {\em Lpoll\/} $= \ln(e-1)B$ results in
                  a waiting cost that is no more than $e/(e-1)$ times the
                  cost of an optimal off-line algorithm. For uniformly
                  distributed wait times, we prove that setting {\em
                  Lpoll\/} $= \frac{1}{2}(\sqrt{5}-1)B$ results in a waiting
                  cost that is no more than $(\sqrt{5}+1)/2$ (the golden
                  ratio) times
                 the cost of an optimal off-line algorithm. Experimental
                 measurements of several parallel applications on the
                 Alewife multiprocessor simulator corroborate our
                 theoretical findings.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; experimentation; performance; theory",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Synchronization. {\bf D.4.1} Software,
                 OPERATING SYSTEMS, Process Management, Mutual
                 exclusion. {\bf C.4} Computer Systems Organization,
                 PERFORMANCE OF SYSTEMS. {\bf C.1.2} Computer Systems
                 Organization, PROCESSOR ARCHITECTURES, Multiple Data
                 Stream Architectures (Multiprocessors), Parallel
                 processors**. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance, Measurements. {\bf D.4.8} Software,
                 OPERATING SYSTEMS, Performance, Stochastic analysis.",
}
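
The cost model behind these bounds is easy to reproduce numerically: a
two-phase waiter pays t if the wait t ends within the polling limit Lpoll,
and Lpoll + B otherwise, while an offline algorithm that knows t pays
min(t, B). The Monte Carlo below uses exponential waits with mean B as a
sample point and compares the resulting ratio with the e/(e-1) bound; it is
an illustration of the model, not a reproduction of the paper's
experiments.

    import math, random

    def two_phase_cost(t, l_poll, b):
        return t if t <= l_poll else l_poll + b

    B = 1.0
    L_POLL = math.log(math.e - 1) * B      # static choice for exponential waits
    random.seed(1)
    waits = [random.expovariate(1.0 / B) for _ in range(100_000)]

    online = sum(two_phase_cost(t, L_POLL, B) for t in waits) / len(waits)
    offline = sum(min(t, B) for t in waits) / len(waits)
    print(round(online / offline, 3), "vs bound", round(math.e / (math.e - 1), 3))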

@Article{Hill:1993:CSM,
  author =       "Mark D. Hill and James R. Larus and Steven K.
                 Reinhardt and David A. Wood",
  title =        "Cooperative Shared Memory: Software and Hardware for
                 Scalable Multiprocessors",
  journal =      j-TOCS,
  volume =       "11",
  number =       "4",
  pages =        "300--318",
  month =        nov,
  year =         "1993",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-4/p300-hill/",
  abstract =     "We believe the paucity of massively parallel,
                 shared-memory machines follows from the lack of a
                 shared-memory programming performance model that can
                 inform programmers of the cost of operations (so they
                 can avoid expensive ones) and can tell hardware
                 designers which cases are common (so they can build
                 simple hardware to optimize them). Cooperative shared
                 memory, our approach to shared-memory design, addresses
                 this problem. Our initial implementation of cooperative
                 shared memory uses a simple programming model, called
                 Check-In/Check-Out (CICO), in conjunction with even
                 simpler hardware, called Dir1SW. In CICO, programs
                 bracket uses of shared data with a check\_in directive
                 terminating the expected use of the data. A cooperative
                 prefetch directive helps hide communication latency.
                 Dir1SW is a minimal directory protocol that adds little
                 complexity to message-passing hardware, but efficiently
                 supports programs written within the CICO model.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; measurement; performance",
  subject =      "{\bf B.3.2} Hardware, MEMORY STRUCTURES, Design
                 Styles, Shared memory. {\bf B.3.3} Hardware, MEMORY
                 STRUCTURES, Performance Analysis and Design Aids**,
                 Simulation**. {\bf C.1.2} Computer Systems
                 Organization, PROCESSOR ARCHITECTURES, Multiple Data
                 Stream Architectures (Multiprocessors). {\bf C.1.2}
                 Computer Systems Organization, PROCESSOR ARCHITECTURES,
                 Multiple Data Stream Architectures (Multiprocessors),
                 Parallel processors**. {\bf C.4} Computer Systems
                 Organization, PERFORMANCE OF SYSTEMS, Design studies.
                 {\bf C.4} Computer Systems Organization, PERFORMANCE OF
                 SYSTEMS, Modeling techniques. {\bf D.1.3} Software,
                 PROGRAMMING TECHNIQUES, Concurrent Programming,
                 Parallel programming.",
}
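
The bracketing discipline can be pictured with plain function calls
standing in for the CICO annotations: a check-out marks the expected first
use of a block of shared data (shared or exclusive), and a check-in ends
it, giving hardware and tools an expected sharing pattern to exploit. The
function names and no-op bodies below are illustrative, not the paper's
actual interface.

    # Stand-in annotations bracketing the expected use of shared data.
    def check_out(block, exclusive):      # expected first use of `block`
        pass

    def check_in(block):                  # expected use of `block` has ended
        pass

    def partial_sum(shared, lo, hi):
        check_out(shared, exclusive=False)    # read-only use expected
        total = sum(shared[lo:hi])
        check_in(shared)
        return total

    print(partial_sum(list(range(10)), 0, 10))   # 45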

@Article{Anderson:1993:HSS,
  author =       "Thomas E. Anderson and Susan S. Owicki and James B.
                 Saxe and Charles P. Thacker",
  title =        "High-speed Switch Scheduling for Local-Area Networks",
  journal =      j-TOCS,
  volume =       "11",
  number =       "4",
  pages =        "319--352",
  month =        nov,
  year =         "1993",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-4/p319-anderson/",
  abstract =     "Current technology trends make it possible to build
                 communication networks that can support
                 high-performance distributed computing. This paper
                 describes issues in the design of a prototype switch
                 for an arbitrary topology point-to-point network with
                 link speeds of up to 1 Gbit/s. The switch deals in
                 fixed-length ATM-style cells, which it can process at a
                 rate of 37 million cells per second. It provides high
                 bandwidth and low latency for datagram traffic. In
                 addition, it supports real-time traffic by providing
                 bandwidth reservations with guaranteed latency bounds.
                 The key to the switch's operation is a technique called
                 {\em parallel iterative matching}, which can quickly
                 identify a set of conflict-free cells for transmission
                 in a time slot. Bandwidth reservations are accommodated
                 in the switch by building a fixed schedule for
                 transporting cells from reserved flows across the
                 switch; parallel iterative matching can fill unused
                 slots with datagram traffic. Finally, we note that
                 parallel iterative matching may not allocate bandwidth
                 fairly among flows of datagram traffic. We describe a
                 technique called {\em statistical matching}, which can
                 be used to ensure fairness at the switch and to support
                 applications with rapidly changing needs for guaranteed
                 bandwidth.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; experimentation; performance",
  subject =      "{\bf C.2.1} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Architecture
                 and Design, Network communications. {\bf C.2.5}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Local and Wide-Area Networks, Access schemes.
                 {\bf G.2.2} Mathematics of Computing, DISCRETE
                 MATHEMATICS, Graph Theory, Graph algorithms.",
}

@Article{Li:1993:ANL,
  author =       "Wei Li and Keshav Pingali",
  title =        "Access Normalization: Loop Restructuring for {NUMA}
                 Computers",
  journal =      j-TOCS,
  volume =       "11",
  number =       "4",
  pages =        "353--375",
  month =        nov,
  year =         "1993",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-4/p353-li/",
  abstract =     "In scalable parallel machines, processors can make
                 local memory accesses much faster than they can make
                 remote memory accesses. Additionally, when a number of
                 remote accesses must be made, it is usually more
                 efficient to use block transfers of data rather than to
                 use many small messages. To run well on such machines,
                 software must exploit these features. We believe it is
                 too onerous for a programmer to do this by hand, so we
                 have been exploring the use of restructuring compiler
                 technology for this purpose. In this article, we start
                 with a language like HPF-Fortran with user-specified
                 data distribution and develop a systematic loop
                 transformation strategy called {\em access
                 normalization\/} that restructures loop nests to
                 exploit locality and block transfers. We demonstrate
                 the power of our techniques using routines from the
                 BLAS (Basic Linear Algebra Subprograms) library. An
                 important feature of our approach is that we model loop
                 transformation using {\em invertible\/} matrices and
                 integer lattice theory.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; experimentation; languages; performance",
  subject =      "{\bf C.1.2} Computer Systems Organization, PROCESSOR
                 ARCHITECTURES, Multiple Data Stream Architectures
                 (Multiprocessors), Multiple-instruction-stream,
                 multiple-data-stream processors (MIMD). {\bf D.1.3}
                 Software, PROGRAMMING TECHNIQUES, Concurrent
                 Programming, Parallel programming. {\bf D.3.4}
                 Software, PROGRAMMING LANGUAGES, Processors, Compilers.
                 {\bf D.3.4} Software, PROGRAMMING LANGUAGES,
                 Processors, Optimization. {\bf D.3.4} Software,
                 PROGRAMMING LANGUAGES, Processors, Code generation.",
}

@Article{Mahlke:1993:SSM,
  author =       "Scott A. Mahlke and William Y. Chen and Roger A.
                 Bringmann and Richard E. Hank and Wen-Mei W. Hwu and B.
                 Ramakrishna Rau and Michael S. Schlansker",
  title =        "Sentinel Scheduling: a Model for Compiler-Controlled
                 Speculative Execution",
  journal =      j-TOCS,
  volume =       "11",
  number =       "4",
  pages =        "376--408",
  month =        nov,
  year =         "1993",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-4/p376-mahlke/",
  abstract =     "Speculative execution is an important source of
                 parallelism for VLIW and superscalar processors. A
                 serious challenge with compiler-controlled speculative
                 execution is to efficiently handle exceptions for
                 speculative instructions. In this article, a set of
                 architectural features and compile-time scheduling
                 support collectively referred to as {\em sentinel
                 scheduling\/} is introduced. Sentinel scheduling
                 provides an effective framework for both
                 compiler-controlled speculative execution and exception
                 handling. All program exceptions are accurately
                 detected and reported in a timely manner with sentinel
                 scheduling. Recovery from exceptions is also ensured
                 with the model. Experimental results show the
                 effectiveness of sentinel scheduling for exploiting
                  instruction-level parallelism and the overhead associated
                 with exception handling.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; performance",
  subject =      "{\bf B.3.2} Hardware, MEMORY STRUCTURES, Design
                 Styles, Associative memories. {\bf C.0} Computer
                 Systems Organization, GENERAL, Hardware/software
                 interfaces. {\bf C.0} Computer Systems Organization,
                 GENERAL, Instruction set design. {\bf C.0} Computer
                 Systems Organization, GENERAL, System architectures.
                 {\bf C.1.1} Computer Systems Organization, PROCESSOR
                 ARCHITECTURES, Single Data Stream Architectures,
                 Pipeline processors**. {\bf D.2.5} Software, SOFTWARE
                 ENGINEERING, Testing and Debugging, Error handling and
                 recovery. {\bf D.3.4} Software, PROGRAMMING LANGUAGES,
                 Processors, Code generation. {\bf D.3.4} Software,
                 PROGRAMMING LANGUAGES, Processors, Compilers. {\bf
                 D.3.4} Software, PROGRAMMING LANGUAGES, Processors,
                 Optimization.",
}

@Article{Wobber:1994:ATO,
  author =       "Edward Wobber and Mart{\'\i}n Abadi and Michael
                 Burrows and Butler Lampson",
  title =        "Authentication in the {Taos} Operating System",
  journal =      j-TOCS,
  volume =       "12",
  number =       "1",
  pages =        "3--32",
  month =        feb,
  year =         "1994",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1994-12-1/p3-wobber/",
  abstract =     "We describe a design for security in a distributed
                 system and its implementation. In our design,
                 applications gain access to security services through a
                 narrow interface. This interface provides a notion of
                 identity that includes simple principals, groups,
                 roles, and delegations. A new operating system
                 component manages principals, credentials, and secure
                 channels. It checks credentials according to the formal
                 rules of a logic of authentication. Our implementation
                 is efficient enough to support a substantial user
                 community.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; security; theory",
  subject =      "{\bf D.4.6} Software, OPERATING SYSTEMS, Security and
                 Protection, Authentication. {\bf C.2.4} Computer
                 Systems Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems. {\bf D.4.6} Software, OPERATING
                 SYSTEMS, Security and Protection, Access controls.",
}

@Article{Satyanarayanan:1994:LRV,
  author =       "M. Satyanarayanan and Henry H. Mashburn and Puneet
                 Kumar and David C. Steere and James J. Kistler",
  title =        "Lightweight Recoverable Virtual Memory",
  journal =      j-TOCS,
  volume =       "12",
  number =       "1",
  pages =        "33--57",
  month =        feb,
  year =         "1994",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1994-12-1/p33-satyanarayanan/",
  abstract =     "{\em Recoverable virtual memory\/}refers to regions of
                 a virtual address space on which transactional
                 guarantees are offered. This article describes RVM, an
                 efficient, portable, and easily used implementation of
                 recoverable virtual memory for Unix environments. A
                 unique characteristic of RVM is that it allows
                 independent control over the transactional properties
                 of atomicity, permanence, and serializability. This
                 leads to considerable flexibility in the use of RVM,
                 potentially enlarging the range of applications that
                 can benefit from transactions. It also simplifies the
                 layering of functionality such as nesting and
                 distribution. The article shows that RVM performs well
                 over its intended range of usage even though it does
                 not benefit from specialized operating system support.
                 It also demonstrates the importance of intra- and
                 inter-transaction optimizations.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; measurement; performance;
                 reliability",
  subject =      "{\bf D.4.2} Software, OPERATING SYSTEMS, Storage
                 Management, Virtual memory. {\bf D.4.5} Software,
                 OPERATING SYSTEMS, Reliability, Fault-tolerance. {\bf
                 D.4.8} Software, OPERATING SYSTEMS, Performance,
                 Measurements. {\bf H.2.2} Information Systems, DATABASE
                 MANAGEMENT, Physical Design, Recovery and restart. {\bf
                 H.2.4} Information Systems, DATABASE MANAGEMENT,
                 Systems, Transaction processing.",
}

@Article{Heidemann:1994:FSD,
  author =       "John S. Heidemann and Gerald J. Popek",
  title =        "File-system Development with Stackable Layers",
  journal =      j-TOCS,
  volume =       "12",
  number =       "1",
  pages =        "58--89",
  month =        feb,
  year =         "1994",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1994-12-1/p58-heidemann/",
  abstract =     "Filing services have experienced a number of
                 innovations in recent years, but many of these
                 promising ideas have failed to enter into broad use.
                 One reason is that current filing environments present
                 several barriers to new development. For example, file
                 systems today typically stand alone instead of building
                 on the work of others, and support of new filing
                 services often requires changes that invalidate
                 existing work. Stackable file-system design addresses
                 these issues in several ways. Complex filing services
                 are constructed from layer ``building blocks,'' each of
                 which may be provided by independent parties. There are
                 no syntactic constraints to layer order, and layers can
                 occupy different address spaces, allowing very flexible
                 layer configuration. Independent layer evolution and
                 development are supported by an extensible interface
                 bounding each layer. This paper discusses stackable
                 layering in detail and presents design techniques it
                 enables. We describe an implementation providing these
                 facilities that exhibits very high performance. By
                 lowering barriers to new filing design, stackable
                 layering offers the potential of broad third-party
                 file-system development not feasible today.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; performance",
  subject =      "{\bf D.4.3} Software, OPERATING SYSTEMS, File Systems
                 Management, Maintenance**. {\bf D.4.7} Software,
                 OPERATING SYSTEMS, Organization and Design,
                 Hierarchical design**. {\bf D.4.8} Software, OPERATING
                 SYSTEMS, Performance, Measurements.",
}

@Article{Attiya:1994:SCV,
  author =       "Hagit Attiya and Jennifer L. Welch",
  title =        "Sequential Consistency versus Linearizability",
  journal =      j-TOCS,
  volume =       "12",
  number =       "2",
  pages =        "91--122",
  month =        may,
  year =         "1994",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1994-12-2/p91-attiya/",
  abstract =     "The power of two well-known consistency conditions for
                 shared-memory multiprocessors, {\em sequential
                 consistency\/} and {\em linearizability}, is compared.
                 The cost measure studied is the worst-case response
                 time in distributed implementations of virtual shared
                 memory supporting one of the two conditions. Three
                 types of shared-memory objects are considered:
                 read/write objects, FIFO queues, and stacks. If clocks
                 are only approximately synchronized (or do not exist),
                 then for all three object types it is shown that
                 linearizability is more expensive than sequential
                 consistency. We show that, for all three data types,
                 the worst-case response time is very sensitive to the
                 assumptions that are made about the timing information
                 available to the system. Under the strong assumption
                 that processes have perfectly synchronized clocks, it
                 is shown that sequential consistency and
                 linearizability are equally costly. We present upper
                 bounds for linearizability and matching lower bounds
                 for sequential consistency. The upper bounds are shown
                 by presenting algorithms that use atomic broadcast in a
                 modular fashion. The lower-bound proofs for the
                 approximate case use the technique of ``shifting,''
                 first introduced for studying the clock synchronization
                 problem.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; performance",
  subject =      "{\bf D.1.3} Software, PROGRAMMING TECHNIQUES,
                 Concurrent Programming, Distributed programming. {\bf
                 D.3.3} Software, PROGRAMMING LANGUAGES, Language
                 Constructs and Features, Concurrent programming
                 structures. {\bf D.4.2} Software, OPERATING SYSTEMS,
                 Storage Management, Distributed memories. {\bf F.1.2}
                 Theory of Computation, COMPUTATION BY ABSTRACT DEVICES,
                 Modes of Computation, Parallelism and concurrency. {\bf
                 H.2.4} Information Systems, DATABASE MANAGEMENT,
                 Systems, Distributed databases. {\bf H.2.4} Information
                 Systems, DATABASE MANAGEMENT, Systems, Concurrency.",
}

@Article{Mann:1994:CDF,
  author =       "Timothy Mann and Andrew Birrell and Andy Hisgen and
                 Charles Jerian and Garret Swart",
  title =        "A Coherent Distributed File Cache with Directory
                 Write-Behind",
  journal =      j-TOCS,
  volume =       "12",
  number =       "2",
  pages =        "123--164",
  month =        may,
  year =         "1994",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1994-12-2/p123-mann/",
  abstract =     "Extensive caching is a key feature of the Echo
                 distributed file system. Echo client machines maintain
                 coherent caches of file and directory data and
                 properties, with write-behind (delayed write-back) of
                 {\em all\/} cached information. Echo specifies ordering
                 constraints on this write-behind, enabling applications
                 to store and maintain consistent data structures in the
                 file system even when crashes or network faults prevent
                 some writes from being completed. In this paper we
                 describe the Echo cache's coherence and ordering
                 semantics, show how they can improve the performance
                  and consistency of applications, and explain how they are
                 implemented. We also discuss the general problem of
                 reliably notifying applications and users when
                 write-behind is lost; we addressed this problem as part
                 of the Echo design, but did not find a fully
                 satisfactory solution.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; measurement; performance;
                 reliability; security",
  subject =      "{\bf D.4.3} Software, OPERATING SYSTEMS, File Systems
                 Management, Distributed file systems.",
}

@Article{Uhlig:1994:DTS,
  author =       "Richard Uhlig and David Nagle and Tim Stanley and
                 Trevor Mudge and Stuart Sechrest and Richard Brown",
  title =        "Design Tradeoffs for Software-Managed {TLBs}",
  journal =      j-TOCS,
  volume =       "12",
  number =       "3",
  pages =        "175--205",
  month =        aug,
  year =         "1994",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1994-12-3/p175-uhlig/",
  abstract =     "An increasing number of architectures provide virtual
                 memory support through software-managed TLBs. However,
                 software management can impose considerable penalties
                 that are highly dependent on the operating system's
                 structure and its use of virtual memory. This work
                 explores software-managed TLB design tradeoffs and
                 their interaction with a range of monolithic and
                 microkernel operating systems. Through hardware
                 monitoring and simulation, we explore TLB performance
                 for benchmarks running on a MIPS R2000-based
                 workstation running Ultrix, OSF/1, and three versions
                 of Mach 3.0.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; performance",
  subject =      "{\bf D.4.2} Software, OPERATING SYSTEMS, Storage
                 Management, Virtual memory. {\bf C.4} Computer Systems
                 Organization, PERFORMANCE OF SYSTEMS, Measurement
                 techniques. {\bf B.3.2} Hardware, MEMORY STRUCTURES,
                 Design Styles, Associative memories. {\bf B.3.2}
                 Hardware, MEMORY STRUCTURES, Design Styles, Cache
                 memories. {\bf B.3.2} Hardware, MEMORY STRUCTURES,
                 Design Styles, Virtual memory. {\bf B.3.3} Hardware,
                 MEMORY STRUCTURES, Performance Analysis and Design
                 Aids**, Simulation**. {\bf D.4.8} Software, OPERATING
                 SYSTEMS, Performance, Measurements.",
}

@Article{Stodolsky:1994:PLD,
  author =       "Daniel Stodolsky and Mark Holland and William V.
                 {Courtright II} and Garth A. Gibson",
  title =        "Parity Logging Disk Arrays",
  journal =      j-TOCS,
  volume =       "12",
  number =       "3",
  pages =        "206--235",
  month =        aug,
  year =         "1994",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1994-12-3/p206-stodolsky/",
  abstract =     "Parity-encoded redundant disk arrays provide highly
                 reliable, cost-effective secondary storage with high
                 performance for reads and large writes. Their
                 performance on small writes, however, is much worse
                 than mirrored disks---the traditional, highly reliable,
                 but expensive organization for secondary storage.
                 Unfortunately, small writes are a substantial portion
                 of the I/O workload of many important, demanding
                 applications such as on-line transaction processing.
                 This paper presents {\em parity logging}, a novel
                 solution to the small-write problem for redundant disk
                 arrays. Parity logging applies journalling techniques
                 to reduce substantially the cost of small writes. We
                 provide detailed models of parity logging and competing
                 schemes---mirroring, floating storage, and RAID level
                 5---and verify these models by simulation. Parity
                 logging provides performance competitive with
                 mirroring, but with capacity overhead close to the
                 minimum offered by RAID level 5. Finally, parity
                 logging can exploit data caching more effectively than
                 all three alternative approaches.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; performance; reliability",
  subject =      "{\bf B.4.2} Hardware, INPUT/OUTPUT AND DATA
                 COMMUNICATIONS, Input/Output Devices, Channels and
                 controllers. {\bf B.4.5} Hardware, INPUT/OUTPUT AND
                 DATA COMMUNICATIONS, Reliability, Testing, and
                 Fault-Tolerance**, Redundant design**.",
}

@Article{Cao:1994:TPR,
  author =       "Pei Cao and Swee Boon Lin and Shivakumar Venkataraman
                 and John Wilkes",
  title =        "The {TickerTAIP} Parallel {RAID} Architecture",
  journal =      j-TOCS,
  volume =       "12",
  number =       "3",
  pages =        "236--269",
  month =        aug,
  year =         "1994",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1994-12-3/p236-cao/",
  abstract =     "Traditional disk arrays have a centralized
                 architecture, with a single controller through which
                 all requests flow. Such a controller is a single point
                 of failure, and its performance limits the maximum
                 number of disks to which the array can scale. We
                 describe TickerTAIP, a parallel architecture for disk
                 arrays that distributes the controller functions across
                 several loosely coupled processors. The result is
                 better scalability, fault tolerance, and flexibility.
                 This article presents the TickerTAIP architecture and
                 an evaluation of its behavior. We demonstrate the
                 feasibility by a working example, describe a family of
                 distributed algorithms for calculating RAID parity,
                 discuss techniques for establishing request atomicity,
                 sequencing, and recovery, and evaluate the performance
                 of the TickerTAIP design in both absolute terms and by
                 comparison to a centralized RAID implementation. We
                 also analyze the effects of including disk-level
                 request-scheduling algorithms inside the array. We
                  conclude that the TickerTAIP architectural approach is
                 feasible, useful, and effective.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; performance; reliability",
  subject =      "{\bf D.4.7} Software, OPERATING SYSTEMS, Organization
                 and Design, Distributed systems. {\bf B.4.2} Hardware,
                 INPUT/OUTPUT AND DATA COMMUNICATIONS, Input/Output
                 Devices, Channels and controllers. {\bf D.1.3}
                 Software, PROGRAMMING TECHNIQUES, Concurrent
                 Programming, Parallel programming. {\bf D.4.2}
                 Software, OPERATING SYSTEMS, Storage Management,
                 Secondary storage. {\bf D.4.7} Software, OPERATING
                 SYSTEMS, Organization and Design, Distributed
                 systems.",
}

@Article{Chase:1994:SPS,
  author =       "Jeffrey S. Chase and Henry M. Levy and Michael J.
                 Feeley and Edward D. Lazowska",
  title =        "Sharing and Protection in a Single-Address-Space
                 Operating System",
  journal =      j-TOCS,
  volume =       "12",
  number =       "4",
  pages =        "271--307",
  month =        nov,
  year =         "1994",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1994-12-4/p271-chase/",
  abstract =     "This article explores memory sharing and protection
                 support in Opal, a single-address-space operating
                 system designed for wide-address (64-bit)
                 architectures. Opal threads execute within protection
                 domains in a single shared virtual address space.
                 Sharing is simplified, because addresses are context
                 independent. There is no loss of protection, because
                 addressability and access are independent; the right to
                 access a segment is determined by the protection domain
                 in which a thread executes. This model enables
                  beneficial code- and data-sharing patterns that are
                 currently prohibitive, due in part to the inherent
                 restrictions of multiple address spaces, and in part to
                 Unix programming style. We have designed and
                 implemented an Opal prototype using the Mach 3.0
                 microkernel as a base. Our implementation demonstrates
                 how a single-address-space structure can be supported
                 alongside of other environments on a modern microkernel
                 operating system, using modern wide-address
                 architectures. This article justifies the Opal model
                 and its goals for sharing and protection, presents the
                 system and its abstractions, describes the prototype
                 implementation, and reports experience with integrated
                 applications.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; measurement; performance",
  subject =      "{\bf D.4.2} Software, OPERATING SYSTEMS, Storage
                 Management. {\bf C.1.3} Computer Systems Organization,
                 PROCESSOR ARCHITECTURES, Other Architecture Styles,
                 Capability architectures**. {\bf D.3.3} Software,
                 PROGRAMMING LANGUAGES, Language Constructs and
                 Features, Modules, packages. {\bf D.4.4} Software,
                 OPERATING SYSTEMS, Communications Management. {\bf
                 D.4.6} Software, OPERATING SYSTEMS, Security and
                 Protection, Access controls. {\bf D.4.6} Software,
                 OPERATING SYSTEMS, Security and Protection, Information
                 flow controls. {\bf D.4.7} Software, OPERATING SYSTEMS,
                 Organization and Design. {\bf D.4.8} Software,
                 OPERATING SYSTEMS, Performance, Measurements. {\bf E.1}
                 Data, DATA STRUCTURES. {\bf E.2} Data, DATA STORAGE
                 REPRESENTATIONS.",
}

@Article{Chen:1994:NAP,
  author =       "Peter M. Chen and David A. Patterson",
  title =        "A New Approach to {I/O} Performance Evaluation:
                 Self-Scaling {I/O} Benchmarks, Predicted {I/O}
                 Performance",
  journal =      j-TOCS,
  volume =       "12",
  number =       "4",
  pages =        "308--339",
  month =        nov,
  year =         "1994",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1994-12-4/p308-chen/",
  abstract =     "Current I/O benchmarks suffer from several chronic
                 problems: they quickly become obsolete; they do not
                 stress the I/O system; and they do not help much in
                 understanding I/O system performance. We propose a new
                 approach to I/O performance analysis. First, we propose
                 a self-scaling benchmark that dynamically adjusts
                 aspects of its workload according to the performance
                 characteristic of the system being measured. By doing
                 so, the benchmark automatically scales across current
                 and future systems. The evaluation aids in
                 understanding system performance by reporting how
                 performance varies according to each of five workload
                 parameters. Second, we propose predicted performance, a
                 technique for using the results from the self-scaling
                 evaluation to estimate quickly the performance for
                 workloads that have not been measured. We show that
                 this technique yields reasonably accurate performance
                 estimates and argue that this method gives a far more
                 accurate comparative performance evaluation than
                 traditional single-point benchmarks. We apply our new
                 evaluation technique by measuring a SPARCstation 1+
                 with one SCSI disk, an HP 730 with one SCSI-II disk, a
                 DECstation 5000/200 running the Sprite LFS operating
                 system with a three-disk disk array, a Convex C240
                 minisupercomputer with a four-disk disk array, and a
                 Solbourne 5E/905 fileserver with a two-disk disk
                 array.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "performance",
  subject =      "{\bf D.4.8} Software, OPERATING SYSTEMS, Performance,
                 Measurements. {\bf K.6.2} Computing Milieux, MANAGEMENT
                 OF COMPUTING AND INFORMATION SYSTEMS, Installation
                 Management, Benchmarks. {\bf C.4} Computer Systems
                 Organization, PERFORMANCE OF SYSTEMS. {\bf D.2.8}
                 Software, SOFTWARE ENGINEERING, Metrics, Performance
                 measures. {\bf D.4.4} Software, OPERATING SYSTEMS,
                 Communications Management, Input/output.",
}

@Article{Reiter:1994:SAF,
  author =       "Michael K. Reiter and Kenneth P. Birman and Robbert
                 van Renesse",
  title =        "A Security Architecture for Fault-Tolerant Systems",
  journal =      j-TOCS,
  volume =       "12",
  number =       "4",
  pages =        "340--371",
  month =        nov,
  year =         "1994",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1994-12-4/p340-reiter/",
  abstract =     "Process groups are a common abstraction for
                 fault-tolerant computing in distributed systems. We
                 present a security architecture that extends the
                 process group into a security abstraction. Integral
                 parts of this architecture are services that securely
                 and fault tolerantly support cryptographic key
                 distribution. Using replication only when necessary,
                 and introducing novel replication techniques when it
                 was necessary, we have constructed these services both
                 to be easily defensible against attack and to permit
                 key distribution despite the transient unavailability
                 of a substantial number of servers. We detail the
                 design and implementation of these services and the
                 secure process group abstraction they support. We also
                 give preliminary performance figures for some common
                 group operations.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "reliability; security",
  subject =      "{\bf C.2.0} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, General, Security and
                 protection (e.g., firewalls). {\bf C.2.4} Computer
                 Systems Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems. {\bf D.4.5} Software, OPERATING
                 SYSTEMS, Reliability, Fault-tolerance. {\bf D.4.6}
                 Software, OPERATING SYSTEMS, Security and Protection,
                 Authentication. {\bf D.4.6} Software, OPERATING
                 SYSTEMS, Security and Protection, Cryptographic
                 controls. {\bf K.6.5} Computing Milieux, MANAGEMENT OF
                 COMPUTING AND INFORMATION SYSTEMS, Security and
                 Protection, Authentication. {\bf E.3} Data, DATA
                 ENCRYPTION.",
}

@Article{Bates:1995:DHD,
  author =       "Peter C. Bates",
  title =        "Debugging Heterogeneous Distributed Systems Using
                 Event-Based Models of Behavior",
  journal =      j-TOCS,
  volume =       "13",
  number =       "1",
  pages =        "1--31",
  month =        feb,
  year =         "1995",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1995-13-1/p1-bates/",
  abstract =     "We describe a high-level debugging approach,
                 Event-Based Behavioral Abstraction (EBBA), in which
                 debugging is treated as a process of creating models of
                 expected program behaviors and comparing these to the
                 actual behaviors exhibited by the program. The use of
                 EBBA techniques can enhance debugging-tool
                 transparency, reduce latency and uncertainty for
                 fundamental debugging activities, and accommodate
                 diverse, heterogeneous architectures. Using events and
                 behavior models as a basic mechanism provides a uniform
                 view of heterogeneous systems and enables analysis to
                 be performed in well-defined ways. Their use also
                 enables EBBA users to extend and reuse knowledge gained
                 in solving previous problems to new situations. We
                 describe our behavior-modeling algorithm that matches
                 actual behavior to models and automates many behavior
                 analysis steps. The algorithm matches behavior in as
                 many ways as possible and resolves these to return the
                 best match to the user. It deals readily with partial
                 behavior matches and incomplete information. In
                 particular, we describe a tool set we have built. The
                 tool set has been used to investigate the behavior of a
                 wide range of programs. The tools are modular and can
                 be distributed readily throughout a system.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; reliability",
  subject =      "{\bf D.2.5} Software, SOFTWARE ENGINEERING, Testing
                 and Debugging, Debugging aids. {\bf C.2.3} Computer
                 Systems Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Network Operations, Network monitoring. {\bf C.2.4}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Distributed Systems, Distributed
                 applications. {\bf D.2.2} Software, SOFTWARE
                 ENGINEERING, Design Tools and Techniques, Programmer
                 workbench**. {\bf D.2.5} Software, SOFTWARE
                 ENGINEERING, Testing and Debugging, Monitors. {\bf
                 D.2.5} Software, SOFTWARE ENGINEERING, Testing and
                 Debugging, Tracing.",
}

@Article{Sugumar:1995:SAC,
  author =       "Rabin A. Sugumar and Santosh G. Abraham",
  title =        "Set-Associative Cache Simulation Using Generalized
                 Binomial Trees",
  journal =      j-TOCS,
  volume =       "13",
  number =       "1",
  pages =        "32--56",
  month =        feb,
  year =         "1995",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1995-13-1/p32-sugumar/",
  abstract =     "Set-associative caches are widely used in CPU memory
                 hierarchies, I/O subsystems, and file systems to reduce
                 average access times. This article proposes an
                 efficient simulation technique for simulating a group
                 of set-associative caches in a single pass through the
                 address trace, where all caches have the same line size
                 but varying associativities and varying number of sets.
                 The article also introduces a generalization of the
                 ordinary binomial tree and presents a representation of
                 caches in this class using the Generalized Binomial
                 Tree (gbt). The tree representation permits efficient
                 search and update of the caches. Theoretically, the new
                 algorithm, GBF\_LS, based on the gbt structure, always
                 takes fewer comparisons than the two earlier algorithms
                 for the same class of caches: all-associativity and
                 generalized forest simulation. Experimentally, the new
                 algorithm shows performance gains in the range of 1.2
                 to 3.8 over the earlier algorithms on address traces of
                 the SPEC benchmarks. A related algorithm for simulating
                 multiple alternative direct-mapped caches with fixed
                 cache size, but varying line size, is also presented.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; measurement; performance",
  subject =      "{\bf B.3.3} Hardware, MEMORY STRUCTURES, Performance
                 Analysis and Design Aids**, Simulation**. {\bf E.1}
                 Data, DATA STRUCTURES, Trees. {\bf I.6.8} Computing
                 Methodologies, SIMULATION AND MODELING, Types of
                 Simulation. {\bf B.3.2} Hardware, MEMORY STRUCTURES,
                 Design Styles, Cache memories.",
}

@Article{Tullsen:1995:ECP,
  author =       "Dean M. Tullsen and Susan J. Eggers",
  title =        "Effective Cache Prefetching on Bus-Based
                 Multiprocessors",
  journal =      j-TOCS,
  volume =       "13",
  number =       "1",
  pages =        "57--88",
  month =        feb,
  year =         "1995",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1995-13-1/p57-tullsen/",
  abstract =     "Compiler-directed cache prefetching has the potential
                 to hide much of the high memory latency seen by current
                 and future high-performance processors. However,
                 prefetching is not without costs, particularly on a
                 shared-memory multiprocessor. Prefetching can
                 negatively affect bus utilization, overall cache miss
                 rates, memory latencies and data sharing. We simulate
                 the effects of a compiler-directed prefetching
                 algorithm, running on a range of bus-based
                 multiprocessors. We show that, despite a high memory
                 latency, this architecture does not necessarily support
                 prefetching well, in some cases actually causing
                 performance degradations. We pinpoint several problems
                 with prefetching on a shared-memory architecture
                 (additional conflict misses, no reduction in the
                 data-sharing traffic and associated latencies, a
                 multiprocessor's greater sensitivity to memory
                 utilization and the sensitivity of the cache hit rate
                 to prefetch distance) and measure their effect on
                 performance. We then solve those problems through
                 architectural techniques and heuristics for prefetching
                 that could be easily incorporated into a compiler: (1)
                 victim caching, which eliminates most of the cache
                 conflict misses caused by prefetching in a
                 direct-mapped cache, (2) special prefetch algorithms
                 for shared data, which significantly improve the
                 ability of our basic prefetching algorithm to prefetch
                 individual misses, and (3) compiler-based shared-data
                 restructuring, which eliminates many of the
                 invalidation misses the basic prefetching algorithm
                 does not predict. The combined effect of these
                 improvements is to make prefetching effective over a
                 much wider range of memory architectures.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; performance",
  subject =      "{\bf B.3.2} Hardware, MEMORY STRUCTURES, Design
                 Styles, Cache memories. {\bf B.3.2} Hardware, MEMORY
                 STRUCTURES, Design Styles, Shared memory. {\bf C.1.2}
                 Computer Systems Organization, PROCESSOR ARCHITECTURES,
                 Multiple Data Stream Architectures (Multiprocessors).",
}

@Article{Akyurek:1995:ABR,
  author =       "Sedat Aky{\"u}rek and Kenneth Salem",
  title =        "Adaptive Block Rearrangement",
  journal =      j-TOCS,
  volume =       "13",
  number =       "2",
  pages =        "89--121",
  month =        may,
  year =         "1995",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1995-13-2/p89-akyurek/",
  abstract =     "An adaptive technique for reducing disk seek times is
                 described. The technique copies frequently referenced
                 blocks from their original locations to reserved space
                 near the middle of the disk. Reference frequencies need
                 not be known in advance. Instead, they are estimated by
                 monitoring the stream of arriving requests.
                 Trace-driven simulations show that seek times can be
                 cut substantially by copying only a small number of
                 blocks using this technique. The technique has been
                 implemented by modifying a UNIX device driver. No
                 modifications are required to the file system that uses
                 the driver.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; experimentation; performance",
  subject =      "{\bf D.4.2} Software, OPERATING SYSTEMS, Storage
                 Management. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance, Measurements. {\bf H.3.2} Information
                 Systems, INFORMATION STORAGE AND RETRIEVAL, Information
                 Storage. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance, Modeling and prediction. {\bf D.4.8}
                 Software, OPERATING SYSTEMS, Performance, Simulation.
                 {\bf D.4.8} Software, OPERATING SYSTEMS, Performance.",
}

@Article{Hosseini-Khayat:1995:SEB,
  author =       "Saied Hosseini-Khayat and Andreas D. Bovopoulos",
  title =        "A Simple and Efficient Bus Management Scheme That
                 Supports Continuous Streams",
  journal =      j-TOCS,
  volume =       "13",
  number =       "2",
  pages =        "122--140",
  month =        may,
  year =         "1995",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1995-13-2/p122-hosseini-khayat/",
  abstract =     "An efficient bandwidth management and access
                 arbitration scheme for an I/O bus in a multimedia
                 workstation is presented. It assumes that a multimedia
                 workstation consists of a number of processing modules
                 which are interconnected by a packet bus. The scheme is
                 efficient in the sense that it allows the bus to
                 support both continuous media transfers and regular
                 random transactions in such a way that continuous
                 streams can meet their real-time constraints
                 independently of random traffic, and random traffic is
                 not delayed significantly by continuous traffic except
                 when traffic load is very high. Implementation
                 guidelines are provided to show that the scheme is
                 practical. Finally, the performance of this scheme is
                 compared with alternative solutions through
                 simulation.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "experimentation; performance",
  subject =      "{\bf B.4.3} Hardware, INPUT/OUTPUT AND DATA
                 COMMUNICATIONS, Interconnections (Subsystems),
                 Topology. {\bf B.4.4} Hardware, INPUT/OUTPUT AND DATA
                 COMMUNICATIONS, Performance Analysis and Design Aids**,
                 Simulation**. {\bf H.5.1} Information Systems,
                 INFORMATION INTERFACES AND PRESENTATION, Multimedia
                 Information Systems. {\bf C.0} Computer Systems
                 Organization, GENERAL, System architectures.",
}

@Article{Singh:1995:IHB,
  author =       "Jaswinder Pal Singh and John L. Hennessy and Anoop
                 Gupta",
  title =        "Implications of Hierarchical {$N$}-Body Methods for
                 Multiprocessor Architectures",
  journal =      j-TOCS,
  volume =       "13",
  number =       "2",
  pages =        "141--202",
  month =        may,
  year =         "1995",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1995-13-2/p141-singh/",
  abstract =     "To design effective large-scale multiprocessors,
                 designers need to understand the characteristics of the
                 applications that will use the machines. Application
                 characteristics of particular interest include the
                 amount of communication relative to computation, the
                 structure of the communication, and the local cache and
                 memory requirements, as well as how these
                 characteristics scale with larger problems and
                 machines. One important class of applications is based
                 on hierarchical N-body methods, which are used to solve
                 a wide range of scientific and engineering problems
                 efficiently. Important characteristics of these methods
                 include the nonuniform and dynamically changing nature
                 of the domains to which they are applied, and their use
                 of long-range, irregular communication. This article
                 examines the key architectural implications of
                 representative applications that use the two dominant
                 hierarchical N-body methods: the Barnes--Hut Method and
                 the Fast Multipole Method. We first show that
                 exploiting temporal locality on accesses to
                 communicated data is critical to obtaining good
                 performance on these applications and then argue that
                 coherent caches on shared-address-space machines
                 exploit this locality both automatically and very
                 effectively. Next, we examine the implications of
                 scaling the applications to run on larger machines. We
                 use scaling methods that reflect the concerns of the
                 application scientist and find that this leads to
                 different conclusions about how communication traffic
                 and local cache and memory usage scale than scaling
                 based only on data set size. In particular, we show
                 that under the most realistic form of scaling, both the
                 communication-to-computation ratio as well as the
                 working-set size (and hence the ideal cache size per
                 processor) grow slowly as larger problems are run on
                 larger machines. Finally, we examine the effects of
                 using the two dominant abstractions for interprocessor
                 communication: a shared address space and explicit
                 message passing between private address spaces. We show
                 that the lack of an efficiently supported shared
                 address space will substantially increase the
                 programming complexity and performance overheads for
                 these applications.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; experimentation; measurement;
                 performance",
  subject =      "{\bf C.1.2} Computer Systems Organization, PROCESSOR
                 ARCHITECTURES, Multiple Data Stream Architectures
                 (Multiprocessors). {\bf C.0} Computer Systems
                 Organization, GENERAL, System architectures. {\bf C.4}
                 Computer Systems Organization, PERFORMANCE OF SYSTEMS.
                 {\bf C.5.1} Computer Systems Organization, COMPUTER
                 SYSTEM IMPLEMENTATION, Large and Medium (``Mainframe'')
                 Computers.",
}

@Article{Carter:1995:TRC,
  author =       "John B. Carter and John K. Bennett and Willy
                 Zwaenepoel",
  title =        "Techniques for Reducing Consistency-Related
                 Communication in Distributed Shared-Memory Systems",
  journal =      j-TOCS,
  volume =       "13",
  number =       "3",
  pages =        "205--243",
  month =        aug,
  year =         "1995",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1995-13-3/p205-carter/",
  abstract =     "Distributed shared memory (DSM) is an abstraction of
                 shared memory on a distributed-memory machine. Hardware
                 DSM systems support this abstraction at the
                 architecture level; software DSM systems support the
                 abstraction within the runtime system. One of the key
                 problems in building an efficient software DSM system
                 is to reduce the amount of communication needed to keep
                 the distributed memories consistent. In this article we
                 present four techniques for doing so: software release
                 consistency; multiple consistency protocols;
                 write-shared protocols; and an update-with-timeout
                 mechanism. These techniques have been implemented in
                 the Munin DSM system. We compare the performance of
                 seven Munin application programs: first to their
                 performance when implemented using message passing, and
                 then to their performance when running on a
                 conventional software DSM system that does not embody
                 the preceding techniques. On a 16-processor cluster of
                 workstations, Munin's performance is within 5\% of
                 message passing for four out of the seven applications.
                 For the other three, performance is within 29 to 33\%.
                 Detailed analysis of two of these three applications
                 indicates that the addition of a function-shipping
                 capability would bring their performance to within 7\%
                 of the message-passing performance. Compared to a
                 conventional DSM system, Munin achieves performance
                 improvements ranging from a few to several hundred
                 percent, depending on the application.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; performance",
  subject =      "{\bf D.4.2} Software, OPERATING SYSTEMS, Storage
                 Management, Distributed memories. {\bf B.3.2} Hardware,
                 MEMORY STRUCTURES, Design Styles, Cache memories. {\bf
                 C.1.2} Computer Systems Organization, PROCESSOR
                 ARCHITECTURES, Multiple Data Stream Architectures
                 (Multiprocessors), Interconnection architectures. {\bf
                 D.4.4} Software, OPERATING SYSTEMS, Communications
                 Management, Network communication. {\bf D.4.7}
                 Software, OPERATING SYSTEMS, Organization and Design,
                 Distributed systems. {\bf D.4.8} Software, OPERATING
                 SYSTEMS, Performance, Measurements. {\bf B.3.2}
                 Hardware, MEMORY STRUCTURES, Design Styles, Shared
                 memory. {\bf B.3.2} Hardware, MEMORY STRUCTURES, Design
                 Styles, Virtual memory. {\bf D.4.2} Software, OPERATING
                 SYSTEMS, Storage Management, Virtual memory.",
}

@Article{Diwan:1995:MSP,
  author =       "Amer Diwan and David Tarditi and Eliot Moss",
  title =        "Memory System Performance of Programs with Intensive
                 Heap Allocation",
  journal =      j-TOCS,
  volume =       "13",
  number =       "3",
  pages =        "244--273",
  month =        aug,
  year =         "1995",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1995-13-3/p244-diwan/",
  abstract =     "Heap allocation with copying garbage collection is a
                 general storage management technique for programming
                 languages. It is believed to have poor memory system
                 performance. To investigate this, we conducted an
                 in-depth study of the memory system performance of heap
                 allocation for memory systems found on many machines.
                 We studied the performance of mostly functional
                 Standard ML programs which made heavy use of heap
                 allocation. We found that most machines support heap
                 allocation poorly. However, with the appropriate memory
                 system organization, heap allocation can have good
                 performance. The memory system property crucial for
                 achieving good performance was the ability to allocate
                 and initialize a new object into the cache without a
                 penalty. This can be achieved by having subblock
                 placement with a subblock size of one word and a
                 write-allocate policy, along with fast page-mode writes
                 or a write buffer. For caches with subblock placement,
                 the data cache overhead was under 9\% for a 64K or
                 larger data cache; without subblock placement the
                 overhead was often higher than 50\%.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "experimentation; languages; measurement; performance",
  subject =      "{\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language
                 Constructs and Features, Dynamic storage management.
                 {\bf B.3.2} Hardware, MEMORY STRUCTURES, Design Styles,
                 Associative memories. {\bf B.3.3} Hardware, MEMORY
                 STRUCTURES, Performance Analysis and Design Aids**,
                 Simulation**. {\bf D.1.1} Software, PROGRAMMING
                 TECHNIQUES, Applicative (Functional) Programming. {\bf
                 D.3.2} Software, PROGRAMMING LANGUAGES, Language
                 Classifications. {\bf B.3.2} Hardware, MEMORY
                 STRUCTURES, Design Styles, Cache memories. {\bf C.4}
                 Computer Systems Organization, PERFORMANCE OF
                 SYSTEMS.",
}

@Article{Hartman:1995:ZSN,
  author =       "John H. Hartman and John K. Ousterhout",
  title =        "The {Zebra} Striped Network File System",
  journal =      j-TOCS,
  volume =       "13",
  number =       "3",
  pages =        "274--310",
  month =        aug,
  year =         "1995",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1995-13-3/p274-hartman/",
  abstract =     "Zebra is a network file system that increases
                 throughput by striping the file data across multiple
                 servers. Rather than striping each file separately,
                 Zebra forms all the new data from each client into a
                 single stream, which it then stripes using an approach
                 similar to a log-structured file system. This provides
                 high performance for writes of small files as well as
                 for reads and writes of large files. Zebra also writes
                 parity information in each stripe in the style of RAID
                 disk arrays; this increases storage costs slightly, but
                 allows the system to continue operation while a single
                 storage server is unavailable. A prototype
                 implementation of Zebra, built in the Sprite operating
                 system, provides 4--5 times the throughput of the
                 standard Sprite file system or NFS for large files and
                 a 15--300\% improvement for writing small files.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; measurement; performance; reliability",
  subject =      "{\bf D.4.3} Software, OPERATING SYSTEMS, File Systems
                 Management, File organization. {\bf D.4.2} Software,
                 OPERATING SYSTEMS, Storage Management,
                 Allocation/deallocation strategies. {\bf D.4.3}
                 Software, OPERATING SYSTEMS, File Systems Management,
                 Access methods. {\bf D.4.5} Software, OPERATING
                 SYSTEMS, Reliability, Fault-tolerance. {\bf D.4.7}
                 Software, OPERATING SYSTEMS, Organization and Design,
                 Distributed systems. {\bf D.4.8} Software, OPERATING
                 SYSTEMS, Performance, Measurements. {\bf E.5} Data,
                 FILES, Organization/structure. {\bf D.4.2} Software,
                 OPERATING SYSTEMS, Storage Management, Secondary
                 storage. {\bf D.4.3} Software, OPERATING SYSTEMS, File
                 Systems Management, Distributed file systems.",
}

@Article{Amir:1995:TSR,
  author =       "Y. Amir and L. E. Moser and P. M. Melliar-Smith and D.
                 A. Agarwal and P. Ciarfella",
  title =        "The {Totem} Single-Ring Ordering and Membership
                 Protocol",
  journal =      j-TOCS,
  volume =       "13",
  number =       "4",
  pages =        "311--342",
  month =        nov,
  year =         "1995",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1995-13-4/p311-amir/",
  abstract =     "Fault-tolerant distributed systems are becoming more
                 important, but in existing systems, maintaining the
                 consistency of replicated data is quite expensive. The
                 Totem single-ring protocol supports consistent
                 concurrent operations by placing a total order on
                 broadcast messages. This total order is derived from
                 the sequence number in a token that circulates around a
                 logical ring imposed on a set of processors in a
                 broadcast domain. The protocol handles reconfiguration
                 of the system when processors fail and restart or when
                 the network partitions and remerges. Extended virtual
                 synchrony ensures that processors deliver messages and
                 configuration changes to the application in a
                 consistent, systemwide total order. An effective flow
                 control mechanism enables the Totem single-ring
                 protocol to achieve message-ordering rates
                 significantly higher than the best prior total-ordering
                 protocols.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "performance; reliability",
  subject =      "{\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols,
                 Protocol architecture. {\bf C.2.1} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS, Network
                 Architecture and Design, Network communications. {\bf
                 C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Network operating systems. {\bf C.2.5} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS, Local
                 and Wide-Area Networks, Token rings. {\bf D.4.4}
                 Software, OPERATING SYSTEMS, Communications Management,
                 Network communication. {\bf D.4.5} Software, OPERATING
                 SYSTEMS, Reliability, Fault-tolerance. {\bf D.4.7}
                 Software, OPERATING SYSTEMS, Organization and Design,
                 Distributed systems.",
}

@Article{Herlihy:1995:SCC,
  author =       "Maurice Herlihy and Beng-Hong Lim and Nir Shavit",
  title =        "Scalable Concurrent Counting",
  journal =      j-TOCS,
  volume =       "13",
  number =       "4",
  pages =        "343--364",
  month =        nov,
  year =         "1995",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1995-13-4/p343-herlihy/",
  abstract =     "The notion of counting is central to a number of basic
                 multiprocessor coordination problems, such as dynamic
                 load balancing, barrier synchronization, and concurrent
                 data structure design. We investigate the scalability
                 of a variety of counting techniques for large-scale
                 multiprocessors. We compare counting techniques based
                 on: (1) spin locks, (2) message passing, (3)
                 distributed queues, (4) software combining trees, and
                 (5) counting networks. Our comparison is based on a
                 series of simple benchmarks on a simulated 64-processor
                 Alewife machine, a distributed-memory multiprocessor
                 currently under development at MIT. Although locking
                 techniques are known to perform well on small-scale,
                 bus-based multiprocessors, serialization limits
                 performance, and contention can degrade performance.
                 Both counting networks and combining trees outperform
                 the other methods substantially by avoiding
                 serialization and alleviating contention, although
                 combining-tree throughput is more sensitive to
                 variations in load. A comparison of shared-memory and
                 message-passing implementations of counting networks
                 and combining trees shows that message-passing
                 implementations have substantially higher throughput.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; experimentation; performance",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management,
                 Multiprocessing/multiprogramming/multitasking. {\bf
                 C.1.2} Computer Systems Organization, PROCESSOR
                 ARCHITECTURES, Multiple Data Stream Architectures
                 (Multiprocessors), Multiple-instruction-stream,
                 multiple-data-stream processors (MIMD). {\bf D.4.1}
                 Software, OPERATING SYSTEMS, Process Management,
                 Concurrency. {\bf D.4.1} Software, OPERATING SYSTEMS,
                 Process Management, Scheduling. {\bf B.3.3} Hardware,
                 MEMORY STRUCTURES, Performance Analysis and Design
                 Aids**, Simulation**. {\bf E.1} Data, DATA STRUCTURES,
                 Lists, stacks, and queues. {\bf E.1} Data, DATA
                 STRUCTURES, Trees.",
}

@Article{Mandrioli:1995:GTC,
  author =       "Dino Mandrioli and Sandro Morasca and Angelo
                 Morzenti",
  title =        "Generating Test Cases for Real-Time Systems from Logic
                 Specifications",
  journal =      j-TOCS,
  volume =       "13",
  number =       "4",
  pages =        "365--398",
  month =        nov,
  year =         "1995",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1995-13-4/p365-mandrioli/",
  abstract =     "We address the problem of automated derivation of
                 functional test cases for real-time systems, by
                 introducing techniques for generating test cases from
                 formal specifications written in TRIO, a language that
                 extends classical temporal logic to deal explicitly
                 with time measures. We describe an interactive tool
                 that has been built to implement these techniques,
                 based on interpretation algorithms of the TRIO
                 language. Several heuristic criteria are suggested to
                 reduce drastically the size of the test cases that are
                 generated. Experience in the use of the tool on
                 real-life cases is reported.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; verification",
  subject =      "{\bf D.2.5} Software, SOFTWARE ENGINEERING, Testing
                 and Debugging, Testing tools (e.g., data generators,
                 coverage testing). {\bf B.6.2} Hardware, LOGIC DESIGN,
                 Reliability and Testing**, Test generation**. {\bf
                 B.6.3} Hardware, LOGIC DESIGN, Design Aids,
                 Verification. {\bf C.3} Computer Systems Organization,
                 SPECIAL-PURPOSE AND APPLICATION-BASED SYSTEMS,
                 Real-time and embedded systems. {\bf D.2.1} Software,
                 SOFTWARE ENGINEERING, Requirements/Specifications,
                 Languages. {\bf D.2.1} Software, SOFTWARE ENGINEERING,
                 Requirements/Specifications, Tools. {\bf B.6.3}
                 Hardware, LOGIC DESIGN, Design Aids, Hardware
                 description languages.",
}

@Article{Chen:1996:MPP,
  author =       "J. Bradley Chen and Yasuhiro Endo and Kee Chan and
                 David Mazi{\`e}res and Antonio Dias and Margo Seltzer
                 and Michael D. Smith",
  title =        "The Measured Performance of Personal Computer
                 Operating Systems",
  journal =      j-TOCS,
  volume =       "14",
  number =       "1",
  pages =        "3--40",
  month =        feb,
  year =         "1996",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1996-14-1/p3-chen/",
  abstract =     "This article presents a comparative study of the
                 performance of three operating systems that run on the
                 personal computer architecture derived from the IBM-PC.
                 The operating systems, Windows for Workgroups, Windows
                 NT, and NetBSD (a freely available variant of the UNIX
                 operating system), cover a broad range of system
                 functionality and user requirements, from a
                 single-address-space model to full protection with
                 preemptive multitasking. Our measurements are enabled by
                 hardware counters in Intel's Pentium processor that
                 permit measurement of a broad range of processor events
                 including instruction counts and on-chip cache miss
                 counts. We use both microbenchmarks, which expose
                 specific differences between the systems, and
                 application workloads, which provide an indication of
                 expected end-to-end performance. Our microbenchmark
                 results show that accessing system functionality is
                 often more expensive in Windows for Workgroups than in
                 the other two systems due to frequent changes in
                 machine mode and the use of system call hooks. When
                 running native applications, Windows NT is more
                 efficient than Windows, but it incurs overhead similar
                 to that of a microkernel, since its application
                 interface (the Win32 API) is implemented as a
                 user-level server. Overall, system functionality can be
                 accessed most efficiently in NetBSD; we attribute this
                 to its monolithic structure and to the absence of the
                 complications created by hardware
                 backward-compatibility requirements in the other
                 systems. Measurements of application performance show
                 that although the impact of these differences is
                 significant in terms of instruction counts and other
                 hardware events (often a factor of 2 to 7 difference
                 between the systems), overall performance is sometimes
                 determined by the functionality provided by specific
                 subsystems, such as the graphics subsystem or the file
                 system buffer cache.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "measurement; performance",
  subject =      "{\bf D.4.8} Software, OPERATING SYSTEMS, Performance.
                 {\bf C.4} Computer Systems Organization, PERFORMANCE OF
                 SYSTEMS. {\bf D.4.0} Software, OPERATING SYSTEMS,
                 General. {\bf D.4.7} Software, OPERATING SYSTEMS,
                 Organization and Design.",
}

@Article{Anderson:1996:SNF,
  author =       "Thomas E. Anderson and Michael D. Dahlin and Jeanna M.
                 Neefe and David A. Patterson and Drew S. Roselli and
                 Randolph Y. Wang",
  title =        "Serverless Network File Systems",
  journal =      j-TOCS,
  volume =       "14",
  number =       "1",
  pages =        "41--79",
  month =        feb,
  year =         "1996",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1996-14-1/p41-anderson/",
  abstract =     "We propose a new paradigm for network file system
                 design: {\em serverless network file systems}. While
                 traditional network file systems rely on a central
                 server machine, a serverless system utilizes
                 workstations cooperating as peers to provide all file
                 system services. Any machine in the system can store,
                 cache, or control any block of data. Our approach uses
                 this location independence, in combination with fast
                 local area networks, to provide better performance and
                 scalability than traditional file systems. Furthermore,
                 because any machine in the system can assume the
                 responsibilities of a failed component, our serverless
                 design also provides high availability via redundant
                 data storage. To demonstrate our approach, we have
                 implemented a prototype serverless network file system
                 called xFS. Preliminary performance measurements
                 suggest that our architecture achieves its goal of
                 scalability. For instance, in a 32-node xFS system with
                 32 active clients, each client receives nearly as much
                 read or write throughput as it would see if it were the
                 only active client.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; measurement; performance;
                 reliability",
  subject =      "{\bf D.4.3} Software, OPERATING SYSTEMS, File Systems
                 Management, Access methods. {\bf D.4.2} Software,
                 OPERATING SYSTEMS, Storage Management,
                 Allocation/deallocation strategies. {\bf D.4.5}
                 Software, OPERATING SYSTEMS, Reliability,
                 Checkpoint/restart. {\bf D.4.8} Software, OPERATING
                 SYSTEMS, Performance, Measurements. {\bf E.5} Data,
                 FILES, Organization/structure. {\bf H.3.2} Information
                 Systems, INFORMATION STORAGE AND RETRIEVAL, Information
                 Storage, File organization. {\bf D.4.2} Software,
                 OPERATING SYSTEMS, Storage Management, Secondary
                 storage. {\bf D.4.3} Software, OPERATING SYSTEMS, File
                 Systems Management, Directory structures. {\bf D.4.3}
                 Software, OPERATING SYSTEMS, File Systems Management,
                 Distributed file systems. {\bf D.4.3} Software,
                 OPERATING SYSTEMS, File Systems Management, File
                 organization. {\bf D.4.5} Software, OPERATING SYSTEMS,
                 Reliability, Fault-tolerance. {\bf D.4.8} Software,
                 OPERATING SYSTEMS, Performance, Simulation. {\bf C.2.4}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Distributed Systems, Network operating
                 systems.",
}

@Article{Bressoud:1996:HBF,
  author =       "Thomas C. Bressoud and Fred B. Schneider",
  title =        "Hypervisor-Based Fault Tolerance",
  journal =      j-TOCS,
  volume =       "14",
  number =       "1",
  pages =        "80--107",
  month =        feb,
  year =         "1996",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1996-14-1/p80-bressoud/",
  abstract =     "Protocols to implement a fault-tolerant computing
                 system are described. These protocols augment the
                 hypervisor of a virtual-machine manager and coordinate
                 a primary virtual machine with its backup. No
                 modifications to the hardware, operating system, or
                 application programs are required. A prototype system
                 was constructed for HP's PA-RISC instruction-set
                 architecture. Even though the prototype was not
                 carefully tuned, it ran programs about a factor of 2
                 slower than a bare machine would.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; reliability",
  subject =      "{\bf D.4.5} Software, OPERATING SYSTEMS, Reliability,
                 Fault-tolerance. {\bf C.2.4} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems, Network operating systems. {\bf
                 D.4.5} Software, OPERATING SYSTEMS, Reliability,
                 Checkpoint/restart.",
}

@Article{Wilkes:1996:HAH,
  author =       "John Wilkes and Richard Golding and Carl Staelin and
                 Tim Sullivan",
  title =        "The {HP AutoRAID} Hierarchical Storage System",
  journal =      j-TOCS,
  volume =       "14",
  number =       "1",
  pages =        "108--136",
  month =        feb,
  year =         "1996",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1996-14-1/p108-wilkes/",
  abstract =     "Configuring redundant disk arrays is a black art. To
                 configure an array properly, a system administrator
                 must understand the details of both the array and the
                 workload it will support. Incorrect understanding of
                 either, or changes in the workload over time, can lead
                 to poor performance. We present a solution to this
                 problem: a two-level storage hierarchy implemented
                 inside a single disk-array controller. In the upper
                 level of this hierarchy, two copies of active data are
                 stored to provide full redundancy and excellent
                 performance. In the lower level, RAID 5 parity
                 protection is used to provide excellent storage cost
                 for inactive data, at somewhat lower performance. The
                 technology we describe in this article, known as HP
                 AutoRAID, automatically and transparently manages
                 migration of data blocks between these two levels as
                 access patterns change. The result is a fully redundant
                 storage system that is extremely easy to use, is
                 suitable for a wide variety of workloads, is largely
                 insensitive to dynamic workload changes, and performs
                 much better than disk arrays with comparable numbers of
                 spindles and much larger amounts of front-end RAM
                 cache. Because the implementation of the HP AutoRAID
                 technology is almost entirely in software, the
                 additional hardware cost for these benefits is very
                 small. We describe the HP AutoRAID technology in
                 detail, provide performance data for an embodiment of
                 it in a storage array, and summarize the results of
                 simulation studies used to choose algorithms
                 implemented in the array.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; performance; reliability",
  subject =      "{\bf D.4.2} Software, OPERATING SYSTEMS, Storage
                 Management, Secondary storage. {\bf B.4.2} Hardware,
                 INPUT/OUTPUT AND DATA COMMUNICATIONS, Input/Output
                 Devices, Channels and controllers. {\bf B.4.5}
                 Hardware, INPUT/OUTPUT AND DATA COMMUNICATIONS,
                 Reliability, Testing, and Fault-Tolerance**, Redundant
                 design**. {\bf B.3.2} Hardware, MEMORY STRUCTURES,
                 Design Styles, Mass storage.",
}

@Article{Grimshaw:1996:PRT,
  author =       "Andrew S. Grimshaw and Jon B. Weissman and W. Timothy
                 Strayer",
  title =        "Portable Run-Time Support for Dynamic Object-Oriented
                 Parallel Processing",
  journal =      j-TOCS,
  volume =       "14",
  number =       "2",
  pages =        "139--170",
  month =        may,
  year =         "1996",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1996-14-2/p139-grimshaw/",
  abstract =     "Mentat is an object-oriented parallel processing
                 system designed to simplify the task of writing
                 portable parallel programs for parallel machines and
                 workstation networks. The Mentat compiler and run-time
                 system work together to automatically manage the
                 communication and synchronization between objects. The
                 run-time system marshals member function arguments,
                 schedules objects on processors, and dynamically
                 constructs and executes large-grain data dependence
                 graphs. In this article we present the Mentat run-time
                 system. We focus on three aspects---the software
                 architecture, including the interface to the compiler
                 and the structure and interaction of the principal
                 components of the run-time system; the run-time
                 overhead on a component-by-component basis for two
                 platforms, a Sun SPARCstation 2 and an Intel Paragon;
                 and an analysis of the minimum granularity required for
                 application programs to overcome the run-time
                 overhead.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "languages; performance",
  subject =      "{\bf D.3.4} Software, PROGRAMMING LANGUAGES,
                 Processors, Run-time environments. {\bf D.1.3}
                 Software, PROGRAMMING TECHNIQUES, Concurrent
                 Programming, Parallel programming. {\bf D.1.5}
                 Software, PROGRAMMING TECHNIQUES, Object-oriented
                 Programming. {\bf D.3.2} Software, PROGRAMMING
                 LANGUAGES, Language Classifications, Concurrent,
                 distributed, and parallel languages. {\bf D.3.2}
                 Software, PROGRAMMING LANGUAGES, Language
                 Classifications, Object-oriented languages.",
}

@Article{Hardy:1996:CIE,
  author =       "Darren R. Hardy and Michael F. Schwartz",
  title =        "Customized Information Extraction as a Basis for
                 Resource Discovery",
  journal =      j-TOCS,
  volume =       "14",
  number =       "2",
  pages =        "171--199",
  month =        may,
  year =         "1996",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1996-14-2/p171-hardy/",
  abstract =     "Indexing file contents is a powerful means of helping
                 users locate documents, software, and other types of
                 data among large repositories. In environments that
                 contain many different types of data, content indexing
                 requires type-specific processing to extract
                 information effectively. We present a model for
                 type-specific, user-customizable information
                 extraction, and a system implementation called {\em
                 Essence}. This software structure allows users to
                 associate specialized extraction methods with ordinary
                 files, providing the illusion of an object-oriented
                 file system that encapsulates indexing methods within
                 files. By exploiting the semantics of common file
                 types, Essence generates compact yet representative
                 file summaries that can be used to improve both
                 browsing and indexing in resource discovery systems.
                 Essence can extract information from most of the types
                 of files found in common file systems, including files
                 with nested structure (such as compressed ``tar''
                 files). Essence interoperates with a number of
                 different search/index systems (such as WAIS and
                 Glimpse), as part of the Harvest system.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; measurement",
  subject =      "{\bf H.3.1} Information Systems, INFORMATION STORAGE
                 AND RETRIEVAL, Content Analysis and Indexing. {\bf
                 H.3.4} Information Systems, INFORMATION STORAGE AND
                 RETRIEVAL, Systems and Software, Information networks.
                 {\bf E.5} Data, FILES, Organization/structure. {\bf
                 H.5.2} Information Systems, INFORMATION INTERFACES AND
                 PRESENTATION, User Interfaces.",
}

@Article{Spasojevic:1996:ESW,
  author =       "Mirjana Spasojevic and M. Satyanarayanan",
  title =        "An Empirical Study of a Wide-Area Distributed File
                 System",
  journal =      j-TOCS,
  volume =       "14",
  number =       "2",
  pages =        "200--222",
  month =        may,
  year =         "1996",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1996-14-2/p200-spasojevic/",
  abstract =     "The evolution of the Andrew File System (AFS) into a
                 wide-area distributed file system has encouraged
                 collaboration and information dissemination on a much
                 broader scale than ever before. We examine AFS as a
                 provider of wide-area file services to over 100
                 organizations around the world. We discuss usage
                 characteristics of AFS derived from empirical
                 measurements of the system. Our observations indicate
                 that AFS provides robust and efficient data access in
                 its current configuration, thus confirming its
                 viability as a design point for wide-area distributed
                 file systems.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; measurement; performance",
  subject =      "{\bf D.4.3} Software, OPERATING SYSTEMS, File Systems
                 Management, Distributed file systems. {\bf D.4.8}
                 Software, OPERATING SYSTEMS, Performance,
                 Measurements.",
}

@Article{Corbett:1996:VPF,
  author =       "Peter F. Corbett and Dror G. Feitelson",
  title =        "The {Vesta} Parallel File System",
  journal =      j-TOCS,
  volume =       "14",
  number =       "3",
  pages =        "225--264",
  month =        aug,
  year =         "1996",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1996-14-3/p225-corbett/",
  abstract =     "The Vesta parallel file system is designed to provide
                 parallel file access to application programs running on
                 multicomputers with parallel I/O subsystems. Vesta uses
                 a new abstraction of files: a file is not a sequence of
                 bytes, but rather it can be partitioned into multiple
                 disjoint sequences that are accessed in parallel. The
                 partitioning---which can also be changed
                 dynamically---reduces the need for synchronization and
                 coordination during the access. Some control over the
                 layout of data is also provided, so the layout can be
                 matched with the anticipated access patterns. The
                 system is fully implemented and forms the basis for the
                 AIX Parallel I/O File System on the IBM SP2. The
                 implementation does not compromise scalability or
                 parallelism. In fact, all data accesses are done
                 directly to the I/O node that contains the requested
                 data, without any indirection or access to shared
                 metadata. Disk mapping and caching functions are
                 confined to each I/O node, so there is no need to keep
                 data coherent across nodes. Performance measurements
                 show good scalability with increased resources.
                 Moreover, different access patterns are shown to achieve
                 similar performance.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; performance",
  subject =      "{\bf D.4.3} Software, OPERATING SYSTEMS, File Systems
                 Management, Distributed file systems. {\bf D.4.4}
                 Software, OPERATING SYSTEMS, Communications Management,
                 Input/output. {\bf E.5} Data, FILES,
                 Organization/structure. {\bf D.1.3} Software,
                 PROGRAMMING TECHNIQUES, Concurrent Programming,
                 Parallel programming. {\bf D.4.1} Software, OPERATING
                 SYSTEMS, Process Management, Concurrency. {\bf C.1.2}
                 Computer Systems Organization, PROCESSOR ARCHITECTURES,
                 Multiple Data Stream Architectures (Multiprocessors),
                 Parallel processors**.",
}

@Article{Cristian:1996:FTA,
  author =       "Flaviu Cristian and Bob Dancey and Jon Dehn",
  title =        "Fault-tolerance in Air Traffic Control Systems",
  journal =      j-TOCS,
  volume =       "14",
  number =       "3",
  pages =        "265--286",
  month =        aug,
  year =         "1996",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1996-14-3/p265-cristian/",
  abstract =     "The distributed real-time system services developed by
                 Lockheed Martin's Air Traffic Management group serve as
                 the infrastructure for a number of air traffic control
                 systems. Either completed or under development are the
                 US Federal Aviation
                 Administration's Display System Replacement (DSR)
                 system, the UK Civil Aviation Authority's New Enroute
                 Center (NERC) system, and the Republic of China's Air
                 Traffic Control Automated System (ATCAS). These systems
                 are intended to replace present en route systems over
                 the next decade. High availability of air traffic
                 control services is an essential requirement of these
                 systems. This article discusses the general approach to
                 fault-tolerance adopted in this infrastructure, by
                 reviewing some of the questions which were asked during
                 the system design, various alternative solutions
                 considered, and the reasons for the design choices
                 made. The aspects of this infrastructure chosen for the
                 individual ATC systems mentioned above, along with the
                 status of those systems, are presented in Section 11
                 of the article.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; reliability",
  subject =      "{\bf C.4} Computer Systems Organization, PERFORMANCE
                 OF SYSTEMS, Reliability, availability, and
                 serviceability. {\bf D.2.5} Software, SOFTWARE
                 ENGINEERING, Testing and Debugging, Error handling and
                 recovery. {\bf D.4.5} Software, OPERATING SYSTEMS,
                 Reliability, Fault-tolerance. {\bf J.7} Computer
                 Applications, COMPUTERS IN OTHER SYSTEMS, Real time.
                 {\bf D.4.7} Software, OPERATING SYSTEMS, Organization
                 and Design, Real-time systems and embedded systems.",
}

@Article{Devarakonda:1996:RCF,
  author =       "Murthy Devarakonda and Bill Kish and Ajay Mohindra",
  title =        "Recovery in the {Calypso} File System",
  journal =      j-TOCS,
  volume =       "14",
  number =       "3",
  pages =        "287--310",
  month =        aug,
  year =         "1996",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1996-14-3/p287-devarakonda/",
  abstract =     "This article presents the deign and implementation of
                 the recovery scheme in Calypso. Calypso is a
                 cluster-optimized, distributed file system for UNIX
                 clusters. As in Sprite and AFS, Calypso servers are
                 stateful and scale well to a large number of clients.
                 The recovery scheme in Calypso is nondisruptive,
                 meaning that open files remain open, client modified
                 data are saved, and in-flight operations are properly
                 handled across server recover. The scheme uses
                 distributed state amount the clients to reconstruct the
                 server state on a backup node if disks are multiported
                 or on the rebooted server node. It guarantees data
                 consistency during recovery and provides congestion
                 control. Measurements show that the state
                 reconstruction can be quite fast: for example, in a
                 32-node cluster, when an average node contains state
                 for about 420 files, the reconstruction time is about
                 3.3 seconds. However, the time to update a file system
                 after a failure can be a major factor in the overall
                 recovery time, even when using journaling techniques.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; management; measurement; reliability",
  subject =      "{\bf D.4.5} Software, OPERATING SYSTEMS, Reliability,
                 Fault-tolerance. {\bf C.4} Computer Systems
                 Organization, PERFORMANCE OF SYSTEMS. {\bf D.4.3}
                 Software, OPERATING SYSTEMS, File Systems Management,
                 Distributed file systems. {\bf D.4.7} Software,
                 OPERATING SYSTEMS, Organization and Design, Distributed
                 systems. {\bf E.5} Data, FILES, Backup/recovery.",
}

@Article{Cao:1996:IPI,
  author =       "Pei Cao and Edward W. Felten and Anna R. Karlin and
                 Kai Li",
  title =        "Implementation and Performance of Integrated
                 Application-Controlled File Caching, Prefetching, and
                 Disk Scheduling",
  journal =      j-TOCS,
  volume =       "14",
  number =       "4",
  pages =        "311--343",
  month =        nov,
  year =         "1996",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1996-14-4/p311-cao/",
  abstract =     "As the performance gap between disks and
                 microprocessors continues to increase, effective
                 utilization of the file cache becomes increasingly
                 important. Application-controlled file caching and
                 prefetching can apply application-specific knowledge to
                 improve file cache management. However, supporting
                 application-controlled file caching and prefetching is
                 nontrivial because caching and prefetching need to be
                 integrated carefully, and the kernel needs to allocate
                 cache blocks among processes appropriately. This
                 article presents the design, implementation, and
                 performance of a file system that integrates
                 application-controlled caching, prefetching, and disk
                 scheduling. We use a two-level cache management
                 strategy. The kernel uses the LRU-SP
                 (Least-Recently-Used with Swapping and Placeholders)
                 policy to allocate blocks to processes, and each
                 process integrates application-specific caching and
                 prefetching based on the {\em controlled-aggressive\/}
                 policy, an algorithm previously shown in a theoretical
                 sense to be nearly optimal. Each process also improves
                 its disk access latency by submitting its prefetches in
                 batches so that the requests can be scheduled to
                 optimize disk access performance. Our measurements show
                 that this combination of techniques greatly improves
                 the performance of the file system. We measured that
                 the running time is reduced by 3\% to 49\% (average
                 26\%) for single-process workloads and by 5\% to 76\%
                 (average 32\%) for multiprocess workloads.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; experimentation; measurement;
                 performance",
  subject =      "{\bf D.4.2} Software, OPERATING SYSTEMS, Storage
                 Management, Secondary storage. {\bf C.4} Computer
                 Systems Organization, PERFORMANCE OF SYSTEMS, Design
                 studies. {\bf D.4.2} Software, OPERATING SYSTEMS,
                 Storage Management, Storage hierarchies. {\bf D.4.3}
                 Software, OPERATING SYSTEMS, File Systems Management,
                 Access methods. {\bf D.4.8} Software, OPERATING
                 SYSTEMS, Performance, Measurements. {\bf D.4.8}
                 Software, OPERATING SYSTEMS, Performance, Modeling and
                 prediction. {\bf E.5} Data, FILES, Optimization**.",
}

@Article{Saavedra:1996:ABC,
  author =       "Rafael H. Saavedra and Alan J. Smith",
  title =        "Analysis of Benchmark Characteristics and Benchmark
                 Performance Prediction",
  journal =      j-TOCS,
  volume =       "14",
  number =       "4",
  pages =        "344--384",
  month =        nov,
  year =         "1996",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1996-14-4/p344-saavedra/",
  abstract =     "Standard benchmarking provides to run-times for given
                 programs on given machines, but fails to provide
                 insight as to why those results were obtained (either
                 in terms of machine or program characteristics) and
                 fails to provide run-times for that program on some
                 other machine, or some other programs on that machine.
                 We have developed a machine-independent model of
                 program execution to characterize both machine
                 performance and program execution. By merging these
                 machine and program characterizations, we can estimate
                 execution time for arbitrary machine/program
                 combinations. Our technique allows us to identify those
                 operations, either on the machine or in the programs,
                 which dominate the benchmark results. This information
                 helps designers in improving the performance of future
                 machines and users in tuning their applications to
                 better utilize the performance of existing machines.
                 Here we apply our methodology to characterize
                 benchmarks and predict their execution times. We
                 present extensive run-time statistics for a large set
                 of benchmarks including the SPEC and Perfect Club
                 suites. We show how these statistics can be used to
                 identify important shortcomings in the programs. In
                 addition, we give execution time estimates for a large
                 sample of programs and machines and compare these
                 against benchmark results. Finally, we develop a metric
                 for program similarity that makes it possible to
                 classify benchmarks with respect to a large set of
                 characteristics.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "measurement; performance",
  subject =      "{\bf C.4} Computer Systems Organization, PERFORMANCE
                 OF SYSTEMS, Measurement techniques. {\bf C.4} Computer
                 Systems Organization, PERFORMANCE OF SYSTEMS, Modeling
                 techniques. {\bf C.4} Computer Systems Organization,
                 PERFORMANCE OF SYSTEMS, Performance attributes. {\bf
                 D.2.8} Software, SOFTWARE ENGINEERING, Metrics,
                 Performance measures. {\bf I.6.4} Computing
                 Methodologies, SIMULATION AND MODELING, Model
                 Validation and Analysis.",
}

@Article{Shavit:1996:DT,
  author =       "Nir Shavit and Asaph Zemach",
  title =        "Diffracting Trees",
  journal =      j-TOCS,
  volume =       "14",
  number =       "4",
  pages =        "385--428",
  month =        nov,
  year =         "1996",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1996-14-4/p385-shavit/",
  abstract =     "Shared counters are among the most basic coordination
                 structures in multiprocessor computation, with
                 applications ranging from barrier synchronization to
                 concurrent-data-structure design. This article
                 introduces diffracting trees, novel data structures for
                 shared counting and load balancing in a
                 distributed/parallel environment. Empirical evidence,
                 collected on a simulated distributed shared-memory
                 machine and several simulated message-passing
                 architectures, shows that diffracting trees scale
                 better and are more robust than both combining trees
                 and counting networks, currently the most effective
                 known methods for implementing concurrent counters in
                 software. The use of a randomized coordination method
                 together with a combinatorial data structure overcomes
                 the resiliency drawbacks of combining trees. Our
                 simulations show that to handle the same load,
                 diffracting trees and counting networks should have a
                 similar width {\em w}, yet the depth of a diffracting
                 tree is {\em O\/}(log {\em w\/}), whereas counting
                 networks have depth {\em O\/}(log$^2$ {\em w\/}).
                 Diffracting trees have already been used to implement
                 highly efficient producer/consumer queues, and we
                 believe diffraction will prove to be an effective
                 alternative paradigm to combining and queue-locking in
                 the design of many concurrent data structures.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; performance",
  subject =      "{\bf E.1} Data, DATA STRUCTURES. {\bf C.1.2} Computer
                 Systems Organization, PROCESSOR ARCHITECTURES, Multiple
                 Data Stream Architectures (Multiprocessors). {\bf
                 C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems.
                 {\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Synchronization. {\bf D.4.7} Software,
                 OPERATING SYSTEMS, Organization and Design, Distributed
                 systems.",
}
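%%% The Shavit:1996:DT abstract above describes shared counting with
%%% balancer trees.  The following is a minimal, sequential Python
%%% sketch of the underlying counting-tree idea only: each internal
%%% node toggles a bit to alternate requests between its children, and
%%% leaf i hands out i, i+w, i+2w, ...  The diffraction mechanism that
%%% gives the data structure its name (pairing up colliding requests
%%% at "prisms" to relieve contention on the toggle bits) and all
%%% concurrency control are deliberately omitted; the class and
%%% variable names are illustrative, not the authors'.
%%%
%%%     class ToggleNode:
%%%         def __init__(self, left, right):
%%%             self.toggle = 0
%%%             self.left, self.right = left, right
%%%
%%%         def traverse(self):
%%%             child = self.left if self.toggle == 0 else self.right
%%%             self.toggle ^= 1          # alternate between children
%%%             return child.traverse()
%%%
%%%     class Leaf:
%%%         def __init__(self, index, width):
%%%             self.next_value = index   # leaf i returns i, i+w, i+2w, ...
%%%             self.width = width
%%%
%%%         def traverse(self):
%%%             value = self.next_value
%%%             self.next_value += self.width
%%%             return value
%%%
%%%     # Width-4 tree: requests are spread over four leaf counters, yet
%%%     # the values handed out are a permutation of 0, 1, 2, ...
%%%     leaves = [Leaf(i, 4) for i in range(4)]
%%%     root = ToggleNode(ToggleNode(leaves[0], leaves[1]),
%%%                       ToggleNode(leaves[2], leaves[3]))
%%%     print([root.traverse() for _ in range(8)])  # [0, 2, 1, 3, 4, 6, 5, 7]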

@Article{Anonymous:1996:AI,
  author =       "Anonymous",
  title =        "Author Index",
  journal =      j-TOCS,
  volume =       "14",
  number =       "4",
  pages =        "429--430",
  month =        nov,
  year =         "1996",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1996-14-4/p429-author_index/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  subject =      "{\bf A.2} General Literature, REFERENCE.",
}

@Article{Birman:1997:EEP,
  author =       "Kenneth P. Birman",
  title =        "Editorial: Electronic Publication of {TOCS}",
  journal =      j-TOCS,
  volume =       "15",
  number =       "1",
  pages =        "1--1",
  month =        feb,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-1/p1-birman/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Kontothanassis:1997:SCS,
  author =       "Leonidas I. Kontothanassis and Robert W. Wisniewski
                 and Michael L. Scott",
  title =        "Scheduler-Conscious Synchronization",
  journal =      j-TOCS,
  volume =       "15",
  number =       "1",
  pages =        "3--40",
  month =        feb,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-1/p3-kontothanassis/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; performance; reliability",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Synchronization. {\bf D.1.3} Software,
                 PROGRAMMING TECHNIQUES, Concurrent Programming,
                 Parallel programming. {\bf D.4.1} Software, OPERATING
                 SYSTEMS, Process Management,
                 Multiprocessing/multiprogramming/multitasking.",
}

@Article{Kotz:1997:DDM,
  author =       "David Kotz",
  title =        "Disk-Directed {I/O} for {MIMD} Multiprocessors",
  journal =      j-TOCS,
  volume =       "15",
  number =       "1",
  pages =        "41--74",
  month =        feb,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-1/p41-kotz/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; performance",
  subject =      "{\bf D.4.3} Software, OPERATING SYSTEMS, File Systems
                 Management, Access methods. {\bf D.4.3} Software,
                 OPERATING SYSTEMS, File Systems Management, File
                 organization. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance, Simulation. {\bf E.5} Data, FILES.",
}

@Article{Steenkiste:1997:HSN,
  author =       "Peter Steenkiste",
  title =        "A High-Speed Network Interface for Distributed-Memory
                 Systems: Architecture and Applications",
  journal =      j-TOCS,
  volume =       "15",
  number =       "1",
  pages =        "75--109",
  month =        feb,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-1/p75-steenkiste/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "management; performance; reliability",
  subject =      "{\bf D.4.4} Software, OPERATING SYSTEMS,
                 Communications Management, Network communication. {\bf
                 B.4.3} Hardware, INPUT/OUTPUT AND DATA COMMUNICATIONS,
                 Interconnections (Subsystems), Interfaces. {\bf C.0}
                 Computer Systems Organization, GENERAL, System
                 architectures. {\bf C.2.2} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS, Network
                 Protocols, Protocol architecture. {\bf D.4.4} Software,
                 OPERATING SYSTEMS, Communications Management,
                 Buffering.",
}

@Article{Anderson:1997:DRA,
  author =       "David P. Anderson",
  title =        "Device Reservation in Audio\slash Video Editing
                 Systems",
  journal =      j-TOCS,
  volume =       "15",
  number =       "2",
  pages =        "111--133",
  month =        may,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-2/p111-anderson/",
  abstract =     "What fraction of disks and other shared devices must
                 be reserved to play an audio/video document without
                 dropouts? In general, this question cannot be answered
                 precisely. For documents with complex and irregular
                 structure, such as those arising in audio/video
                 editing, it is difficult even to give a good estimate.
                 We describe three approaches to this problem. The
                 first, based on long-term average properties of
                 segments, is fast but imprecise: it underreserves in
                 some cases and overreserves in others. The second
                 approach models individual disk and network operations.
                 It is precise but slow. The third approach, a hybrid,
                 is both precise and fast.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; performance",
  subject =      "{\bf D.4.7} Software, OPERATING SYSTEMS, Organization
                 and Design, Real-time systems and embedded systems.
                 {\bf D.4.8} Software, OPERATING SYSTEMS, Performance,
                 Modeling and prediction. {\bf H.5.1} Information
                 Systems, INFORMATION INTERFACES AND PRESENTATION,
                 Multimedia Information Systems, Audio input/output.
                 {\bf H.5.1} Information Systems, INFORMATION INTERFACES
                 AND PRESENTATION, Multimedia Information Systems, Video
                 (e.g., tape, disk, DVI).",
}

@Article{Anderson:1997:RTC,
  author =       "James H. Anderson and Srikanth Ramamurthy and Kevin
                 Jeffay",
  title =        "Real-time Computing with Lock-Free Shared Objects",
  journal =      j-TOCS,
  volume =       "15",
  number =       "2",
  pages =        "134--165",
  month =        may,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-2/p134-anderson/",
  abstract =     "This article considers the use of lock-free shared
                 objects within hard real-time systems. As the name
                 suggests, {\em lock-free\/} shared objects are
                 distinguished by the fact that they are accessed
                 without locking. As such, they do not give rise to
                 priority inversions, a key advantage over conventional,
                 lock-based object-sharing approaches. Despite this
                 advantage, it is not immediately apparent that
                 lock-free shared objects can be employed if tasks must
                 adhere to strict timing constraints. In particular,
                 lock-free object implementations permit concurrent
                 operations to interfere with each other, and repeated
                 interferences can cause a given operation to take an
                 arbitrarily long time to complete. The main
                 contribution of this article is to show that such
                 interferences can be bounded by judicious scheduling.
                 This work pertains to periodic, hard real-time tasks
                 that share lock-free objects on a uniprocessor. In the
                 first part of the article, scheduling conditions are
                 derived for such tasks, for both static and dynamic
                 priority schemes. Based on these conditions, it is
                 formally shown that lock-free shared objects often
                 incur less overhead than object implementations based
                 on wait-free algorithms or lock-based schemes. In the
                 last part of the article, this conclusion is validated
                 experimentally through work involving a real-time
                 desktop videoconferencing system.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; performance; theory",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Concurrency. {\bf C.3} Computer Systems
                 Organization, SPECIAL-PURPOSE AND APPLICATION-BASED
                 SYSTEMS, Real-time and embedded systems. {\bf D.4.1}
                 Software, OPERATING SYSTEMS, Process Management,
                 Multiprocessing/multiprogramming/multitasking. {\bf
                 D.4.1} Software, OPERATING SYSTEMS, Process Management,
                 Mutual exclusion. {\bf D.4.1} Software, OPERATING
                 SYSTEMS, Process Management, Scheduling. {\bf D.4.1}
                 Software, OPERATING SYSTEMS, Process Management,
                 Synchronization. {\bf J.7} Computer Applications,
                 COMPUTERS IN OTHER SYSTEMS, Real time.",
}
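%%% The Anderson:1997:RTC abstract above concerns lock-free shared
%%% objects implemented with retry loops.  A minimal Python sketch of
%%% such an object follows (a shared counter whose update retries until
%%% a compare-and-swap succeeds); the CAS is emulated with a lock purely
%%% for illustration, and the scheduling analysis that bounds the number
%%% of retries is not shown.  Names are illustrative, not taken from the
%%% article.
%%%
%%%     import threading
%%%
%%%     class Cell:
%%%         def __init__(self, value=0):
%%%             self.value = value
%%%             self._lock = threading.Lock()   # emulates an atomic CAS
%%%
%%%         def compare_and_swap(self, expected, new):
%%%             with self._lock:
%%%                 if self.value == expected:
%%%                     self.value = new
%%%                     return True
%%%                 return False
%%%
%%%     def lock_free_increment(cell):
%%%         while True:                         # retry until the CAS succeeds
%%%             old = cell.value
%%%             if cell.compare_and_swap(old, old + 1):
%%%                 return old + 1
%%%
%%%     cell = Cell()
%%%     workers = [threading.Thread(
%%%                    target=lambda: [lock_free_increment(cell)
%%%                                    for _ in range(1000)])
%%%                for _ in range(4)]
%%%     for t in workers: t.start()
%%%     for t in workers: t.join()
%%%     print(cell.value)                       # 4000: no update is lost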

@Article{Mahmood:1997:OAM,
  author =       "Ausif Mahmood and Donald J. Lynch and Roger B.
                 Shaffer",
  title =        "Optimally Adaptive, Minimum-Distance, Circuit-Switched
                 Routing in Hypercubes",
  journal =      j-TOCS,
  volume =       "15",
  number =       "2",
  pages =        "166--193",
  month =        may,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-2/p166-mahmood/",
  abstract =     "In circuit-switched routing, the path between a source
                 and its destination is established by incrementally
                 reserving all required links before the data
                 transmission can begin. If the routing algorithm is not
                 carefully designed, deadlocks can occur in reserving
                 these links. Deadlock-free algorithms based on
                 dimension-ordered routing, such as the {\em E-cube},
                 exist. However, {\em E-cube\/} does not provide any
                 flexibility in choosing a path from a source to its
                 destination and can thus result in long latencies under
                 heavy or uneven traffic. Adaptive, minimum-distance
                 routing algorithms, such as the {\em Turn Model\/} and
                 the {\em UP Preference\/} algorithms, have previously
                 been reported. In this article, we present a new class
                 of adaptive, provably deadlock-free, minimum-distance
                 routing algorithms. We prove that the algorithms
                 developed here are optimally adaptive in the sense that
                 any further flexibility in communication will result in
                 deadlock. We show that the {\em Turn Model\/} is
                 actually a member of our new class of algorithms that
                 does not perform as well as other algorithms within the
                 new class. It creates artificial hotspots in routing
                 the traffic and allows fewer total paths. We present an
                 analytical comparison of the flexibility and balance in
                 routing provided by various algorithms and a comparison
                 based on uniform and nonuniform traffic simulations.
                 The {\em Extended UP Preference\/} algorithm developed
                 in this article is shown to have improved performance
                 with respect to existing algorithms. The methodology
                 and the algorithms developed here can be used to
                 develop routing for other schemes such as wormhole
                 routing, and for other recursively defined networks
                 such as {\em k\/}-ary {\em n\/}-cubes.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; theory",
  subject =      "{\bf C.2.1} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Architecture
                 and Design, Network communications. {\bf C.1.2}
                 Computer Systems Organization, PROCESSOR ARCHITECTURES,
                 Multiple Data Stream Architectures (Multiprocessors),
                 Interconnection architectures. {\bf C.2.1} Computer
                 Systems Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Network Architecture and Design, Network topology.",
}
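%%% The Mahmood:1997:OAM abstract above uses dimension-ordered E-cube
%%% routing as its non-adaptive baseline.  As a small illustration (not
%%% code from the article), the E-cube path in an n-cube is obtained by
%%% correcting the differing address bits in a fixed order, lowest
%%% dimension first, which yields a unique minimum-distance,
%%% deadlock-free path:
%%%
%%%     def ecube_path(src, dst, dimensions):
%%%         path, node = [src], src
%%%         for d in range(dimensions):          # lowest dimension first
%%%             if (node ^ dst) & (1 << d):      # bit d still differs
%%%                 node ^= 1 << d               # traverse that link
%%%                 path.append(node)
%%%         return path
%%%
%%%     # In a 4-cube, route from 0101 to 1010 (all four bits differ).
%%%     print([format(n, "04b") for n in ecube_path(0b0101, 0b1010, 4)])
%%%     # ['0101', '0100', '0110', '0010', '1010']
%%%
%%% The adaptive algorithms in the article relax exactly this fixed
%%% dimension order while preserving deadlock freedom.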

@Article{Pfitzmann:1997:SLT,
  author =       "Birgit Pfitzmann and Michael Waidner",
  title =        "Strong Loss Tolerance of Electronic Coin Systems",
  journal =      j-TOCS,
  volume =       "15",
  number =       "2",
  pages =        "194--213",
  month =        may,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-2/p194-pfitzmann/",
  abstract =     "Untraceable electronic cash means prepaid digital
                 payment systems, usually with offline payments, that
                 protect user privacy. Such systems have recently been
                 given considerable attention by both theory and
                 development projects. However, in most current schemes,
                 loss of a user device containing electronic cash
                 implies a loss of money, just as with real cash. In
                 comparison with credit schemes, this is considered a
                 serious shortcoming. This article shows how untraceable
                 electronic cash can be made loss tolerant, i.e., how
                 the monetary value of the lost data can be recovered.
                 Security against fraud and preservation of privacy are
                 ensured; strong loss tolerance means that not even
                 denial of recovery is possible. In particular, systems
                 based on electronic coins are treated. We present
                 general design principles and options and their
                 instantiation in one concrete payment system. The
                 measures are practical.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; reliability; security",
  subject =      "{\bf D.4.6} Software, OPERATING SYSTEMS, Security and
                 Protection, Cryptographic controls. {\bf C.2.4}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Distributed Systems, Distributed
                 applications. {\bf D.4.5} Software, OPERATING SYSTEMS,
                 Reliability, Fault-tolerance. {\bf H.4.3} Information
                 Systems, INFORMATION SYSTEMS APPLICATIONS,
                 Communications Applications. {\bf K.6.5} Computing
                 Milieux, MANAGEMENT OF COMPUTING AND INFORMATION
                 SYSTEMS, Security and Protection. {\bf K.4.0} Computing
                 Milieux, COMPUTERS AND SOCIETY, General.",
}

@Article{Mogul:1997:ERL,
  author =       "Jeffrey C. Mogul and K. K. Ramakrishnan",
  title =        "Eliminating Receive Livelock in an Interrupt-Driven
                 Kernel",
  journal =      j-TOCS,
  volume =       "15",
  number =       "3",
  pages =        "217--252",
  month =        aug,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-3/p217-mogul/",
  abstract =     "Most operating systems use interface interrupts to
                 schedule network tasks. Interrupt-driven systems can
                 provide low overhead and good latency at low offered
                 load, but degrade significantly at higher arrival rates
                 unless care is taken to prevent several pathologies.
                 These are various forms of {\bf receive livelock}, in
                 which the system spends all of its time processing
                 interrupts, to the exclusion of other necessary tasks.
                 Under extreme conditions, no packets are delivered to
                 the user application or the output of the system. To
                 avoid livelock and related problems, an operating
                 system must schedule network interrupt handling as
                 carefully as it schedules process execution. We
                 modified an interrupt-driven networking implementation
                 to do so; this modification eliminates receive livelock
                 without degrading other aspects of system performance.
                 Our modifications include the use of polling when the
                 system is heavily loaded, while retaining the use of
                 interrupts under lighter load. We present measurements
                 demonstrating the success of our approach.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "performance",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management, Scheduling. {\bf D.4.4} Software, OPERATING
                 SYSTEMS, Communications Management, Input/output. {\bf
                 D.4.4} Software, OPERATING SYSTEMS, Communications
                 Management, Network communication. {\bf C.2.0} Computer
                 Systems Organization, COMPUTER-COMMUNICATION NETWORKS,
                 General.",
}
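%%% The Mogul:1997:ERL abstract above describes replacing pure
%%% interrupt-driven packet processing with polling under load.  Below
%%% is a self-contained Python sketch of that hybrid discipline under
%%% stated assumptions (a fake in-memory device, a fixed per-round
%%% quota); it is not the authors' kernel modification, only an
%%% illustration of the control flow.
%%%
%%%     class FakeDevice:
%%%         """Stand-in for a NIC: a receive queue plus an IRQ mask."""
%%%         def __init__(self, packets):
%%%             self.queue = list(packets)
%%%             self.rx_irq_enabled = True
%%%         def disable_rx_interrupts(self): self.rx_irq_enabled = False
%%%         def enable_rx_interrupts(self):  self.rx_irq_enabled = True
%%%         def next_packet(self):
%%%             return self.queue.pop(0) if self.queue else None
%%%
%%%     delivered = []
%%%
%%%     class PolledDriver:
%%%         def __init__(self, device, quota=4):
%%%             self.device, self.quota, self.polling = device, quota, False
%%%         def rx_interrupt(self):
%%%             # The handler does almost no work: mask further RX
%%%             # interrupts and switch the interface into polled mode.
%%%             self.device.disable_rx_interrupts()
%%%             self.polling = True
%%%         def poll(self):
%%%             # Runs from a normally scheduled thread with a quota, so
%%%             # packet processing cannot monopolize the CPU.
%%%             if not self.polling:
%%%                 return
%%%             for _ in range(self.quota):
%%%                 pkt = self.device.next_packet()
%%%                 if pkt is None:                      # queue drained
%%%                     self.device.enable_rx_interrupts()
%%%                     self.polling = False
%%%                     return
%%%                 delivered.append(pkt)
%%%
%%%     dev = FakeDevice(range(10))
%%%     drv = PolledDriver(dev)
%%%     drv.rx_interrupt()
%%%     while drv.polling:
%%%         drv.poll()        # the scheduler would interleave other work
%%%     print(delivered)      # [0, 1, ..., 9]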

@Article{Harchol-Balter:1997:EPL,
  author =       "Mor Harchol-Balter and Allen B. Downey",
  title =        "Exploiting Process Lifetime Distributions for Dynamic
                 Load Balancing",
  journal =      j-TOCS,
  volume =       "15",
  number =       "3",
  pages =        "253--285",
  month =        aug,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-3/p253-harchol-balter/",
  abstract =     "We consider policies for CPU load balancing in
                 networks of workstations. We address the question of
                 whether preemptive migration (migrating active
                 processes) is necessary, or whether remote execution
                 (migrating processes only at the time of birth) is
                 sufficient for load balancing. We show that resolving
                 this issue is strongly tied to understanding the
                 process lifetime distribution. Our measurements
                 indicate that the distribution of lifetimes for a UNIX
                 process is Pareto (heavy-tailed), with a consistent
                 functional form over a variety of workloads. We show
                 how to apply this distribution to derive a preemptive
                 migration policy that requires no hand-tuned
                 parameters. We used a trace-driven simulation to show
                 that our preemptive migration strategy is far more
                 effective than remote execution, even when the memory
                 transfer cost is high.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; measurement; performance",
  subject =      "{\bf C.2.3} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Operations,
                 Network management. {\bf C.2.4} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems. {\bf C.4} Computer Systems
                 Organization, PERFORMANCE OF SYSTEMS. {\bf C.5.3}
                 Computer Systems Organization, COMPUTER SYSTEM
                 IMPLEMENTATION, Microcomputers. {\bf G.3} Mathematics
                 of Computing, PROBABILITY AND STATISTICS. {\bf G.m}
                 Mathematics of Computing, MISCELLANEOUS. {\bf I.6.0}
                 Computing Methodologies, SIMULATION AND MODELING,
                 General. {\bf C.2.3} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Operations,
                 Network monitoring.",
}
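%%% The Harchol-Balter:1997:EPL abstract above rests on process
%%% lifetimes being Pareto (heavy-tailed).  The short numerical check
%%% below (not from the article; the shape parameter alpha = 1 and the
%%% sample count are arbitrary choices) illustrates the property their
%%% parameter-free migration policy exploits: a process that has
%%% already run for T seconds is about as likely as not to run for at
%%% least another T seconds, independent of T.
%%%
%%%     import random
%%%
%%%     def pareto_lifetime(alpha=1.0, x_min=1.0):
%%%         # Inverse-CDF sampling: P(L > x) = (x_min / x) ** alpha.
%%%         return x_min / (1.0 - random.random()) ** (1.0 / alpha)
%%%
%%%     random.seed(0)
%%%     samples = [pareto_lifetime() for _ in range(200_000)]
%%%     for t in (1, 4, 16, 64):
%%%         survivors = [s for s in samples if s > t]
%%%         doubled = sum(1 for s in survivors if s > 2 * t)
%%%         print(f"P(L > {2*t:3d} | L > {t:3d}) ~ "
%%%               f"{doubled / len(survivors):.2f}")   # each close to 0.50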

@Article{Krieger:1997:HPO,
  author =       "Orran Krieger and Michael Stumm",
  title =        "{HFS}: a Performance-Oriented Flexible File System
                 Based on Building-Block Compositions",
  journal =      j-TOCS,
  volume =       "15",
  number =       "3",
  pages =        "286--321",
  month =        aug,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-3/p286-krieger/",
  abstract =     "The Hurricane File System (HFS) is designed for
                 (potentially large-scale) shared-memory
                 multiprocessors. Its architecture is based on the
                 principle that, in order to maximize performance for
                 applications with diverse requirements, a file system
                 must support a wide variety of file structures, file
                 system policies, and I/O interfaces. Files in HFS are
                 implemented using simple building blocks composed in
                 potentially complex ways. This approach yields great
                 flexibility, allowing an application to customize the
                 structure and policies of a file to exactly meet its
                 requirements. As an extreme example, HFS allows a
                 file's structure to be optimized for concurrent
                 random-access write-only operations by 10 threads,
                 something no other file system can do. Similarly, the
                 prefetching, locking, and file cache management
                 policies can all be chosen to match an application's
                 access pattern. In contrast, most parallel file systems
                 support a single file structure and a small set of
                 policies. We have implemented HFS as part of the
                 Hurricane operating system running on the Hector
                 shared-memory multiprocessor. We demonstrate that the
                 flexibility of HFS comes with little processing or I/O
                 overhead. We also show that for a number of file access
                 patterns, HFS is able to deliver to the applications
                 the full I/O bandwidth of the disks on our system.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; performance",
  subject =      "{\bf D.4.3} Software, OPERATING SYSTEMS, File Systems
                 Management, File organization. {\bf D.4.3} Software,
                 OPERATING SYSTEMS, File Systems Management, Access
                 methods. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance, Measurements. {\bf E.5} Data, FILES,
                 Optimization**. {\bf E.5} Data, FILES,
                 Organization/structure.",
}

@Article{Lo:1997:CTL,
  author =       "Jack L. Lo and Joel S. Emer and Henry M. Levy and
                 Rebecca L. Stamm and Dean M. Tullsen",
  title =        "Converting Thread-Level Parallelism to
                 Instruction-Level Parallelism via Simultaneous
                 Multithreading",
  journal =      j-TOCS,
  volume =       "15",
  number =       "3",
  pages =        "322--354",
  month =        aug,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-3/p322-lo/",
  abstract =     "To achieve high performance, contemporary computer
                 systems rely on two forms of parallelism:
                 instruction-level parallelism (ILP) and thread-level
                 parallelism (TLP). Wide-issue super-scalar processors
                 exploit ILP by executing multiple instructions from a
                 single program in a single cycle. Multiprocessors (MP)
                 exploit TLP by executing different threads in parallel
                 on different processors. Unfortunately, both parallel
                 processing styles statically partition processor
                 resources, thus preventing them from adapting to
                 dynamically changing levels of ILP and TLP in a
                 program. With insufficient TLP, processors in an MP
                 will be idle; with insufficient ILP, multiple-issue
                 hardware on a superscalar is wasted. This article
                 explores parallel processing on an alternative
                 architecture, simultaneous multithreading (SMT), which
                 allows multiple threads to compete for and share all
                 of the processor's resources every cycle. The most
                 compelling reason for running parallel applications on
                 an SMT processor is its ability to use thread-level
                 parallelism and instruction-level parallelism
                 interchangeably. By permitting multiple threads to
                 share the processor's functional units simultaneously,
                 the processor can use both ILP and TLP to accommodate
                 variations in parallelism. When a program has only a
                 single thread, all of the SMT processor's resources can
                 be dedicated to that thread; when more TLP exists, this
                 parallelism can compensate for a lack of per-thread
                 ILP. We examine two alternative on-chip parallel
                 architectures for the next generation of processors. We
                 compare SMT and small-scale, on-chip multiprocessors in
                 their ability to exploit both ILP and TLP. First, we
                 identify the hardware bottlenecks that prevent
                 multiprocessors from effectively exploiting ILP. Then,
                 we show that because of its dynamic resource sharing,
                 SMT avoids these inefficiencies and benefits from being
                 able to run more threads on a single processor. The use
                 of TLP is especially advantageous when per-thread ILP
                 is limited. The ease of adding additional thread
                 contexts on an SMT (relative to adding additional
                 processors on an MP) allows simultaneous multithreading
                 to expose more parallelism, further increasing
                 functional unit utilization and attaining a 52\%
                 average speedup (versus a four-processor, single-chip
                 multiprocessor with comparable execution resources).
                 This study also addresses an often-cited concern
                 regarding the use of thread-level parallelism or
                 multithreading: interference in the memory system and
                 branch prediction hardware. We find that the multiple
                 threads cause interthread interference in the caches
                 and place greater demands on the memory system, thus
                 increasing average memory latencies. By exploiting
                 thread-level parallelism, however, SMT hides these
                 additional latencies, so that they only have a small
                 impact on total program performance. We also find that
                 for parallel applications, the additional threads have
                 minimal effects on branch prediction.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "measurement; performance",
  subject =      "{\bf C.1.2} Computer Systems Organization, PROCESSOR
                 ARCHITECTURES, Multiple Data Stream Architectures
                 (Multiprocessors), Parallel processors**. {\bf C.0}
                 Computer Systems Organization, GENERAL, Instruction set
                 design. {\bf D.4.1} Software, OPERATING SYSTEMS,
                 Process Management.",
}

@Article{Levy:1997:GE,
  author =       "Henry M. Levy",
  title =        "Guest Editorial",
  journal =      j-TOCS,
  volume =       "15",
  number =       "4",
  pages =        "355--356",
  month =        nov,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-4/p355-levy/",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Anderson:1997:CPW,
  author =       "Jennifer M. Anderson and Lance M. Berc and Jeffrey
                 Dean and Sanjay Ghemawat and Monika R. Henzinger and
                 Shun-Tak A. Leung and Richard L. Sites and Mark T.
                 Vandevoorde and Carl A. Waldspurger and William E.
                 Weihl",
  title =        "Continuous Profiling: Where Have All the Cycles
                 Gone?",
  journal =      j-TOCS,
  volume =       "15",
  number =       "4",
  pages =        "357--390",
  month =        nov,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-4/p357-anderson/",
  abstract =     "This article describes the Digital Continuous
                 Profiling Infrastructure, a sampling-based profiling
                 system designed to run continuously on production
                 systems. The system supports multiprocessors, works on
                 unmodified executables, and collects profiles for
                 entire systems, including user programs, shared
                 libraries, and the operating system kernel. Samples are
                 collected at a high rate (over 5200 samples/sec. per
                 333MHz processor), yet with low overhead (1-3\%
                 slowdown for most workloads). Analysis tools supplied
                 with the profiling system use the sample data to
                 produce a precise and accurate accounting, down to the
                 level of pipeline stalls incurred by individual
                 instructions, of where time is being spent. When
                 instructions incur stalls, the tools identify possible
                 reasons, such as cache misses, branch mispredictions,
                 and functional unit contention. The fine-grained
                 instruction-level analysis guides users and automated
                 optimizers to the causes of performance problems and
                 provides important insights for fixing them.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "performance",
  subject =      "{\bf C.4} Computer Systems Organization, PERFORMANCE
                 OF SYSTEMS, Performance attributes. {\bf D.2.2}
                 Software, SOFTWARE ENGINEERING, Design Tools and
                 Techniques. {\bf D.2.6} Software, SOFTWARE ENGINEERING,
                 Programming Environments. {\bf D.4.7} Software,
                 OPERATING SYSTEMS, Organization and Design. {\bf D.4.8}
                 Software, OPERATING SYSTEMS, Performance. {\bf D.4.0}
                 Software, OPERATING SYSTEMS, General.",
}
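%%% The Anderson:1997:CPW abstract above describes sampling-based,
%%% whole-system profiling driven by hardware performance counters.
%%% The toy Python sketch below illustrates only the statistical idea
%%% (attribute periodic samples of the current execution point to code
%%% locations) using a helper thread and CPython's
%%% sys._current_frames(); it bears no relation to the DCPI
%%% implementation itself, and the interval and workload are arbitrary.
%%%
%%%     import collections, sys, threading, time
%%%
%%%     samples = collections.Counter()
%%%     stop = threading.Event()
%%%
%%%     def sampler(target_ident, interval=0.001):
%%%         # Periodically record (function, line) of the target
%%%         # thread's current frame; the profile is statistical.
%%%         while not stop.is_set():
%%%             frame = sys._current_frames().get(target_ident)
%%%             if frame is not None:
%%%                 samples[(frame.f_code.co_name, frame.f_lineno)] += 1
%%%             time.sleep(interval)
%%%
%%%     def workload():
%%%         total = 0
%%%         for i in range(3_000_000):
%%%             total += i * i
%%%         return total
%%%
%%%     t = threading.Thread(target=sampler,
%%%                          args=(threading.main_thread().ident,))
%%%     t.start()
%%%     workload()
%%%     stop.set()
%%%     t.join()
%%%     print(samples.most_common(3))    # hottest (function, line) pairs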

@Article{Savage:1997:EDD,
  author =       "Stefan Savage and Michael Burrows and Greg Nelson and
                 Patrick Sobalvarro and Thomas Anderson",
  title =        "{Eraser}: a Dynamic Data Race Detector for
                 Multithreaded Programs",
  journal =      j-TOCS,
  volume =       "15",
  number =       "4",
  pages =        "391--411",
  month =        nov,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-4/p391-savage/",
  abstract =     "Multithreaded programming is difficult and error
                 prone. It is easy to make a mistake in synchronization
                 that produces a data race, yet it can be extremely hard
                 to locate this mistake during debugging. This article
                 describes a new tool, called Eraser, for dynamically
                 detecting data races in lock-based multithreaded
                 programs. Eraser uses binary rewriting techniques to
                 monitor every shared-memory reference and verify that
                 consistent locking behavior is observed. We present
                 several case studies, including undergraduate
                 coursework and a multithreaded Web search engine, that
                 demonstrate the effectiveness of this approach.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; experimentation; reliability",
  subject =      "{\bf D.2.5} Software, SOFTWARE ENGINEERING, Testing
                 and Debugging, Monitors. {\bf D.1.3} Software,
                 PROGRAMMING TECHNIQUES, Concurrent Programming,
                 Parallel programming. {\bf D.2.5} Software, SOFTWARE
                 ENGINEERING, Testing and Debugging, Debugging aids.
                 {\bf D.2.5} Software, SOFTWARE ENGINEERING, Testing and
                 Debugging, Tracing. {\bf D.4.1} Software, OPERATING
                 SYSTEMS, Process Management, Concurrency. {\bf D.4.1}
                 Software, OPERATING SYSTEMS, Process Management,
                 Deadlocks. {\bf D.4.1} Software, OPERATING SYSTEMS,
                 Process Management,
                 Multiprocessing/multiprogramming/multitasking. {\bf
                 D.4.1} Software, OPERATING SYSTEMS, Process Management,
                 Mutual exclusion.",
}
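%%% The Savage:1997:EDD abstract above describes Eraser's dynamic race
%%% detection.  The core of its Lockset algorithm can be sketched in a
%%% few lines of Python: keep, for each shared variable, the set of
%%% locks that has protected every access so far, and warn when that
%%% set becomes empty.  This sketch omits Eraser's variable state
%%% machine (virgin/exclusive/shared/shared-modified) and the binary
%%% rewriting that feeds it real memory references; the names are
%%% illustrative.
%%%
%%%     class LocksetChecker:
%%%         def __init__(self):
%%%             self.candidates = {}      # variable -> candidate lock set
%%%
%%%         def on_access(self, held_locks, var):
%%%             held = set(held_locks)
%%%             if var not in self.candidates:
%%%                 self.candidates[var] = held      # first access
%%%             else:
%%%                 self.candidates[var] &= held     # refine by intersection
%%%             if not self.candidates[var]:
%%%                 print(f"possible data race on {var!r}")
%%%
%%%     checker = LocksetChecker()
%%%     checker.on_access({"mu"}, "x")    # thread 1 accesses x holding mu
%%%     checker.on_access({"mu"}, "x")    # consistent: candidates stay {mu}
%%%     checker.on_access(set(), "x")     # unprotected access -> warning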

@Article{Bugnion:1997:DRC,
  author =       "Edouard Bugnion and Scott Devine and Kinshuk Govil and
                 Mendel Rosenblum",
  title =        "{Disco}: Running Commodity Operating Systems on
                 Scalable Multiprocessors",
  journal =      j-TOCS,
  volume =       "15",
  number =       "4",
  pages =        "412--447",
  month =        nov,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-4/p412-bugnion/",
  abstract =     "In this article we examine the problem of extending
                 modern operating systems to run efficiently on
                 large-scale shared-memory multiprocessors without a
                 large implementation effort. Our approach brings back
                 an idea popular in the 1970s: virtual machine monitors.
                 We use virtual machines to run multiple commodity
                 operating systems on a scalable multiprocessor. This
                 solution addresses many of the challenges facing the
                 system software for these machines. We demonstrate our
                 approach with a prototype called Disco that runs
                 multiple copies of Silicon Graphics' IRIX operating
                 system on a multiprocessor. Our experience shows that
                 the overheads of the monitor are small and that the
                 approach provides scalability as well as the ability to
                 deal with the nonuniform memory access time of these
                 systems. To reduce the memory overheads associated with
                 running multiple operating systems, virtual machines
                 transparently share major data structures such as the
                 program code and the file system buffer cache. We use
                 the distributed-system support of modern operating
                 systems to export a partial single system image to the
                 users. The overall solution achieves most of the
                 benefits of operating systems customized for scalable
                 multiprocessors, yet it can be achieved with a
                 significantly smaller implementation effort.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; performance",
  subject =      "{\bf D.4.7} Software, OPERATING SYSTEMS, Organization
                 and Design. {\bf C.1.2} Computer Systems Organization,
                 PROCESSOR ARCHITECTURES, Multiple Data Stream
                 Architectures (Multiprocessors), Parallel
                 processors**.",
}

@Article{Bal:1998:PEO,
  author =       "Henri E. Bal and Raoul Bhoedjang and Rutger Hofman and
                 Ceriel Jacobs and Koen Langendoen and Tim R{\"u}hl and
                 M. Frans Kaashoek",
  title =        "Performance Evaluation of the {Orca} Shared-Object
                 System",
  journal =      j-TOCS,
  volume =       "16",
  number =       "1",
  pages =        "1--40",
  month =        feb,
  year =         "1998",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1998-16-1/p1-bal/",
  abstract =     "Orca is a portable, object-based distributed shared
                 memory (DSM) system. This article studies and evaluates
                 the design choices made in the Orca system and compares
                 Orca with other DSMs. The article gives a quantitative
                 analysis of Orca's coherence protocol (based on
                 write-updates with function shipping), the totally
                 ordered group communication protocol, the strategy for
                 object placement, and the all-software, user-space
                 architecture. Performance measurements for 10 parallel
                 applications illustrate the trade-offs made in the
                 design of Orca and show that essentially the right
                 design decisions have been made. A write-update
                 protocol with function shipping is effective for Orca,
                 especially since it is used in combination with
                 techniques that avoid replicating objects that have a
                 low read/write ratio. The overhead of totally ordered
                 group communication on application performance is low.
                 The Orca system is able to make near-optimal decisions
                 for object placement and replication. In addition, the
                 article compares the performance of Orca with that of a
                 page-based DSM (TreadMarks) and another object-based
                 DSM (CRL). It also analyzes the communication overhead
                 of the DSMs for several applications. All performance
                 measurements are done on a 32-node Pentium Pro cluster
                 with Myrinet and Fast Ethernet networks. The results
                 show that Orca programs send fewer messages and less
                 data than the TreadMarks and CRL programs and obtain
                 better speedups.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; languages; performance",
  subject =      "{\bf D.3.4} Software, PROGRAMMING LANGUAGES,
                 Processors, Run-time environments. {\bf D.1.3}
                 Software, PROGRAMMING TECHNIQUES, Concurrent
                 Programming, Distributed programming. {\bf D.1.3}
                 Software, PROGRAMMING TECHNIQUES, Concurrent
                 Programming, Parallel programming. {\bf D.3.2}
                 Software, PROGRAMMING LANGUAGES, Language
                 Classifications, Concurrent, distributed, and parallel
                 languages. {\bf D.3.4} Software, PROGRAMMING LANGUAGES,
                 Processors, Compilers.",
}

@Article{Derk:1998:RFT,
  author =       "M. D. Derk and L. S. DeBrunner",
  title =        "Reconfiguration for Fault Tolerance Using Graph
                 Grammars",
  journal =      j-TOCS,
  volume =       "16",
  number =       "1",
  pages =        "41--54",
  month =        feb,
  year =         "1998",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1998-16-1/p41-derk/",
  abstract =     "Reconfiguration for fault tolerance is a widely
                 studied field, but this work applies graph grammars to
                 this discipline for the first time. Reconfiguration
                 Graph Grammars (RGG) are defined and applied to the
                 definition of processor array reconfiguration
                 algorithms. The nodes of a graph are associated with
                 the processors of a processor array, and the edges are
                 associated with those interprocessor communication
                 lines that are active. The resulting algorithms for
                 dynamic (run-time) reconfiguration are efficient and
                 can be implemented distributively.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; design; reliability; theory",
  subject =      "{\bf C.4} Computer Systems Organization, PERFORMANCE
                 OF SYSTEMS, Reliability, availability, and
                 serviceability. {\bf F.4.2} Theory of Computation,
                 MATHEMATICAL LOGIC AND FORMAL LANGUAGES, Grammars and
                 Other Rewriting Systems. {\bf C.1.2} Computer Systems
                 Organization, PROCESSOR ARCHITECTURES, Multiple Data
                 Stream Architectures (Multiprocessors),
                 Multiple-instruction-stream, multiple-data-stream
                 processors (MIMD).",
}

@Article{Mowry:1998:TLM,
  author =       "Todd C. Mowry",
  title =        "Tolerating Latency in Multiprocessors through
                 Compiler-Inserted Prefetching",
  journal =      j-TOCS,
  volume =       "16",
  number =       "1",
  pages =        "55--92",
  month =        feb,
  year =         "1998",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1998-16-1/p55-mowry/",
  abstract =     "The large latency of memory accesses in large-scale
                 shared-memory multiprocessors is a key obstacle to
                 achieving high processor utilization. {\em
                 Software-controlled prefetching\/} is a technique for
                 tolerating memory latency by explicitly executing
                 instructions to move data close to the processor before
                 the data are actually needed. To minimize the burden on
                 the programmer, compiler support is needed to
                 automatically insert prefetch instructions into the
                 code. A key challenge when inserting prefetches is
                 ensuring that the overheads of prefetching do not
                 outweigh the benefits. While previous studies have
                 demonstrated the effectiveness of hand-inserted
                 prefetching in multiprocessor applications, the benefit
                 of {\em compiler-inserted\/} prefetching in practice
                 has remained an open question. This article proposes
                 and evaluates a new compiler algorithm for inserting
                 prefetches into multiprocessor code. The proposed
                 algorithm attempts to minimize overheads by only
                 issuing prefetches for references that are predicted to
                 suffer cache misses. The algorithm can prefetch both
                 dense-matrix and sparse-matrix codes, thus covering a
                 large fraction of scientific applications. We have
                 implemented our algorithm in the SUIF (Stanford
                 University Intermediate Format) optimizing compiler.
                 The results of our detailed architectural simulations
                 demonstrate that compiler-inserted prefetching can
                 improve the speed of some parallel applications by as
                 much as a factor of two.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; performance",
  subject =      "{\bf D.3.4} Software, PROGRAMMING LANGUAGES,
                 Processors, Optimization. {\bf B.3.2} Hardware, MEMORY
                 STRUCTURES, Design Styles, Cache memories. {\bf D.3.4}
                 Software, PROGRAMMING LANGUAGES, Processors,
                 Compilers.",
}

@Article{Agarwal:1998:TMR,
  author =       "D. A. Agarwal and L. E. Moser and P. M. Melliar-Smith
                 and R. K. Budhia",
  title =        "The {Totem} Multiple-Ring Ordering and Topology
                 Maintenance Protocol",
  journal =      j-TOCS,
  volume =       "16",
  number =       "2",
  pages =        "93--132",
  month =        may,
  year =         "1998",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1998-16-2/p93-agarwal/",
  abstract =     "The Totem multiple-ring protocol provides reliable
                 totally ordered delivery of messages across multiple
                 local-area networks interconnected by gateways. This
                 consistent message order is maintained in the presence
                 of network partitioning and remerging, and of processor
                 failure and recovery. The protocol provides accurate
                 topology change information as part of the global total
                 order of messages. It addresses the issue of
                 scalability and achieves a latency that increases
                 logarithmically with system size by exploiting process
                 group locality and selective forwarding of messages
                 through the gateways. Pseudocode for the protocol and
                 an evaluation of its performance are given. ---Authors'
                 Abstract",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "algorithms; performance; reliability",
  subject =      "{\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols,
                 Protocol architecture. {\bf C.2.1} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS, Network
                 Architecture and Design, Network communications. {\bf
                 C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Network operating systems. {\bf C.4} Computer Systems
                 Organization, PERFORMANCE OF SYSTEMS, Fault
                 tolerance.",
}

@Article{Lamport:1998:PTP,
  author =       "Leslie Lamport",
  title =        "The Part-Time Parliament",
  journal =      j-TOCS,
  volume =       "16",
  number =       "2",
  pages =        "133--169",
  month =        may,
  year =         "1998",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1998-16-2/p133-lamport/",
  abstract =     "Recent archaeological discoveries on the island of
                 Paxos reveal that the parliament functioned despite the
                 peripatetic propensity of its part-time legislators.
                 The legislators maintained consistent copies of the
                 parliamentary record, despite their frequent forays
                 from the chamber and the forgetfulness of their
                 messengers. The Paxon parliament's protocol provides a
                 new way of implementing the state machine approach to
                 the design of distributed systems.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; reliability",
  subject =      "{\bf C.2.4} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Distributed Systems,
                 Network operating systems. {\bf D.4.5} Software,
                 OPERATING SYSTEMS, Reliability, Fault-tolerance. {\bf
                 J.1} Computer Applications, ADMINISTRATIVE DATA
                 PROCESSING, Government.",
}
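%%% The Lamport:1998:PTP abstract above introduces the Paxos consensus
%%% protocol in parliamentary dress.  The following is a minimal,
%%% failure-free, single-decree Python sketch of the underlying synod
%%% rules (in-process acceptors, integer ballot numbers, no message
%%% loss and no multi-decree parliament); it illustrates the two phases
%%% only and is not a faithful implementation of the article's
%%% protocol.
%%%
%%%     class Acceptor:
%%%         def __init__(self):
%%%             self.promised = -1
%%%             self.accepted = None              # (ballot, value) or None
%%%
%%%         def prepare(self, ballot):
%%%             if ballot > self.promised:
%%%                 self.promised = ballot
%%%                 return ("promise", self.accepted)
%%%             return ("reject", None)
%%%
%%%         def accept(self, ballot, value):
%%%             if ballot >= self.promised:
%%%                 self.promised = ballot
%%%                 self.accepted = (ballot, value)
%%%                 return "accepted"
%%%             return "reject"
%%%
%%%     def propose(acceptors, ballot, value):
%%%         # Phase 1: gather promises from a majority.
%%%         replies = [a.prepare(ballot) for a in acceptors]
%%%         granted = [acc for verdict, acc in replies if verdict == "promise"]
%%%         if len(granted) <= len(acceptors) // 2:
%%%             return None
%%%         # Adopt the value of the highest-ballot proposal already accepted.
%%%         prior = [acc for acc in granted if acc is not None]
%%%         if prior:
%%%             value = max(prior)[1]
%%%         # Phase 2: ask the acceptors to accept (ballot, value).
%%%         votes = [a.accept(ballot, value) for a in acceptors]
%%%         if votes.count("accepted") > len(acceptors) // 2:
%%%             return value                      # value is chosen
%%%         return None
%%%
%%%     acceptors = [Acceptor() for _ in range(3)]
%%%     print(propose(acceptors, ballot=1, value="decree A"))   # decree A
%%%     print(propose(acceptors, ballot=2, value="decree B"))   # still decree A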

@Article{Horowitz:1998:IMO,
  author =       "Mark Horowitz and Margaret Martonosi and Todd C.
                 Mowry and Michael D. Smith",
  title =        "Informing Memory Operations: Memory Performance
                 Feedback Mechanisms and Their Applications",
  journal =      j-TOCS,
  volume =       "16",
  number =       "2",
  pages =        "170--205",
  month =        may,
  year =         "1998",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1998-16-2/p170-horowitz/",
  abstract =     "Memory latency is an important bottleneck in system
                 performance that cannot be adequately solved by
                 hardware alone. Several promising software techniques
                 have been shown to address this problem successfully in
                 specific situations. However, the generality of these
                 software approaches has been limited because current
                 architectures do not provide a fine-grained,
                 low-overhead mechanism for observing and reacting to
                 memory behavior directly. To fill this need, this
                 article proposes a new class of memory operations
                 called {\em informing memory operations}, which
                  essentially consist of a memory operation combined
                  (either implicitly or explicitly) with a conditional
                  branch-and-link operation that is taken only if the
                 reference suffers a cache miss. This article describes
                 two different implementations of informing memory
                 operations. One is based on a {\em cache-outcome
                 condition code}, and the other is based on {\em
                 low-overhead traps.\/} We find that modern
                 in-order-issue and out-of-order-issue superscalar
                 processors already contain the bulk of the necessary
                 hardware support. We describe how a number of
                 software-based memory optimizations can exploit
                 informing memory operations to enhance performance, and
                 we look at cache coherence with fine-grained access
                 control as a case study. Our performance results
                 demonstrate that the runtime overhead of invoking the
                 informing mechanism on the Alpha 21164 and MIPS R10000
                 processors is generally small enough to provide
                 considerable flexibility to hardware and software
                 designers, and that the cache coherence application has
                 improved performance compared to other current
                 solutions. We believe that the inclusion of informing
                 memory operations in future processors may spur even
                 more innovative performance optimizations.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; performance",
  subject =      "{\bf B.3.2} Hardware, MEMORY STRUCTURES, Design
                 Styles, Cache memories. {\bf C.4} Computer Systems
                 Organization, PERFORMANCE OF SYSTEMS, Measurement
                 techniques. {\bf D.3.4} Software, PROGRAMMING
                 LANGUAGES, Processors, Compilers. {\bf B.8.2} Hardware,
                 PERFORMANCE AND RELIABILITY, Performance Analysis and
                 Design Aids.",
}
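
%%% The informing-memory-operation idea above (a load that also reports
%%% whether it missed, so software can branch to a lightweight handler)
%%% can be mocked up in software.  The Python sketch below is the
%%% annotator's illustrative assumption, not the hardware interface of
%%% the article: a toy direct-mapped cache whose load returns
%%% (value, miss).
%%%
%%%   class DirectMappedCache:
%%%       def __init__(self, lines, memory):
%%%           self.lines = lines
%%%           self.tags = [None] * lines
%%%           self.memory = memory
%%%
%%%       def informing_load(self, addr):
%%%           idx, tag = addr % self.lines, addr // self.lines
%%%           miss = self.tags[idx] != tag
%%%           self.tags[idx] = tag              # fill the line on a miss
%%%           return self.memory[addr], miss
%%%
%%%   def miss_handler(addr, stats):
%%%       stats[addr] = stats.get(addr, 0) + 1  # e.g. feed a prefetcher
%%%
%%%   cache = DirectMappedCache(lines=8, memory=list(range(64)))
%%%   stats, total = {}, 0
%%%   for addr in [0, 1, 2, 0, 8, 0, 1]:
%%%       value, miss = cache.informing_load(addr)
%%%       if miss:                  # the branch taken only on a cache miss
%%%           miss_handler(addr, stats)
%%%       total += value
%%%   print(total, stats)           # 12 {0: 2, 1: 1, 2: 1, 8: 1}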

@Article{Alexandrov:1998:UPG,
  author =       "Albert D. Alexandrov and Maximilian Ibel and Klaus E.
                 Schauser and Chris J. Scheiman",
  title =        "{Ufo}: a Personal Global File System Based on
                 User-Level Extensions to the Operating System",
  journal =      j-TOCS,
  volume =       "16",
  number =       "3",
  pages =        "207--233",
  month =        aug,
  year =         "1998",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1998-16-3/p207-alexandrov/",
  abstract =     "In this article we show how to extend a wide range of
                  functionality of standard operating systems completely
                 at the user level. Our approach works by intercepting
                 selected system calls at the user level, using tracing
                 facilities such as the /proc file system provided by
                 many Unix operating systems. The behavior of some
                 intercepted system calls is then modified to implement
                 new functionality. This approach does not require any
                 relinking or recompilation of existing applications. In
                 fact, the extensions can even be dynamically
                 ``installed'' into already running processes. The
                 extensions work completely at the user level and
                 install without system administrator assistance.
                 Individual users can choose what extensions to run, in
                 effect creating a personalized operating system view
                 for themselves. We used this approach to implement a
                 global file system, called Ufo, which allows users to
                 treat remote files exactly as if they were local.
                 Currently, Ufo supports file access through the FTP and
                 HTTP protocols and allows new protocols to be plugged
                 in. While several other projects have implemented
                 global file system abstractions, they all require
                 either changes to the operating system or modifications
                 to standard libraries. The article gives a detailed
                 performance analysis of our approach to extending the
                 OS and establishes that Ufo introduces acceptable
                 overhead for common applications even though
                 intercepting individual system calls incurs a high
                 cost.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "performance",
  subject =      "{\bf D.4.3} Software, OPERATING SYSTEMS, File Systems
                 Management. {\bf D.4.3} Software, OPERATING SYSTEMS,
                 File Systems Management, Distributed file systems. {\bf
                 D.4.3} Software, OPERATING SYSTEMS, File Systems
                 Management, Access methods.",
}

@Article{Gabbay:1998:UVP,
  author =       "Freddy Gabbay and Avi Mendelson",
  title =        "Using Value Prediction to Increase the Power of
                 Speculative Execution Hardware",
  journal =      j-TOCS,
  volume =       "16",
  number =       "3",
  pages =        "234--270",
  month =        aug,
  year =         "1998",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1998-16-3/p234-gabbay/",
  abstract =     "This article presents an experimental and analytical
                 study of value prediction and its impact on speculative
                 execution in superscalar microprocessors. Value
                 prediction is a new paradigm that suggests predicting
                  outcome values of operations (at run-time) and using
                 these predicted values to trigger the execution of
                 true-data-dependent operations speculatively. As a
                  result, stalls to memory locations can be reduced and
                 the amount of instruction-level parallelism can be
                 extended beyond the limits of the program's dataflow
                 graph. This article examines the characteristics of the
                 value prediction concept from two perspectives: (1) the
                 related phenomena that are reflected in the nature of
                 computer programs and (2) the significance of these
                 phenomena to boosting instruction-level parallelism of
                 superscalar microprocessors that support speculative
                 execution. In order to better understand these
                 characteristics, our work combines both analytical and
                 experimental studies.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; measurement; performance",
  subject =      "{\bf C.0} Computer Systems Organization, GENERAL,
                 System architectures. {\bf C.1.1} Computer Systems
                 Organization, PROCESSOR ARCHITECTURES, Single Data
                 Stream Architectures, RISC. {\bf C.5.3} Computer
                 Systems Organization, COMPUTER SYSTEM IMPLEMENTATION,
                 Microcomputers, Microprocessors. {\bf C.0} Computer
                 Systems Organization, GENERAL, Instruction set
                 design.",
}
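
%%% A last-value predictor is the simplest instance of the value
%%% prediction idea summarized above.  The sketch below is the
%%% annotator's toy illustration (a table indexed by instruction
%%% address, predicting that the previous result recurs), not the
%%% predictors evaluated in the article.
%%%
%%%   class LastValuePredictor:
%%%       def __init__(self):
%%%           self.table = {}              # pc -> last observed value
%%%
%%%       def predict(self, pc):
%%%           return self.table.get(pc)    # None means "no prediction"
%%%
%%%       def update(self, pc, actual):
%%%           self.table[pc] = actual
%%%
%%%   pred = LastValuePredictor()
%%%   correct = total = 0
%%%   stream = [("pc_load", 42)] * 9 + [("pc_load", 7)]
%%%   for pc, actual in stream:
%%%       correct += pred.predict(pc) == actual
%%%       total += 1
%%%       pred.update(pc, actual)
%%%   print(correct / total)               # 0.8 for this repetitive stream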

@Article{Juurlink:1998:QCP,
  author =       "Ben H. H. Juurlink and Harry A. G. Wijshoff",
  title =        "A Quantitative Comparison of Parallel Computation
                 Models",
  journal =      j-TOCS,
  volume =       "16",
  number =       "3",
  pages =        "271--318",
  month =        aug,
  year =         "1998",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1998-16-3/p271-juurlink/",
  abstract =     "In recent years, a large number of parallel
                 computation models have been proposed to replace the
                 PRAM as the parallel computation model presented to the
                 algorithm designer. Although mostly the theoretical
                 justifications for these models are sound, and many
                  algorithmic results were obtained through these
                 models, little experimentation has been conducted to
                 validate the effectiveness of these models for
                 developing cost-effective algorithms and applications
                 on existing hardware platforms. In this article a first
                 attempt is made to perform a detailed experimental
                  account on the preciseness of these models. To achieve
                 this, three models (BSP, E-BSP, and BPRAM) were
                 selected and validated on five parallel platforms (Cray
                 T3E, Thinking Machines CM-5, Intel Paragon, MasPar
                 MP-1, and Parsytec GCel). The work described in this
                 article consists of three parts. First, the predictive
                 capabilities of the models are investigated. Unlike
                 previous experimental work, which mostly demonstrated a
                 close match between the measured and predicted
                 execution times, this article shows that there are
                 several situations in which the models do not precisely
                 predict the actual runtime behavior of an algorithm
                 implementation. Second, a comparison between the models
                 is provided in order to determine the model that
                  induces the most efficient algorithms. Lastly, the
                 performance achieved by the model-derived algorithms is
                 compared with the performance attained by
                 machine-specific algorithms in order to examine the
                 effectiveness of deriving fast algorithms through the
                 formalisms of the models.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "experimentation; performance",
  subject =      "{\bf C.1.4} Computer Systems Organization, PROCESSOR
                 ARCHITECTURES, Parallel Architectures. {\bf C.4}
                 Computer Systems Organization, PERFORMANCE OF SYSTEMS,
                 Modeling techniques. {\bf D.1.3} Software, PROGRAMMING
                 TECHNIQUES, Concurrent Programming, Parallel
                 programming.",
}

@Article{Bhatti:1998:CSC,
  author =       "Nina T. Bhatti and Matti A. Hiltunen and Richard D.
                 Schlichting and Wanda Chiu",
  title =        "{Coyote}: a system for constructing fine-grain
                 configurable communication services",
  journal =      j-TOCS,
  volume =       "16",
  number =       "4",
  pages =        "321--366",
  month =        nov,
  year =         "1998",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jul 26 16:27:34 MDT 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1998-16-4/p321-bhatti/",
  abstract =     "Communication-oriented abstractions such as atomic
                 multicast, group RPC, and protocols for
                 location-independent mobile computing can simplify the
                 development of complex applications built on
                 distributed systems. This article describes Coyote, a
                 system that supports the construction of highly modular
                 and configurable versions of such abstractions. Coyote
                 extends the notion of protocol objects and hierarchical
                 composition found in existing systems with support for
                 finer-grain microprotocol objects and a nonhierarchical
                 composition scheme for use within a single layer of a
                 protocol stack. A customized service is constructed by
                 selecting microprotocols based on their semantic
                 guarantees and configuring them together with a
                 standard runtime system to form a composite protocol
                 implementing the service. This composite protocol is
                 then composed hierarchically with other protocols to
                 form a complete network subsystem. The overall approach
                 is described and illustrated with examples of services
                 that have been constructed using Coyote, including
                 atomic multicast, group RPC, membership, and mobile
                 computing protocols. A prototype implementation based
                 on extending {\em x\/}-kernel version 3.2 running on
                 Mach 3.0 with support for microprotocols is also
                 presented, together with performance results from a
                 suite of microprotocols from which over 60 variants of
                 group RPC can be constructed.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "design; experimentation; performance; reliability",
  subject =      "{\bf C.2.2} Computer Systems Organization,
                 COMPUTER-COMMUNICATION NETWORKS, Network Protocols,
                 Protocol architecture. {\bf C.2.4} Computer Systems
                 Organization, COMPUTER-COMMUNICATION NETWORKS,
                 Distributed Systems, Distributed applications. {\bf
                 D.1.3} Software, PROGRAMMING TECHNIQUES, Concurrent
                 Programming, Distributed programming. {\bf D.4.4}
                 Software, OPERATING SYSTEMS, Communications Management,
                 Network communication. {\bf D.4.5} Software, OPERATING
                 SYSTEMS, Reliability, Fault-tolerance. {\bf D.4.7}
                 Software, OPERATING SYSTEMS, Organization and Design,
                 Distributed systems. {\bf D.2.13} Software, SOFTWARE
                 ENGINEERING, Reusable Software.",
}

@Article{Epema:1998:DUS,
  author =       "D. H. J. Epema",
  title =        "Decay-usage scheduling in multiprocessors",
  journal =      j-TOCS,
  volume =       "16",
  number =       "4",
  pages =        "367--415",
  month =        nov,
  year =         "1998",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jul 26 16:27:34 MDT 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1998-16-4/p367-epema/",
  abstract =     "Decay-usage scheduling is a priority-aging
                 time-sharing scheduling policy capable of dealing with
                 a workload of both interactive and batch jobs by
                 decreasing the priority of a job when it acquires CPU
                 time, and by increasing its priority when it does not
                 use the (a) CPU. In this article we deal with a
                 decay-usage scheduling policy in multiprocessors
                 modeled after widely used systems. The priority of a
                 job consists of a base priority and a time-dependent
                  component based on processor usage. Because the
                 priorities in our model are time dependent, a
                 queuing-theoretic analysis---for instance, for the mean
                 job response time---seems impossible. Still, it turns
                 out that as a consequence of the scheduling policy, the
                 shares of the available CPU time obtained by jobs
                 converge, and a deterministic analysis for these shares
                 is feasible: We show how for a fixed set of jobs with
                 large processing demands, the steady-state shares can
                 be obtained given the base priorities, and conversely,
                 how to set the base priorities given the required
                 shares. In addition, we analyze the relation between
                 the values of the scheduler parameters and the level of
                 control it can exercise over the steady-state share
                 ratios, and we deal with the rate of convergence. We
                 validate the model by simulations and by measurements
                 of actual systems.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "measurement; performance",
  subject =      "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
                 Management,
                 Multiprocessing/multiprogramming/multitasking. {\bf
                 D.4.1} Software, OPERATING SYSTEMS, Process Management,
                 Scheduling. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance, Measurements. {\bf D.4.8} Software,
                 OPERATING SYSTEMS, Performance, Modeling and
                 prediction. {\bf D.4.8} Software, OPERATING SYSTEMS,
                 Performance, Simulation.",
}
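
%%% The decay-usage policy described above can be caricatured in a few
%%% lines: a job's effective priority is its base priority plus a
%%% decayed account of the CPU time it has received.  The constants and
%%% names below are the annotator's assumptions, not the article's
%%% parameter values or model.
%%%
%%%   DECAY = 0.5                  # fraction of usage kept per decay tick
%%%
%%%   class Job:
%%%       def __init__(self, name, base):
%%%           self.name, self.base, self.usage = name, base, 0.0
%%%
%%%       def priority(self):      # lower value = runs sooner
%%%           return self.base + self.usage
%%%
%%%   def run_quantum(jobs, quantum=1.0):
%%%       job = min(jobs, key=lambda j: j.priority())
%%%       job.usage += quantum     # charge the CPU time it just received
%%%       return job.name
%%%
%%%   def decay_tick(jobs):
%%%       for j in jobs:           # priorities recover as usage decays
%%%           j.usage *= DECAY
%%%
%%%   jobs = [Job("interactive", base=0), Job("batch", base=4)]
%%%   trace = []
%%%   for t in range(20):
%%%       trace.append(run_quantum(jobs))
%%%       if t % 4 == 3:
%%%           decay_tick(jobs)
%%%   print(trace)                 # batch gets quanta once usage builds up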

@Article{Srinivasan:1999:FAL,
  author =       "V. Srinivasan and G. Varghese",
  title =        "Fast address lookups using controlled prefix
                 expansion",
  journal =      j-TOCS,
  volume =       "17",
  number =       "1",
  pages =        "1--40",
  month =        feb,
  year =         "1999",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Sep 26 07:54:31 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/1999-17-1/p1-srinivasan/",
  abstract =     "Internet (IP) address lookup is a major bottleneck in
                 high-performance routers. IP address lookup is
                 challenging because it requires a {\em longest matching
                 prefix\/} lookup. It is compounded by increasing
                 routing table sizes, increased traffic, higher-speed
                 links, and the migration to 128-bit IPv6 addresses. We
                 describe how IP lookups and updates can be made faster
                 using a set of transformation techniques. Our main
                 technique, {\em controlled prefix expansion},
                 transforms a set of prefixes into an equivalent set
                 with fewer prefix lengths. In addition, we use
                 optimization techniques based on dynamic programming,
                 and local transformations of data structures to improve
                 cache behavior. When applied to trie search, our
                 techniques provide a range of algorithms ({\em Expanded
                 Tries\/}) whose performance can be tuned. For example,
                 using a processor with 1MB of L2 cache, search of the
                 MaeEast database containing 38000 prefixes can be done
                 in 3 L2 cache accesses. On a 300MHz Pentium II which
                 takes 4 cycles for accessing the first word of the L2
                 cacheline, this algorithm has a worst-case search time
                 of 180 nsec., a worst-case insert/delete time of 2.5
                 msec., and an average insert/delete time of 4 usec.
                 Expanded tries provide faster search {\em and\/} faster
                 insert/delete times than earlier lookup algorithms.
                 When applied to Binary Search on Levels, our techniques
                 improve worst-case search times by nearly a factor of 2
                 (using twice as much storage) for the MaeEast database.
                 Our approach to algorithm design is based on
                 measurements using the VTune tool on a Pentium to
                 obtain dynamic clock cycle counts. Our techniques also
                 apply to similar address lookup problems in other
                 network protocols.",
  acknowledgement = ack-nhfb,
  generalterms = "Design; Performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "binary search on levels; controlled prefix expansion;
                 expanded tries; Internet address lookup; longest-prefix
                 match; multibit tries; router performance",
  subject =      "Computer Systems Organization ---
                 Computer-Communication Networks --- Local and Wide-Area
                 Networks (C.2.5): {\bf Internet}; Computer Systems
                 Organization --- Computer-Communication Networks ---
                 Network Protocols (C.2.2): {\bf Routing protocols};
                 Computer Systems Organization ---
                 Computer-Communication Networks --- Internetworking
                 (C.2.6): {\bf Routers}",
}
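
%%% Controlled prefix expansion, the central transformation above, can
%%% be illustrated with bit strings: every prefix is expanded to the
%%% next multiple of a chosen stride, so a lookup can step through the
%%% address a fixed number of bits at a time.  The flat dictionary
%%% below stands in for the multibit trie of the article and is the
%%% annotator's simplification.
%%%
%%%   def expand(prefixes, stride):
%%%       """prefixes: dict of bit-string prefix -> next hop."""
%%%       expanded = {}
%%%       for p in sorted(prefixes, key=len):   # more specific last, so it wins
%%%           target = -(-len(p) // stride) * stride   # round length up
%%%           pad = target - len(p)
%%%           for i in range(2 ** pad):
%%%               suffix = format(i, "b").zfill(pad) if pad else ""
%%%               expanded[p + suffix] = prefixes[p]
%%%       return expanded
%%%
%%%   def lookup(expanded, stride, addr_bits):
%%%       best = None
%%%       for l in range(stride, len(addr_bits) + 1, stride):
%%%           best = expanded.get(addr_bits[:l], best)
%%%       return best
%%%
%%%   table = {"0": "A", "01": "B", "0110": "C"}
%%%   exp = expand(table, stride=2)
%%%   print(lookup(exp, 2, "011011"), lookup(exp, 2, "001101"))   # C A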

@Article{Birman:1999:BM,
  author =       "Kenneth P. Birman and Mark Hayden and Oznur Ozkasap
                 and Zhen Xiao and Mihai Budiu and Yaron Minsky",
  title =        "Bimodal multicast",
  journal =      j-TOCS,
  volume =       "17",
  number =       "2",
  pages =        "41--88",
  month =        may,
  year =         "1999",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Sep 26 07:54:31 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/1999-17-2/p41-birman/",
  abstract =     "There are many methods for making a multicast protocol
                 ``reliable.'' At one end of the spectrum, a reliable
                 multicast protocol might offer atomicity guarantees,
                 such as all-or-nothing delivery, delivery ordering, and
                 perhaps additional properties such as virtually
                 synchronous addressing. At the other are protocols that
                 use local repair to overcome transient packet loss in
                 the network, offering ``best effort'' reliability. Yet
                 none of this prior work has treated stability of
                 multicast delivery as a basic reliability property,
                 such as might be needed in an internet radio,
                 television, or conferencing application. This article
                 looks at reliability with a new goal: development of a
                 multicast protocol which is reliable in a sense that
                 can be rigorously quantified and includes throughput
                 stability guarantees. We characterize this new protocol
                 as a ``bimodal multicast'' in reference to its
                 reliability model, which corresponds to a family of
                 bimodal probability distributions. Here, we introduce
                 the protocol, provide a theoretical analysis of its
                 behavior, review experimental results, and discuss some
                 candidate applications. These confirm that bimodal
                 multicast is reliable, scalable, and that the protocol
                 provides remarkably stable delivery throughput.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  subject =      "Computer Systems Organization ---
                 Computer-Communication Networks --- Network
                 Architecture and Design (C.2.1): {\bf Network
                 communications}",
}
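
%%% Bimodal multicast pairs a best-effort multicast with gossip-based
%%% repair.  The sketch below is the annotator's toy rendering of the
%%% repair step only (push-pull anti-entropy over message sets); the
%%% real protocol gossips digests, bounds buffering, and much more.
%%%
%%%   import random
%%%
%%%   class Process:
%%%       def __init__(self):
%%%           self.messages = {}                 # seqno -> payload
%%%
%%%       def gossip_round(self, peers):
%%%           peer = random.choice([p for p in peers if p is not self])
%%%           for s in self.messages.keys() - peer.messages.keys():
%%%               peer.messages[s] = self.messages[s]    # push what it lacks
%%%           for s in peer.messages.keys() - self.messages.keys():
%%%               self.messages[s] = peer.messages[s]    # pull what we lack
%%%
%%%   procs = [Process() for _ in range(10)]
%%%   for seq in range(5):                       # lossy initial multicast
%%%       for p in procs:
%%%           if random.random() < 0.8:
%%%               p.messages[seq] = "msg-%d" % seq
%%%   for _ in range(5):                         # a few gossip rounds
%%%       for p in procs:
%%%           p.gossip_round(procs)
%%%   print([len(p.messages) for p in procs])    # almost always all 5s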

@Article{Diniz:1999:ESO,
  author =       "Pedro C. Diniz and Martin C. Rinard",
  title =        "Eliminating synchronization overhead in automatically
                 parallelized programs using dynamic feedback",
  journal =      j-TOCS,
  volume =       "17",
  number =       "2",
  pages =        "89--132",
  month =        may,
  year =         "1999",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Sep 26 07:54:31 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/1999-17-2/p89-diniz/",
  abstract =     "This article presents dynamic feedback, a technique
                 that enables computations to adapt dynamically to
                 different execution environments. A compiler that uses
                 dynamic feedback produces several different versions of
                 the same source code; each version uses a different
                 optimization policy. The generated code alternately
                 performs sampling phases and production phases. Each
                 sampling phase measures the overhead of each version in
                 the current environment. Each production phase uses the
                 version with the least overhead in the previous
                 sampling phase. The computation periodically resamples
                 to adjust dynamically to changes in the environment. We
                 have implemented dynamic feedback in the context of a
                 parallelizing compiler for object-based programs. The
                 generated code uses dynamic feedback to automatically
                 choose the best synchronization optimization policy.
                 Our experimental results show that the synchronization
                 optimization policy has a significant impact on the
                 overall performance of the computation, that the best
                 policy varies from program to program, that the
                 compiler is unable to statically choose the best
                 policy, and that dynamic feedback enables the generated
                 code to exhibit performance that is comparable to that
                 of code that has been manually tuned to use the best
                 policy. We have also performed a theoretical analysis
                 which provides, under certain assumptions, a guaranteed
                 optimality bound for dynamic feedback relative to a
                 hypothetical (and unrealizable) optimal algorithm that
                 uses the best policy at every point during the
                 execution.",
  acknowledgement = ack-nhfb,
  generalterms = "Measurement; Performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "parallel computing; parallelizing compilers",
  subject =      "Computer Systems Organization --- Performance of
                 Systems (C.4): {\bf Measurement techniques}; Software
                 --- Programming Techniques --- Concurrent Programming
                 (D.1.3); Software --- Programming Techniques ---
                 Object-oriented Programming (D.1.5); Software ---
                 Programming Languages --- Processors (D.3.4): {\bf Code
                 generation}; Software --- Programming Languages ---
                 Processors (D.3.4): {\bf Compilers}; Software ---
                 Programming Languages --- Processors (D.3.4): {\bf
                 Optimization}; Software --- Programming Languages ---
                 Processors (D.3.4): {\bf Run-time environments};
                 Software --- Programming Techniques --- Concurrent
                 Programming (D.1.3): {\bf Parallel programming};
                 Computer Systems Organization --- Performance of
                 Systems (C.4): {\bf Design studies}",
}
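
%%% Dynamic feedback, as described above, alternates short sampling
%%% phases (time every code version) with long production phases (run
%%% the cheapest one).  The driver below is the annotator's schematic
%%% sketch of that control loop, not the compiler-generated code of the
%%% article.
%%%
%%%   import time
%%%
%%%   def dynamic_feedback(versions, items, sample=10, production=200):
%%%       i = 0
%%%       while i < len(items):
%%%           timings = []
%%%           for version in versions:               # sampling phase
%%%               start = time.perf_counter()
%%%               for item in items[i:i + sample]:
%%%                   version(item)
%%%               timings.append(time.perf_counter() - start)
%%%               i += sample
%%%           best = versions[timings.index(min(timings))]
%%%           for item in items[i:i + production]:   # production phase
%%%               best(item)
%%%           i += production
%%%
%%%   # Two interchangeable versions of the same computation:
%%%   slow = lambda n: sum(range(n))
%%%   fast = lambda n: n * (n - 1) // 2
%%%   dynamic_feedback([slow, fast], [2000] * 1000)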

@Article{Ronsse:1999:RFI,
  author =       "Michiel Ronsse and Koen {De Bosschere}",
  title =        "{RecPlay}: a fully integrated practical record\slash
                 replay system",
  journal =      j-TOCS,
  volume =       "17",
  number =       "2",
  pages =        "133--152",
  month =        may,
  year =         "1999",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Sep 26 07:54:31 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/1999-17-2/p133-ronsse/",
  abstract =     "This article presents a practical solution for the
                 cyclic debugging of nondeterministic parallel programs.
                 The solution consists of a combination of record\slash
                 replay with automatic on-the-fly data race detection.
                 This combination enables us to limit the record phase
                 to the more efficient recording of the synchronization
                 operations, while deferring the time-consuming data
                 race detection to the replay phase. As the record phase
                 is highly efficient, there is no need to switch it off,
                  thereby eliminating the possibility of Heisenbugs
                 because tracing can be left on all the time. This
                 article describes an implementation of the tools needed
                 to support RecPlay.",
  acknowledgement = ack-nhfb,
  generalterms = "Algorithms; Experimentation; Reliability",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "binary code modification; multithreaded programming;
                 race detection",
  subject =      "Software --- Programming Techniques --- Concurrent
                 Programming (D.1.3): {\bf Parallel programming};
                 Software --- Software Engineering --- Testing and
                 Debugging (D.2.5): {\bf Debugging aids}; Software ---
                 Software Engineering --- Testing and Debugging (D.2.5):
                 {\bf Monitors}; Software --- Software Engineering ---
                 Testing and Debugging (D.2.5): {\bf Tracing}; Software
                 --- Operating Systems --- Process Management (D.4.1):
                 {\bf Concurrency}; Software --- Operating Systems ---
                 Process Management (D.4.1): {\bf Deadlocks}; Software
                 --- Operating Systems --- Process Management (D.4.1):
                 {\bf Multiprocessing/multiprogramming/multitasking};
                 Software --- Operating Systems --- Process Management
                 (D.4.1): {\bf Mutual exclusion}; Software --- Operating
                 Systems --- Process Management (D.4.1): {\bf
                 Synchronization}",
}

@Article{Amsaleg:1999:GCC,
  author =       "Laurent Amsaleg and Michael J. Franklin and Olivier
                 Gruber",
  title =        "Garbage collection for a client-server persistent
                 object store",
  journal =      j-TOCS,
  volume =       "17",
  number =       "3",
  pages =        "153--201",
  month =        aug,
  year =         "1999",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Sep 26 07:54:31 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/1999-17-3/p153-amsaleg/",
  abstract =     "We describe an efficient server-based algorithm for
                 garbage collecting persistent object stores in a
                 client-server environment. The algorithm is incremental
                 and runs concurrently with client transactions. Unlike
                 previous algorithms, it does not hold any transactional
                  locks on data and does not require callbacks to
                 clients. It is fault-tolerant, but performs very little
                 logging. The algorithm has been designed to be
                 integrated into existing systems, and therefore it
                 works with standard implementation techniques such as
                 Two-Phase Locking and Write-Ahead-Logging. In addition,
                 it supports client-server performance optimizations
                 such as client caching and flexible management of
                 client buffers. We describe an implementation of the
                 algorithm in the EXODUS storage manager and present the
                 results of a performance study of the implementation.",
  acknowledgement = ack-nhfb,
  generalterms = "Algorithms; Measurement; Performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "client-server system; logging; persistent
                 object-store; recovery",
  subject =      "Software --- Operating Systems --- Storage Management
                 (D.4.2): {\bf Garbage collection}; Information Systems
                 --- Database Management --- Systems (H.2.4): {\bf
                 Distributed databases}; Information Systems ---
                 Database Management --- Systems (H.2.4): {\bf
                 Object-oriented databases}; Information Systems ---
                 Database Management --- Systems (H.2.4): {\bf
                 Transaction processing}",
}

@Article{Raghavachari:1999:ALP,
  author =       "Mukund Raghavachari and Anne Rogers",
  title =        "{Ace}: a language for parallel programming with
                 customizable protocols",
  journal =      j-TOCS,
  volume =       "17",
  number =       "3",
  pages =        "202--248",
  month =        aug,
  year =         "1999",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Sep 26 07:54:31 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/1999-17-3/p202-raghavachari/",
  abstract =     "Customizing the protocols that manage accesses to
                 different data structures within an application can
                 improve the performance of software shared-memory
                 programs substantially. Existing systems for using
                 customizable protocols are hard to use directly because
                 the mechanisms they provide for manipulating protocols
                 are low-level ones. This article is an in-depth study
                 of the issues involved in providing language support
                 for application-specific protocols. We describe the
                 design and implementation of a new language for
                 parallel programming, Ace, that integrates support for
                 customizable protocols with minimal extensions to C.
                 Ace applications are developed using a shared-memory
                 model with a default sequentially consistent protocol.
                 Performance can then be optimized, with minor
                 modifications to the application, by experimenting with
                 different protocol libraries. The design of Ace was
                 driven by a detailed study of the use of customizable
                 protocols. We delineate the issues that arise when
                 programming with customizable protocols and present
                 novel abstractions that allow for their easy use. We
                 describe the design and implementation of a runtime
                  system and compiler for Ace and discuss compiler
                 optimizations that improve the performance of such
                 software shared-memory systems. We study the
                 communication patterns of a set of benchmark
                 applications and consider the use of customizable
                 protocols to optimize their performance. We evaluate
                 the performance of our system through experiments on a
                  Thinking Machines CM-5 and a Cray T3E. We also present
                 measurements that demonstrate that Ace has good
                 performance compared to that of a modern distributed
                 shared-memory system.",
  acknowledgement = ack-nhfb,
  generalterms = "Design; Performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "parallel processing",
  subject =      "Software --- Programming Languages --- Language
                 Constructs and Features (D.3.3); Software ---
                 Programming Languages --- Processors (D.3.4): {\bf
                 Compilers}; Software --- Programming Languages ---
                 Processors (D.3.4): {\bf Run-time environments};
                 Software --- Programming Languages --- Language
                 Classifications (D.3.2); Software --- Programming
                 Techniques --- Concurrent Programming (D.1.3): {\bf
                 Parallel programming}",
}

@Article{Hari:1999:APS,
  author =       "Adiseshu Hari and George Varghese and Guru Parulkar",
  title =        "An architecture for packet-striping protocols",
  journal =      j-TOCS,
  volume =       "17",
  number =       "4",
  pages =        "249--287",
  month =        nov,
  year =         "1999",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Sep 26 07:54:31 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/articles/journals/tocs/1999-17-4/p249-hari/p249-hari.pdf;
                 http://www.acm.org/pubs/citations/journals/tocs/1999-17-4/p249-hari/",
  abstract =     "Link-striping algorithms are often used to overcome
                 transmission bottlenecks in computer networks.
                 Traditional striping algorithms suffer from two major
                 disadvantages. They provide inadequate load sharing in
                 the presence of variable-length packets, and may result
                 in non-FIFO delivery of data. We describe a new family
                 of link-striping algorithms that solves both problems.
                 Our scheme applies to any layer that can provide
                 multiple FIFO channels. We deal with variable-sized
                 packets by showing how fair-queuing algorithms can be
                 transformed into load-sharing algorithms. Our
                 transformation results in practical load-sharing
                 protocols, and shows a theoretical connection between
                 two seemingly different problems. The same
                 transformation can be applied to obtain load-sharing
                 protocols for links with different capacities. We deal
                 with the FIFO requirement for two separate cases. If a
                 sequence number can be added to each packet, we show
                 how to speed up packet processing by letting the
                 receiver simulate the sender algorithm. If no header
                 can be added, we show how to provide quasi FIFO
                 delivery. Quasi FIFO is FIFO except during occasional
                 periods of loss of synchronization. We argue that quasi
                 FIFO is adequate for most applications. We also
                 describe a simple technique for speedy restoration of
                 synchronization in the event of loss. We develop an
                 architectural framework for transparently embedding our
                 protocol at the network level by striping IP packets
                 across multiple physical interfaces. The resulting
                 stripe protocol has been implemented within the NetBSD
                 kernel. Our measurements and simulations show that the
                 protocol offers scalable throughput even when striping
                 is done over dissimilar links, and that the protocol
                  resynchronizes quickly after packet loss. Measurements
                 show performance improvements over conventional
                 round-robin striping schemes and striping schemes that
                 do not resequence packets. Some aspects of our solution
                 have been implemented in Cisco's router operating
                 system (IOS 11.3) in the context of Multilink PPP
                 striping.",
  acknowledgement = ack-nhfb,
  generalterms = "Algorithms; Design; Measurement; Performance; Theory",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "causal fair queuing; fair queuing; load sharing;
                 multilink PPP; packet striping; stripe protocol;
                 striping",
  subject =      "Computer Systems Organization ---
                 Computer-Communication Networks --- Network Protocols
                 (C.2.2): {\bf Protocol architecture}",
}
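
%%% The fair-queuing-to-load-sharing transformation above can be
%%% sketched with a credit scheme: every channel earns credit in
%%% proportion to its capacity, each packet goes to the channel holding
%%% the most credit, and that channel is charged the packet length.
%%% This is the annotator's simplified rendering, not the stripe
%%% protocol itself.
%%%
%%%   def stripe(packet_lengths, capacities):
%%%       credits = [0.0] * len(capacities)
%%%       total = float(sum(capacities))
%%%       out = []
%%%       for length in packet_lengths:
%%%           for i, c in enumerate(capacities):      # earn credit
%%%               credits[i] += length * c / total
%%%           ch = max(range(len(capacities)), key=credits.__getitem__)
%%%           credits[ch] -= length                   # pay for the packet
%%%           out.append(ch)
%%%       return out
%%%
%%%   pkts = [1500, 40, 576, 1500, 40, 1500, 576, 40] * 50
%%%   assign = stripe(pkts, capacities=[10, 5, 5])
%%%   shares = [sum(l for l, ch in zip(pkts, assign) if ch == i)
%%%             for i in range(3)]
%%%   print(shares)      # byte counts close to the 2:1:1 capacity ratio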

@Article{McKinley:1999:QLN,
  author =       "Kathryn S. McKinley and Olivier Temam",
  title =        "Quantifying loop nest locality using {SPEC'95} and the
                 {Perfect} benchmarks",
  journal =      j-TOCS,
  volume =       "17",
  number =       "4",
  pages =        "288--336",
  month =        nov,
  year =         "1999",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Sep 26 07:54:31 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/1999-17-4/p288-mckinley/",
  abstract =     "This article analyzes and quantifies the locality
                 characteristics of numerical loop nests in order to
                 suggest future directions for architecture and software
                 cache optimizations. Since most programs spend the
                 majority of their time in nests, the vast majority of
                 cache optimization techniques target loop nests. In
                 contrast, the locality characteristics that drive these
                 optimizations are usually collected across the entire
                 application rather than at the nest level. Researchers
                 have studied numerical codes for so long that a number
                 of commonly held assertions have emerged on their
                 locality characteristics. In light of these assertions,
                 we use the SPEC'95 and Perfect Benchmarks to take a new
                 look at measuring locality on numerical codes based on
                 references, loop nests, and program locality
                 properties. Our results show that several popular
                 assertions are at best overstatements. For example,
                 although most reuse is within a loop nest, in line with
                 popular assertions, most misses are internest capacity
                 misses, and they correspond to potential reuse between
                 nearby loop nests. In addition, we find that temporal
                 and spatial reuse have balanced roles within a loop
                 nest and that most reuse across nests and the entire
                 program is temporal. These results are consistent with
                 high hit rates (80\% or more hits), but go against the
                 commonly held assumption that spatial reuse dominates.
                 Our locality measurements reveal important differences
                 between loop nests and programs, refute some popular
                 assertions, and provide new insights for the compiler
                 writer and the architect.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  subject =      "Computer Systems Organization --- Performance of
                 Systems (C.4): {\bf Performance attributes}; Computer
                 Systems Organization --- Performance of Systems (C.4):
                 {\bf Measurement techniques}",
}

@Article{Rinard:1999:EFG,
  author =       "Martin C. Rinard",
  title =        "Effective fine-grain synchronization for automatically
                 parallelized programs using optimistic synchronization
                 primitives",
  journal =      j-TOCS,
  volume =       "17",
  number =       "4",
  pages =        "337--371",
  month =        nov,
  year =         "1999",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Sep 26 07:54:31 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/1999-17-4/p337-rinard/",
  abstract =     "This article presents our experience using optimistic
                 synchronization to implement fine-grain atomic
                 operations in the context of a parallelizing compiler
                 for irregular, object-based computations. Our
                 experience shows that the synchronization requirements
                 of these programs differ significantly from those of
                 traditional parallel computations, which use loop nests
                 to access dense matrices using affine access functions.
                 In addition to coarse-grain barrier synchronization,
                 our irregular computations require synchronization
                 primitives that support efficient fine-grain atomic
                 operations. The standard implementation mechanism for
                 atomic operations uses mutual exclusion locks. But the
                 overhead of acquiring and releasing locks can reduce
                 the performance. Locks can also consume significant
                 amounts of memory. Optimistic synchronization
                 primitives such as {\em load-linked/store
                 conditional\/} are an attractive alternative. They
                 require no additional memory and eliminate the use of
                 heavyweight blocking synchronization constructs. We
                 evaluate the effectiveness of optimistic
                 synchronization by comparing experimental results from
                 two versions of a parallelizing compiler for irregular,
                  object-based computations. One version generates code
                  that uses mutual exclusion locks to make operations
                  execute atomically. The other version uses optimistic
                 synchronization. We used this compiler to automatically
                 parallelize three irregular, object-based benchmark
                 applications of interest to the scientific and
                 engineering computation community. The presented
                 experimental results indicate that the use of
                 optimistic synchronization in this context can
                 significantly reduce the memory consumption and improve
                 the overall performance.",
  acknowledgement = ack-nhfb,
  generalterms = "Algorithms; Experimentation; Performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "atomic operations commutativity analysis; optimistic
                 synchronization; parallel computing; parallelizing
                 compilers; synchronization",
  subject =      "Software --- Programming Languages --- Processors
                 (D.3.4): {\bf Compilers}",
}
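
%%% Optimistic synchronization as discussed above replaces lock/unlock
%%% around a small update with a read-compute-commit retry loop built
%%% on load-linked/store-conditional.  The compare-and-swap below is
%%% simulated with a lock purely so the sketch runs in plain Python; it
%%% is the annotator's illustration of the pattern, not the compiler's
%%% generated code.
%%%
%%%   import threading
%%%
%%%   class Cell:
%%%       def __init__(self, value=0):
%%%           self.value = value
%%%           self._guard = threading.Lock()    # simulation artifact only
%%%
%%%       def compare_and_swap(self, expected, new):
%%%           with self._guard:
%%%               if self.value == expected:
%%%                   self.value = new
%%%                   return True
%%%               return False
%%%
%%%   def optimistic_add(cell, delta):
%%%       while True:                    # read, compute, try to commit
%%%           old = cell.value
%%%           if cell.compare_and_swap(old, old + delta):
%%%               return
%%%
%%%   cell = Cell()
%%%   workers = [threading.Thread(
%%%       target=lambda: [optimistic_add(cell, 1) for _ in range(1000)])
%%%       for _ in range(4)]
%%%   for w in workers: w.start()
%%%   for w in workers: w.join()
%%%   print(cell.value)                  # 4000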

@Article{Keleher:2000:HLA,
  author =       "Peter J. Keleher",
  title =        "A high-level abstraction of shared accesses",
  journal =      j-TOCS,
  volume =       "18",
  number =       "1",
  pages =        "1--36",
  month =        feb,
  year =         "2000",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Sep 26 07:54:31 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/2000-18-1/p1-keleher/",
  abstract =     "We describe the design and use of the {\em tape\/}
                 mechanism, a new high-level abstraction of accesses to
                 shared data for software DSMs. Tapes consolidate and
                 generalize a number of recent protocol optimizations,
                 including update-based locks and recorded-replay
                 barriers. Tapes are usually created by ``recording''
                 shared accesses. The resulting recordings can be used
                 to anticipate future accesses by tailoring data
                 movement to application semantics. Tapes-based
                 mechanisms are layered on top of existing shared-memory
                 protocols, and are largely independent of the
                 underlying memory model. Tapes can also be used to
                 emulate the data-movement semantics of several
                 update-based protocol implementations, without altering
                 the underlying protocol implementation. We have used
                 tapes to create the Tapeworm synchronization library.
                 Tapeworm implements sophisticated record-replay
                 mechanisms across barriers, augments locks with
                 data-movement semantics, and allows the use of
                 producer-consumer segments, which move entire modified
                 segments when any portion of the segment is accessed.
                 We show that Tapeworm eliminates 85\% of remote misses,
                 reduces message traffic by 63\%, and improves
                 performance by an average of 29\% for our application
                 suite.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "DSM; programming libraries; shared memory; update
                 protocols",
  subject =      "Software --- Operating Systems --- Storage Management
                 (D.4.2); Software --- Operating Systems --- File
                 Systems Management (D.4.3); Software --- Operating
                 Systems --- File Systems Management (D.4.3): {\bf
                 Access methods}; Software --- Operating Systems ---
                 File Systems Management (D.4.3): {\bf Distributed file
                 systems}",
}

@Article{Pai:2000:ILU,
  author =       "Vivek S. Pai and Peter Druschel and Willy Zwaenepoel",
  title =        "{IO-Lite}: a unified {I/O} buffering and caching
                 system",
  journal =      j-TOCS,
  volume =       "18",
  number =       "1",
  pages =        "37--66",
  month =        feb,
  year =         "2000",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Sep 26 07:54:31 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/2000-18-1/p37-pai/",
  abstract =     "This article presents the design, implementation, and
                  evaluation of IO-Lite, a unified I/O buffering and
                 caching system for general-purpose operating systems.
                 IO-Lite unifies {\em all\/} buffering and caching in
                 the system, to the extent permitted by the hardware. In
                 particular, it allows applications, the interprocess
                 communication system, the file system, the file cache,
                 and the network subsystem to safely and concurrently
                 share a single physical copy of the data. Protection
                 and security are maintained through a combination of
                 access control and read-only sharing. IO-Lite
                 eliminates all copying and multiple buffering of I/O
                 data, and enables various cross-subsystem
                 optimizations. Experiments with a Web server show
                 performance improvements between 40 and 80\% on real
                 workloads as a result of IO-Lite.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "caching; I/O buffering; networking; zero-copy",
  subject =      "Software --- Operating Systems --- Communications
                 Management (D.4.4); Software --- Operating Systems ---
                 Performance (D.4.8)",
}

@Article{Schwartz:2000:SPA,
  author =       "Beverly Schwartz and Alden W. Jackson and W. Timothy
                 Strayer and Wenyi Zhou and R. Dennis Rockwell and Craig
                  Partridge",
  title =        "Smart packets: applying active networks to network
                 management",
  journal =      j-TOCS,
  volume =       "18",
  number =       "1",
  pages =        "67--88",
  month =        feb,
  year =         "2000",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Sep 26 07:54:31 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/2000-18-1/p67-schwartz/",
  abstract =     "This article introduces Smart Packets and describes
                  the Smart Packets architecture, the packet formats, the
                 language and its design goals, and security
                 considerations. Smart Packets is an Active Networks
                 project focusing on applying active networks technology
                 to network management and monitoring. Messages in
                 active networks are programs that are executed at nodes
                 on the path to one or more target hosts. Smart Packets
                 programs are written in a tightly encoded, safe
                 language specifically designed to support network
                 management and avoid dangerous constructs and accesses.
                 Smart Packets improves the management of large complex
                 networks by (1) moving management decision points
                 closer to the node being managed, (2) targeting
                 specific aspects of the node for information rather
                 than exhaustive collection via polling, and (3)
                 abstracting the management concepts to language
                 constructs, allowing nimble network control.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "active networks",
  subject =      "Computer Systems Organization ---
                 Computer-Communication Networks --- Network
                 Architecture and Design (C.2.1); Computer Systems
                 Organization --- Computer-Communication Networks ---
                 Network Operations (C.2.3); Software --- Programming
                 Languages --- Language Constructs and Features
                 (D.3.3)",
}

@Article{Brooks:2000:VBC,
  author =       "David Brooks and Margaret Martonosi",
  title =        "Value-based clock gating and operation packing:
                 dynamic strategies for improving processor power and
                 performance",
  journal =      j-TOCS,
  volume =       "18",
  number =       "2",
  pages =        "89--126",
  month =        may,
  year =         "2000",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Sep 26 07:54:31 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/2000-18-2/p89-brooks/",
  abstract =     "The large address space needs of many current
                 applications have pushed processor designs toward
                 64-bit word widths. Although full 64-bit addresses and
                 operations are indeed sometimes needed, arithmetic
                 operations on much smaller quantities are still more
                 common. In fact, another instruction set trend has been
                 the introduction of instructions geared toward subword
                  operations on 16-bit quantities. For example, most
                 major processors now include instruction set support
                 for multimedia operations allowing parallel execution
                 of several subword operations in the same ALU. This
                 article presents our observations demonstrating that
                 operations on ``narrow-width'' quantities are common
                 not only in multimedia codes, but also in more general
                 workloads. In fact, across the SPECint95 benchmarks,
                 over half the integer operation executions require 16
                 bits or less. Based on this data, we propose two
                 hardware mechanisms that dynamically recognize and
                 capitalize on these narrow-width operations. The first,
                  power-oriented optimization, reduces processor power
                 consumption by using operand-value-based clock gating
                 to turn off portions of arithmetic units that will be
                 unused by narrow-width operations. This optimization
                 results in a 45\%--60\% reduction in the integer unit's
                 power consumption for the SPECint95 and MediaBench
                 benchmark suites. Applying this optimization to
                 SPECfp95 benchmarks results in slightly smaller power
                 reductions, but still seems warranted. These reductions
                 in integer unit power consumption equate to a 5\%--10\%
                 full-chip power savings. Our second,
                  performance-oriented optimization, improves processor
                 performance by packing together narrow-width operations
                 so that they share a single arithmetic unit.
                 Conceptually similar to a dynamic form of MMX, this
                 optimization offers speedups of 4.3\%--6.2\% for
                 SPECint95 and 8.0\%--10.4\% for MediaBench.
                 \par

                 Overall, these optimizations highlight an increasing
                 opportunity for value-based optimizations to improve
                 both power and performance in current
                 microprocessors.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  subject =      "Hardware --- Arithmetic and Logic Structures (B.2);
                 Computer Systems Organization --- Processor
                 Architectures --- Single Data Stream Architectures
                 (C.1.1): {\bf RISC/CISC, VLIW architectures}",
}
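
%%% Illustrative model for the entry above, not the paper's hardware or
%%% simulator: a value counts as "narrow" here when it fits in the low 16
%%% bits, and when both operands of an add are narrow the upper portion of the
%%% adder could be clock-gated, or a second narrow operation packed beside it.
%%% The fixed 16-bit threshold and the zero-extension test are simplifying
%%% assumptions.
%%%
%%%   #include <stdint.h>
%%%   #include <stdio.h>
%%%
%%%   /* True when the 64-bit value is representable in 16 bits (zero-extended). */
%%%   static int is_narrow16(uint64_t x)
%%%   {
%%%       return (x >> 16) == 0;
%%%   }
%%%
%%%   int main(void)
%%%   {
%%%       uint64_t a[4] = { 7, 12, 0x12345678, 42 };
%%%       uint64_t b[4] = { 9, 100, 3, 0xFFFF };
%%%       int gated = 0;
%%%
%%%       for (int i = 0; i < 4; i++) {
%%%           if (is_narrow16(a[i]) && is_narrow16(b[i]))
%%%               gated++;               /* upper adder bits would be unused:  */
%%%                                      /* a candidate for gating or packing  */
%%%           (void)(a[i] + b[i]);       /* the add itself                     */
%%%       }
%%%       printf("%d of 4 adds were narrow-width\n", gated);
%%%       return 0;
%%%   }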

@Article{Ganger:2000:SUS,
  author =       "Gregory R. Ganger and Marshall Kirk McKusick and Craig
                 A. N. Soules and Yale N. Patt",
  title =        "Soft updates: a solution to the metadata update
                 problem in file systems",
  journal =      j-TOCS,
  volume =       "18",
  number =       "2",
  pages =        "127--153",
  month =        may,
  year =         "2000",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Sep 26 07:54:31 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib;
                 https://www.math.utah.edu/pub/tex/bib/unix.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/2000-18-2/p127-ganger/",
  abstract =     "Metadata updates, such as file creation and block
                 allocation, have consistently been identified as a
                 source of performance, integrity, security, and
                  availability problems for file systems. Soft updates
                 is an implementation technique for low-cost sequencing
                 of fine-grained updates to write-back cache blocks.
                 Using soft updates to track and enforce metadata update
                 dependencies, a file system can safely use delayed
                 writes for almost all file operations. This article
                 describes soft updates, their incorporation into the
                 4.4BSD fast file system, and the resulting effects on
                 the system. We show that a disk-based file system using
                 soft updates achieves memory-based file system
                 performance while providing stronger integrity and
                 security guarantees than most disk-based file systems.
                 For workloads that frequently perform updates on
                 metadata (such as creating and deleting files), this
                  improves performance by more than a factor of two and
                  up to a factor of 20 when compared to the conventional
                 synchronous write approach, and by 4--19\% when
                 compared to an aggressive write-ahead logging approach.
                 In addition, soft updates can improve file system
                 availability by relegating crash-recovery assistance
                 (e.g., the {\em fsck\/} utility) to an optional and
                 background role, reducing file system recovery time to
                 less than one second.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  subject =      "Computer Systems Organization --- Performance of
                 Systems (C.4): {\bf Design studies}; Computer Systems
                 Organization --- Performance of Systems (C.4): {\bf
                 Reliability, availability, and serviceability};
                 Computer Systems Organization --- Computer System
                 Implementation --- Servers (C.5.5); Software ---
                 Operating Systems --- Storage Management (D.4.2);
                 Software --- Operating Systems --- File Systems
                 Management (D.4.3); Data --- Files (E.5); Information
                 Systems --- Information Storage and Retrieval ---
                 Information Storage (H.3.2)",
}
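
%%% Simplified illustration for the entry above, not the 4.4BSD FFS code: each
%%% cached block records which blocks must reach disk before it, and the flush
%%% routine writes prerequisites first, so an initialized inode block reaches
%%% disk before the directory block that names it. Real soft updates track
%%% per-field dependencies and roll updates back and forward to break cycles,
%%% which this sketch omits entirely.
%%%
%%%   #include <stdio.h>
%%%
%%%   #define MAX_DEPS 4
%%%
%%%   struct block {
%%%       const char   *name;
%%%       int           dirty;
%%%       int           ndeps;
%%%       struct block *deps[MAX_DEPS];  /* must be on disk before this block */
%%%   };
%%%
%%%   static void flush(struct block *b)
%%%   {
%%%       if (!b->dirty)
%%%           return;
%%%       for (int i = 0; i < b->ndeps; i++)
%%%           flush(b->deps[i]);         /* enforce the metadata update order */
%%%       printf("writing %s\n", b->name);   /* stand-in for a disk write     */
%%%       b->dirty = 0;
%%%   }
%%%
%%%   int main(void)
%%%   {
%%%       struct block inode = { "inode block", 1, 0, { 0 } };
%%%       struct block dir   = { "directory block", 1, 1, { &inode } };
%%%       flush(&dir);                   /* writes the inode block first */
%%%       return 0;
%%%   }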

@Article{Yeung:2000:MSM,
  author =       "Donald Yeung and John Kubiatowicz and Anant Agarwal",
  title =        "Multigrain shared memory",
  journal =      j-TOCS,
  volume =       "18",
  number =       "2",
  pages =        "154--196",
  month =        may,
  year =         "2000",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Sep 26 07:54:31 MDT 2000",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/2000-18-2/p154-yeung/",
  abstract =     "Parallel workstations, each comprising tens of
                 processors based on shared memory, promise
                 cost-effective scalable multiprocessing. This article
                 explores the coupling of such small- to medium-scale
                 shared-memory multiprocessors through software over a
                 local area network to synthesize larger shared-memory
                 systems. We call these systems Distributed
                 Shared-memory MultiProcessors (DSMPs). This article
                 introduces the design of a shared-memory system that
                 uses multiple granularities of sharing, called MGS, and
                 presents a prototype implementation of MGS on the MIT
                 Alewife multiprocessor. Multigrain shared memory
                 enables the collaboration of hardware and software
                 shared memory, thus synthesizing a single transparent
                 shared-memory address space across a cluster of
                 multiprocessors. The system leverages the efficient
                 support for fine-grain cache-line sharing within
                 multiprocessor nodes as often as possible, and resorts
                 to coarse-grain page-level sharing across nodes only
                 when absolutely necessary. Using our prototype
                 implementation of MGS, an in-depth study of several
                  shared-memory applications is conducted to understand
                 the behavior of DSMPs. Our study is the first to
                 comprehensively explore the DSMP design space, and to
                 compare the performance of DSMPs against all-software
                 and all-hardware DSMs on a single experimental
                 platform. Keeping the total number of processors fixed,
                 we show that applications execute up to 85\% faster on
                 a DSMP as compared to an all-software DSM. We also show
                 that all-hardware DSMs hold a significant performance
                 advantage over DSMPs on challenging applications,
                 between 159\% and 1014\%. However, program
                 transformations to improve data locality for these
                 applications allow DSMPs to almost match the
                 performance of an all-hardware multiprocessor of the
                 same size.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  subject =      "Hardware --- Memory Structures --- Design Styles
                 (B.3.2): {\bf Shared memory}; Computer Systems
                 Organization --- Processor Architectures --- Multiple
                 Data Stream Architectures (Multiprocessors) (C.1.2)",
}

@Article{Aron:2000:STE,
  author =       "Mohit Aron and Peter Druschel",
  title =        "Soft timers: efficient microsecond software timer
                 support for network processing",
  journal =      j-TOCS,
  volume =       "18",
  number =       "3",
  pages =        "197--228",
  year =         "2000",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jul 18 10:18:45 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/articles/journals/tocs/2000-18-3/p197-aron/p197-aron.pdf;
                 http://www.acm.org/pubs/citations/journals/tocs/2000-18-3/p197-aron/",
  abstract =     "This paper proposes and evaluates soft timers, a new
                 operating system facility that allows the efficient
                 scheduling of software events at a granularity down to
                 tens of microseconds. Soft timers can be used to avoid
                 interrupts and reduce context switches associated with
                 network processing, without sacrificing low
                 communication delays. More specifically, soft timers
                 enable transport protocols like TCP to efficiently
                 perform rate-based clocking of packet transmissions.
                 Experiments indicate that soft timers allow a server to
                 employ rate-based clocking with little CPU overhead
                  (2--6\%) at high aggregate bandwidths. Soft timers can
                 also be used to perform network polling, which
                 eliminates network interrupts and increases the memory
                 access locality of the network subsystem without
                 sacrificing delay. Experiments show that this technique
                 can improve the throughput of a Web server by up to
                 25\%.",
  acknowledgement = ack-nhfb,
  generalterms = "Design; Performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "polling; timers; transmission scheduling",
  subject =      "Computer Systems Organization --- Computer System
                 Implementation --- Servers (C.5.5); Software ---
                 Operating Systems --- Process Management (D.4.1): {\bf
                 Scheduling}; Software --- Operating Systems ---
                 Communications Management (D.4.4): {\bf Network
                 communication}",
}
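
%%% Minimal sketch of the soft-timer idea for the entry above (invented names,
%%% not the kernel implementation): pending events sit in a list, and a cheap
%%% check runs from existing trigger states (system-call return, interrupt
%%% exit, the idle loop), so expired handlers fire without programming a
%%% dedicated hardware timer interrupt. The demo main() spins in place of
%%% those trigger points.
%%%
%%%   #include <stdint.h>
%%%   #include <stdio.h>
%%%   #include <time.h>
%%%
%%%   struct soft_timer {
%%%       uint64_t expires_ns;
%%%       void   (*handler)(void);
%%%       struct soft_timer *next;
%%%   };
%%%
%%%   static struct soft_timer *pending;     /* list of scheduled soft timers */
%%%
%%%   static uint64_t now_ns(void)
%%%   {
%%%       struct timespec ts;
%%%       clock_gettime(CLOCK_MONOTONIC, &ts);
%%%       return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
%%%   }
%%%
%%%   /* Called opportunistically at trigger states; when nothing has expired */
%%%   /* the cost is a compare, which keeps the facility cheap at high rates. */
%%%   static void soft_timer_check(void)
%%%   {
%%%       struct soft_timer **pp = &pending;
%%%       while (*pp) {
%%%           if ((*pp)->expires_ns <= now_ns()) {
%%%               struct soft_timer *t = *pp;
%%%               *pp = t->next;
%%%               t->handler();          /* e.g. transmit the next packet */
%%%           } else {
%%%               pp = &(*pp)->next;
%%%           }
%%%       }
%%%   }
%%%
%%%   static void fire(void) { puts("soft timer fired"); }
%%%
%%%   int main(void)
%%%   {
%%%       struct soft_timer t = { now_ns() + 50000, fire, NULL };  /* 50 us */
%%%       pending = &t;
%%%       while (pending)
%%%           soft_timer_check();        /* stand-in for trigger points */
%%%       return 0;
%%%   }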

@Article{Govil:2000:CDR,
  author =       "Kingshuk Govil and Dan Teodosiu and Yongqiang Huang
                 and Mendel Rosenblum",
  title =        "Cellular disco: resource management using virtual
                 clusters on shared-memory multiprocessors",
  journal =      j-TOCS,
  volume =       "18",
  number =       "3",
  pages =        "229--262",
  year =         "2000",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Nov 13 18:22:48 MST 2000",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/2000-18-3/p229-govil/",
  abstract =     "Despite the fact that large-scale shared-memory
                 multiprocessors have been commercially available for
                 several years, system software that fully utilizes all
                 their features is still not available, mostly due to
                 the complexity and cost of making the required changes
                 to the operating system. A recently proposed approach,
                 called Disco, substantially reduces this development
                 cost by using a virtual machine monitor that leverages
                 the existing operating system technology. In this paper
                 we present a system called Cellular Disco that extends
                 the Disco work to provide all the advantages of the
                 hardware partitioning and scalable operating system
                 approaches. We argue that Cellular Disco can achieve
                 these benefits at only a small fraction of the
                 development cost of modifying the operating system.
                 Cellular Disco effectively turns a large-scale
                 shared-memory multiprocessor into a virtual cluster
                 that supports fault containment and heterogeneity,
                 while avoiding operating system scalability
                 bottlenecks. Yet at the same time, Cellular Disco
                 preserves the benefits of a shared-memory
                 multiprocessor by implementing dynamic, fine-grained
                 resource sharing, and by allowing users to overcommit
                 resources such as processors and memory. This hybrid
                 approach requires a scalable resource manager that
                 makes local decisions with limited information while
                 still providing good global performance and fault
                 containment. In this paper we describe our experience
                 with a Cellular Disco prototype on a 32-processor SGI
                 Origin 2000 system. We show that the execution time
                 penalty for this approach is low, typically within 10\%
                 of the best available commercial operating system
                  for most workloads, and that it can manage the CPU and
                 memory resources of the machine significantly better
                 than the hardware partitioning approach.",
  acknowledgement = ack-nhfb,
  generalterms = "Design; Management; Performance; Reliability",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "fault containment; resource management; scalable
                 multiprocessors; virtual machines",
  subject =      "Software --- Operating Systems --- Process Management
                 (D.4.1); Software --- Operating Systems --- Storage
                 Management (D.4.2); Software --- Operating Systems ---
                 Reliability (D.4.5); Computer Systems Organization ---
                 Processor Architectures (C.1)",
}

@Article{Kohler:2000:CMR,
  author =       "Eddie Kohler and Robert Morris and Benjie Chen and
                 John Jannotti and M. Frans Kaashoek",
  title =        "The click modular router",
  journal =      j-TOCS,
  volume =       "18",
  number =       "3",
  pages =        "263--297",
  year =         "2000",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Nov 13 18:22:48 MST 2000",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/2000-18-3/p263-kohler/",
  abstract =     "Click is a new software architecture for building
                 flexible and configurable routers. A Click router is
                 assembled from packet processing modules called {\em
                 elements}. Individual elements implement simple router
                 functions like packet classification, queuing,
                 scheduling, and interfacing with network devices. A
                  router configuration is a directed graph with elements
                 at the vertices; packets flow along the edges of the
                 graph. Several features make individual elements more
                 powerful and complex configurations easier to write,
                 including {\em pull connections}, which model packet
                 flow driven by transmitting hardware devices, and {\em
                 flow-based router context}, which helps an element
                 locate other interesting elements. Click configurations
                 are modular and easy to extend. A standards-compliant
                 Click IP router has 16 elements on its forwarding path;
                 some of its elements are also useful in Ethernet
                 switches and IP tunnelling configurations. Extending
                 the IP router to support dropping policies, fairness
                 among flows, or Differentiated Services simply requires
                  adding a couple of elements at the right place. On
                 conventional PC hardware, the Click IP router achieves
                 a maximum loss-free forwarding rate of 333,000 64-byte
                 packets per second, demonstrating that Click's modular
                 and flexible architecture is compatible with good
                 performance.",
  acknowledgement = ack-nhfb,
  generalterms = "Design; Management; Performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "component systems; routers; software router
                 performance",
  subject =      "Computer Systems Organization ---
                 Computer-Communication Networks --- Network
                 Architecture and Design (C.2.1): {\bf Packet-switching
                 networks}; Computer Systems Organization ---
                 Computer-Communication Networks --- Internetworking
                 (C.2.6): {\bf Routers}; Software --- Software
                 Engineering --- Software Architectures (D.2.11): {\bf
                 Domain-specific architectures}",
}
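
%%% Toy illustration for the entry above: elements wired into a graph hand
%%% packets to their downstream neighbor through a push() call. Click itself
%%% is a C++ framework with its own configuration language; the element names
%%% and the length-based drop rule below are invented for illustration only.
%%%
%%%   #include <stdio.h>
%%%
%%%   struct packet { int len; };
%%%
%%%   struct element {
%%%       const char *name;
%%%       void      (*push)(struct element *self, struct packet *p);
%%%       struct element *out;           /* single output port */
%%%   };
%%%
%%%   static void counter_push(struct element *self, struct packet *p)
%%%   {
%%%       static int count;
%%%       printf("%s: packet %d, len %d\n", self->name, ++count, p->len);
%%%       if (self->out)
%%%           self->out->push(self->out, p);
%%%   }
%%%
%%%   static void drop_small_push(struct element *self, struct packet *p)
%%%   {
%%%       if (p->len < 64)
%%%           return;                    /* a trivial "dropping policy" */
%%%       if (self->out)
%%%           self->out->push(self->out, p);
%%%   }
%%%
%%%   int main(void)
%%%   {
%%%       struct element sink   = { "Counter",  counter_push,    NULL  };
%%%       struct element filter = { "CheckLen", drop_small_push, &sink };
%%%
%%%       struct packet a = { 40 }, b = { 512 };
%%%       filter.push(&filter, &a);      /* dropped by CheckLen */
%%%       filter.push(&filter, &b);      /* counted by the sink */
%%%       return 0;
%%%   }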

@Article{Saito:2000:MAP,
  author =       "Yasushi Saito and Brian N. Bershad and Henry M. Levy",
  title =        "Manageability, availability, and performance in
                 {Porcupine}: a highly scalable, cluster-based mail
                 service",
  journal =      j-TOCS,
  volume =       "18",
  number =       "3",
  pages =        "298--332",
  year =         "2000",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Nov 13 18:22:48 MST 2000",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/2000-18-3/p298-saito/",
  abstract =     "This paper describes the motivation, design and
                 performance of Porcupine, a scalable mail server. The
                 goal of Porcupine is to provide a highly available and
                 scalable electronic mail service using a large cluster
                 of commodity PCs. We designed Porcupine to be easy to
                 manage by emphasizing dynamic load balancing, automatic
                 configuration, and graceful degradation in the presence
                 of failures. Key to the system's manageability,
                 availability, and performance is that sessions, data,
                 and underlying services are distributed homogeneously
                 and dynamically across nodes in a cluster.",
  acknowledgement = ack-nhfb,
  generalterms = "Algorithms; Management; Performance; Reliability",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "cluster; distributed systems; email; group membership
                 protocol; load balancing; replication",
  subject =      "Computer Systems Organization ---
                 Computer-Communication Networks --- Distributed Systems
                 (C.2.4): {\bf Distributed applications}; Computer
                 Systems Organization --- Performance of Systems (C.4):
                 {\bf Reliability, availability, and serviceability};
                 Computer Systems Organization --- Computer System
                 Implementation --- Servers (C.5.5); Software ---
                 Operating Systems --- Reliability (D.4.5): {\bf
                 Fault-tolerance}; Information Systems --- Information
                 Storage and Retrieval --- Systems and Software (H.3.4):
                 {\bf Distributed systems}; Information Systems ---
                 Information Systems Applications --- Communications
                 Applications (H.4.3): {\bf Electronic mail}",
}

@Article{Gontmakher:2000:JCN,
  author =       "Alex Gontmakher and Assaf Schuster",
  title =        "{Java} consistency: nonoperational characterizations
                 for {Java} memory behavior",
  journal =      j-TOCS,
  volume =       "18",
  number =       "4",
  pages =        "333--386",
  year =         "2000",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jul 18 10:18:45 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/articles/journals/tocs/2000-18-4/p333-gontmakher/p333-gontmakher.pdf;
                 http://www.acm.org/pubs/citations/journals/tocs/2000-18-4/p333-gontmakher/",
  abstract =     "The Java Language Specification (JLS) [Gosling et al.
                 1996] provides an operational definition for the
                 consistency of shared variables. The definition remains
                  unchanged in the JLS 2nd edition, currently under peer
                  review. This definition, which relies on a specific
                  abstract machine as its underlying model, is very
                  complicated. Several
                 subsequent works have tried to simplify and formalize
                 it. However, these revised definitions are also
                 operational, and thus have failed to highlight the
                 intuition behind the original specification. In this
                 work we provide a complete nonoperational specification
                 for Java and for the JVM, excluding synchronized
                 operations. We provide a simpler definition, in which
                 we clearly distinguish the consistency model that is
                 promised to the programmer from that which should be
                 implemented in the JVM. This distinction, which was
                 implicit in the original definition, is crucial for
                 building the JVM. We find that the programmer model is
                 strictly weaker than that of the JVM, and precisely
                 define their discrepancy. Moreover, our definition is
                 independent of any specific (or even abstract) machine,
                 and can thus be used to verify JVM implementations and
                 compiler optimizations on any platform. Finally, we
                 show the precise range of consistency relaxations
                 obtainable for the Java memory model when a certain
                  compiler optimization, called {\em prescient stores\/}
                  in JLS, is applicable.",
  acknowledgement = ack-nhfb,
  generalterms = "Verification",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "Java memory models; multithreading; nonoperational
                 specification",
  subject =      "Hardware --- Memory Structures --- Performance
                 Analysis and Design Aids** (B.3.3): {\bf Formal
                 models**}",
}

@Article{Sarkar:2000:HBC,
  author =       "Prasenjit Sarkar and John H. Hartman",
  title =        "Hint-based cooperative caching",
  journal =      j-TOCS,
  volume =       "18",
  number =       "4",
  pages =        "387--419",
  year =         "2000",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jul 18 10:18:45 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/articles/journals/tocs/2000-18-4/p387-sarkar/p387-sarkar.pdf;
                 http://www.acm.org/pubs/citations/journals/tocs/2000-18-4/p387-sarkar/",
  abstract =     "This article presents the design, implementation, and
                 measurement of a hint-based cooperative caching file
                 system. Hints allow clients to make decisions based on
                 local state, enabling a loosely coordinated system that
                 is simple to implement. The resulting performance is
                 comparable to that of existing tightly coordinated
                 algorithms that use global state, but with less
                 overhead. Simulations show that the block access times
                 of our system are as good as those of the existing
                 algorithms, while reducing manager load by more than a
                  factor of seven, block lookup traffic by nearly
                  two-thirds, and replacement traffic by a factor
                  of five. To verify our simulation results in a real
                 system with real users, we implemented a prototype and
                 measured its performance for one week. Although the
                 simulation and prototype environments were very
                 different, the prototype system mirrored the simulation
                 results by exhibiting reduced overhead and high hint
                 accuracy. Furthermore, hint-based cooperative caching
                 reduced the average block access time to almost half
                 that of NFS.",
  acknowledgement = ack-nhfb,
  generalterms = "Algorithms; Design; Measurement; Performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "cooperative caching; hints",
  subject =      "Software --- Operating Systems --- File Systems
                 Management (D.4.3)",
}
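
%%% Hedged sketch of the lookup path for the entry above (table layout and
%%% names are invented, not the paper's): a client first consults a local hint
%%% table mapping a block to the peer believed to cache it, and because hints
%%% can be stale, a wrong guess falls back to the server and corrects the
%%% hint, which is why hint accuracy drives the achievable block access time.
%%%
%%%   #include <stdio.h>
%%%
%%%   #define NBLOCKS 16
%%%
%%%   static int hint[NBLOCKS];          /* block -> guessed peer, -1 = none   */
%%%   static int actual[NBLOCKS];        /* who really caches it (demo oracle) */
%%%
%%%   static const char *lookup(int block)
%%%   {
%%%       int peer = hint[block];
%%%       if (peer >= 0 && actual[block] == peer)
%%%           return "served from peer cache (hint correct)";
%%%       hint[block] = actual[block];   /* learn the corrected location */
%%%       return "fetched via server (hint missing or stale)";
%%%   }
%%%
%%%   int main(void)
%%%   {
%%%       for (int b = 0; b < NBLOCKS; b++) { hint[b] = -1; actual[b] = b % 3; }
%%%       hint[5] = 0;                   /* stale hint: block 5 lives on peer 2 */
%%%       printf("block 5: %s\n", lookup(5));
%%%       printf("block 5: %s\n", lookup(5));   /* second access hits the hint */
%%%       return 0;
%%%   }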

@Article{Bilas:2001:ASV,
  author =       "Angelos Bilas and Dongming Jiang and Jaswinder Pal
                 Singh",
  title =        "Accelerating shared virtual memory via general-purpose
                 network interface support",
  journal =      j-TOCS,
  volume =       "19",
  number =       "1",
  pages =        "1--35",
  year =         "2001",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jul 18 10:18:45 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/articles/journals/tocs/2001-19-1/p1-bilas/p1-bilas.pdf;
                 http://www.acm.org/pubs/citations/journals/tocs/2001-19-1/p1-bilas/",
  abstract =     "Clusters of symmetric multiprocessors (SMPs) are
                 important platforms for high-performance computing.
                 With the success of hardware cache-coherent distributed
                 shared memory (DSM), a lot of effort has also been made
                 to support the coherent shared-address-space
                 programming model in software on clusters. Much
                 research has been done in fast communication on
                 clusters and in protocols for supporting software
                 shared memory across them. However, the performance of
                  shared virtual memory (SVM) is still far from that
                 achieved on hardware DSM systems. The goal of this
                 paper is to improve the performance of SVM on system
                 area network clusters by considering communication and
                 protocol layer interactions. We first examine what are
                 the important communication system bottlenecks that
                 stand in the way of improving parallel performance of
                 SVM clusters; in particular, which parameters of the
                 communication architecture are most important to
                 improve further relative to processor speed, which ones
                 are already adequate on modern systems for most
                 applications, and how will this change with technology
                 in the future. We find that the most important
                 communication subsystem cost to improve is the overhead
                  of generating and delivering interrupts for asynchronous
                  protocol processing. Then we proceed to show that, by
                 providing simple and general support for asynchronous
                 message handling in a commodity network interface (NI)
                 and by altering SVM protocols appropriately, protocol
                 activity can be decoupled from asynchronous message
                 handling, and the need for interrupts or polling can be
                 eliminated. The NI mechanisms needed are generic, not
                 SVM-dependent. We prototype the mechanisms and such a
                 {\em synchronous home-based LRC\/} protocol, called
                 {\em GeNIMA\/} (GEneral-purpose Network Interface
                 support for shared Memory Abstractions), on a cluster
                 of SMPs with a programmable NI. We find that the
                 performance improvements are substantial, bringing
                 performance on a small-scale SMP cluster much closer to
                 that of hardware-coherent shared memory for many
                 applications, and we show the value of each of the
                 mechanisms in different applications.",
  acknowledgement = ack-nhfb,
  generalterms = "Design; Performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "applications; clusters; shared virtual memory; system
                 area networks",
  subject =      "Computer Systems Organization --- Performance of
                 Systems (C.4)",
}

@Article{Grimm:2001:SAC,
  author =       "Robert Grimm and Brian N. Bershad",
  title =        "Separating access control policy, enforcement, and
                 functionality in extensible systems",
  journal =      j-TOCS,
  volume =       "19",
  number =       "1",
  pages =        "36--70",
  year =         "2001",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jul 18 10:18:45 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/articles/journals/tocs/2001-19-1/p36-grimm/p36-grimm.pdf;
                 http://www.acm.org/pubs/citations/journals/tocs/2001-19-1/p36-grimm/",
  abstract =     "Extensible systems, such as Java or the SPIN
                 extensible operating system, allow for units of code,
                 or extensions, to be added to a running system in
                 almost arbitrary fashion. Extensions closely interact
                 through low-latency but type-safe interfaces to form a
                 tightly integrated system. As extensions can come from
                 arbitrary sources, not all of whom can be trusted to
                 conform to an organization's security policy, such
                 structuring raises the question of how security
                 constraints are enforced in an extensible system. In
                 this paper, we present an access control mechanism for
                 extensible systems to address this problem. Our access
                 control mechanism decomposes access control into a
                 policy-neutral enforcement manager and a security
                 policy manager, and it is transparent to extensions in
                 the absence of security violations. It structures the
                 system into protection domains, enforces protection
                 domains through access control checks, and performs
                 auditing of system operations. The access control
                 mechanism works by inspecting extensions for their
                 types and operations to determine which abstractions
                 require protection and by redirecting procedure or
                 method invocations to inject access control operations
                 into the system. We describe the design of this access
                 control mechanism, present an implementation within the
                  SPIN extensible operating system, and provide a
                 qualitative as well as quantitative evaluation of the
                 mechanism.",
  acknowledgement = ack-nhfb,
  generalterms = "Security",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "access check; auditing; extensible systems; Java;
                 policy-neutral enforcement; protection domain;
                 protection domain transfer; security policy; SPIN",
  subject =      "Software --- Operating Systems (D.4); Software ---
                 Operating Systems --- General (D.4.0); Software ---
                 Operating Systems --- Security and Protection (D.4.6):
                 {\bf Access controls}",
}
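
%%% Schematic only, for the entry above: the paper's mechanism inspects
%%% extension types and redirects invocations inside SPIN, none of which is
%%% shown here. The sketch illustrates just the split the paper argues for, a
%%% policy-neutral enforcement hook that interposes on an operation and
%%% audits, plus a separate, replaceable policy module that makes the
%%% decision. All names and the toy rule are assumptions.
%%%
%%%   #include <stdio.h>
%%%   #include <string.h>
%%%
%%%   struct subject { const char *domain; };   /* protection domain of caller */
%%%
%%%   /* Security policy manager: the only component that encodes policy. */
%%%   typedef int (*policy_fn)(const struct subject *s, const char *operation);
%%%
%%%   static int sample_policy(const struct subject *s, const char *operation)
%%%   {
%%%       /* toy rule: only the "fs" protection domain may perform "write" */
%%%       return strcmp(operation, "write") != 0 || strcmp(s->domain, "fs") == 0;
%%%   }
%%%
%%%   /* Policy-neutral enforcement manager: interposes on the operation,   */
%%%   /* consults the policy, audits denials, and is invisible otherwise.   */
%%%   static int enforce(policy_fn policy, const struct subject *s,
%%%                      const char *operation, void (*op)(void))
%%%   {
%%%       if (!policy(s, operation)) {
%%%           fprintf(stderr, "audit: domain %s denied %s\n", s->domain, operation);
%%%           return -1;
%%%       }
%%%       op();
%%%       return 0;
%%%   }
%%%
%%%   static void write_file(void) { puts("file written"); }
%%%
%%%   int main(void)
%%%   {
%%%       struct subject fs  = { "fs" };
%%%       struct subject net = { "net" };
%%%       enforce(sample_policy, &fs,  "write", write_file);  /* allowed         */
%%%       enforce(sample_policy, &net, "write", write_file);  /* denied, audited */
%%%       return 0;
%%%   }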

@Article{Luk:2001:ACS,
  author =       "Chi-Keung Luk and Todd C. Mowry",
  title =        "Architectural and compiler support for effective
                 instruction prefetching: a cooperative approach",
  journal =      j-TOCS,
  volume =       "19",
  number =       "1",
  pages =        "71--109",
  year =         "2001",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jul 18 10:18:45 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/articles/journals/tocs/2001-19-1/p71-luk/p71-luk.pdf;
                 http://www.acm.org/pubs/citations/journals/tocs/2001-19-1/p71-luk/",
  abstract =     "Instruction cache miss latency is becoming an
                 increasingly important performance bottleneck,
                 especially for commercial applications. Although
                 instruction prefetching is an attractive technique for
                 tolerating this latency, we find that existing
                 prefetching schemes are insufficient for modern
                 superscalar processors, since they fail to issue
                 prefetches early enough (particularly for nonsequential
                 accesses). To overcome these limitations, we propose a
                 new instruction prefetching technique whereby the
                 hardware and software {\em cooperate\/} to hide the
                 latency as follows. The hardware performs aggressive
                 sequential prefetching combined with a novel {\em
                 prefetch filtering\/} mechanism to allow it to get far
                 ahead without polluting the cache. To hide the latency
                 of nonsequential accesses, we propose and implement a
                 novel compiler algorithm which automatically inserts
                  {\em instruction-prefetch\/} instructions to prefetch the
                  targets of control transfers far enough in advance. Our
                 experimental results demonstrate that this new approach
                 hides 50\% or more of the latency remaining with the
                 best previous techniques, while at the same time
                  reducing the number of useless prefetches by a factor of
                 six. We find that both the {\em prefetch filtering\/}
                 and {\em compiler-inserted prefetching\/} components of
                 our design are essential and complementary, and that
                 the compiler can limit the code expansion to only 9\%
                 on average. In addition, we show that the performance
                 of our technique can be further increased by using
                 profiling information to help reduce cache conflicts
                 and unnecessary prefetches. From an architectural
                 perspective, these performance advantages are sustained
                 over a range of common miss latencies and bandwidth.
                 Finally, our technique is cost effective as well, since
                 it delivers performance comparable to (or even better
                 than) that of larger caches, but requires a much
                 smaller hardware budget.",
  acknowledgement = ack-nhfb,
  generalterms = "Design; Experimentation; Performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "compiler optimization; instruction prefetching",
  subject =      "Software --- Programming Languages --- Processors
                 (D.3.4): {\bf Compilers}; Software --- Programming
                 Languages --- Processors (D.3.4): {\bf Optimization};
                 Hardware --- Memory Structures --- Design Styles
                 (B.3.2): {\bf Cache memories}",
}

@Article{Brown:2001:CBP,
  author =       "Angela Demke Brown and Todd C. Mowry and Orran
                 Krieger",
  title =        "Compiler-based {I/O} prefetching for out-of-core
                 applications",
  journal =      j-TOCS,
  volume =       "19",
  number =       "2",
  pages =        "111--170",
  year =         "2001",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jul 18 10:18:45 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/articles/journals/tocs/2001-19-2/p111-brown/p111-brown.pdf;
                 http://www.acm.org/pubs/citations/journals/tocs/2001-19-2/p111-brown/",
  abstract =     "Current operating systems offer poor performance when
                 a numeric application's working set does not fit in
                 main memory. As a result, programmers who wish to solve
                 ``out-of-core'' problems efficiently are typically
                 faced with the onerous task of rewriting an application
                 to use explicit I/O operations (e.g., read/write). In
                 this paper, we propose and evaluate a fully automatic
                 technique which liberates the programmer from this
                 task, provides high performance, and requires only
                 minimal changes to current operating systems. In our
                 scheme the compiler provides the crucial information on
                 future access patterns without burdening the
                 programmer; the operating system supports nonbinding
                 {\em prefetch\/} and {\em release\/} hints for managing
                  I/O; and the operating system cooperates with a
                 run-time layer to accelerate performance by adapting to
                 dynamic behavior and minimizing prefetch overhead. This
                 approach maintains the abstraction of unlimited virtual
                 memory for the programmer, gives the compiler the
                 flexibility to aggressively insert prefetches ahead of
                 references, and gives the operating system the
                 flexibility to arbitrate between the competing resource
                 demands of multiple applications. We implemented our
                 compiler analysis within the SUIF compiler, and used it
                 to target implementations of our run-time and OS
                 support on both research and commercial systems
                 (Hurricane and IRIX 6.5, respectively). Our
                 experimental results show large performance gains for
                 out-of-core scientific applications on both systems:
                 more than 50\% of the I/O stall time has been eliminated
                 in most cases, thus translating into overall speedups
                 of roughly twofold in many cases.",
  acknowledgement = ack-nhfb,
  generalterms = "Design; Experimentation; Performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "compiler optimization; prefetching; virtual memory",
  subject =      "Software --- Operating Systems --- Storage Management
                 (D.4.2): {\bf Virtual memory}; Software --- Operating
                 Systems --- Performance (D.4.8); Software ---
                 Programming Languages --- Processors (D.3.4): {\bf
                 Compilers}; Software --- Programming Languages ---
                 Processors (D.3.4): {\bf Optimization}",
}
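
%%% Rough analogy for the entry above using a standard POSIX interface rather
%%% than the paper's Hurricane/IRIX hint calls: madvise(MADV_WILLNEED) stands
%%% in for a nonbinding prefetch hint on the region about to be touched, and
%%% madvise(MADV_DONTNEED) for a release hint on the region just finished;
%%% this is the shape of hint the paper's compiler inserts automatically. The
%%% file name and chunk size are invented.
%%%
%%%   #define _DEFAULT_SOURCE
%%%   #include <sys/mman.h>
%%%   #include <fcntl.h>
%%%   #include <unistd.h>
%%%   #include <stdio.h>
%%%
%%%   #define CHUNK ((off_t)1 << 20)     /* 1 MiB per pass, for illustration */
%%%
%%%   int main(void)
%%%   {
%%%       int fd = open("bigarray.dat", O_RDONLY);   /* hypothetical data file */
%%%       if (fd < 0) return 1;
%%%       off_t size = lseek(fd, 0, SEEK_END);
%%%       char *a = mmap(NULL, (size_t)size, PROT_READ, MAP_PRIVATE, fd, 0);
%%%       if (a == MAP_FAILED) return 1;
%%%
%%%       long sum = 0;
%%%       for (off_t off = 0; off < size; off += CHUNK) {
%%%           off_t n = (size - off < CHUNK) ? size - off : CHUNK;
%%%           if (off + CHUNK < size) {
%%%               off_t ahead = size - (off + CHUNK);
%%%               if (ahead > CHUNK) ahead = CHUNK;
%%%               /* nonbinding prefetch hint for the chunk touched next */
%%%               madvise(a + off + CHUNK, (size_t)ahead, MADV_WILLNEED);
%%%           }
%%%           for (off_t i = 0; i < n; i++)
%%%               sum += a[off + i];                 /* the compute pass */
%%%           /* nonbinding release hint: these pages are not needed soon */
%%%           madvise(a + off, (size_t)n, MADV_DONTNEED);
%%%       }
%%%       printf("sum = %ld\n", sum);
%%%       munmap(a, (size_t)size);
%%%       close(fd);
%%%       return 0;
%%%   }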

@Article{Fekete:2001:SUP,
  author =       "Alan Fekete and Nancy Lynch and Alex Shvartsman",
  title =        "Specifying and using a partitionable group
                 communication service",
  journal =      j-TOCS,
  volume =       "19",
  number =       "2",
  pages =        "171--216",
  year =         "2001",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jul 18 10:18:45 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/articles/journals/tocs/2001-19-2/p171-fekete/p171-fekete.pdf;
                 http://www.acm.org/pubs/citations/journals/tocs/2001-19-2/p171-fekete/",
  abstract =     "Group communication services are becoming accepted as
                 effective building blocks for the construction of
                 fault-tolerant distributed applications. Many
                 specifications for group communication services have
                 been proposed. However, there is still no agreement
                 about what these specifications should say, especially
                 in cases where the services are {\em partitionable},
                 i.e., where communication failures may lead to
                 simultaneous creation of groups with disjoint
                 memberships, such that each group is unaware of the
                 existence of any other group. In this paper, we present
                 a new, succinct specification for a view-oriented
                 partitionable group communication service. The service
                 associates each message with a particular {\em view\/}
                 of the group membership. All send and receive events
                 for a message occur within the associated view. The
                 service provides a total order on the messages within
                 each view, and each processor receives a prefix of this
                 order. Our specification separates safety requirements
                 from performance and fault-tolerance requirements. The
                 safety requirements are expressed by an abstract,
                 global {\em state machine}. To present the performance
                 and fault-tolerance requirements, we include {\em
                 failure-status input actions\/} in the specification;
                 we then give properties saying that consensus on the
                 view and timely message delivery are guaranteed in an
                 execution provided that the execution {\em
                 stabilizes\/} to a situation in which the
                  failure-status stops changing and corresponds to a
                  consistently partitioned system. Because consensus is not
                 required in every execution, the specification is not
                 subject to the existing impossibility results for
                  partitionable systems. Our specification has a simple
                 implementation, based on the membership algorithm of
                  Cristian and Schmuck. We show the utility of the
                 specification by constructing an ordered-broadcast
                 application, using an algorithm (based on algorithms of
                 Amir, Dolev, Keidar, and others) that reconciles
                 information derived from different instantiations of
                 the group. The application manages the view-change
                 activity to build a shared sequence of messages, i.e.,
                 the per-view total orders of the group service are
                 combined to give a universal total order. We prove the
                 correctness and analyze the performance and
                 fault-tolerance of the resulting application.",
  acknowledgement = ack-nhfb,
  generalterms = "Algorithms; Design; Performance; Verification",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "composable building blocks; conditional performance
                 analysis; distributed algorithms; group communication
                 protocols; message-passing protocols; ordered
                 broadcast; service specification; total-order
                 broadcast",
  subject =      "Computer Systems Organization ---
                 Computer-Communication Networks --- Distributed Systems
                 (C.2.4); Software --- Operating Systems --- Reliability
                 (D.4.5): {\bf Fault-tolerance}; Software --- Software
                 Engineering --- Software/Program Verification (D.2.4):
                 {\bf Correctness proofs}",
}
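
%%% Bare-bones data-model sketch for the entry above; nothing here reflects
%%% the paper's I/O-automaton specification or its proofs. It only illustrates
%%% the stated delivery guarantee: every message is tagged with the view in
%%% which it was sent, and a receiver delivers messages of its current view in
%%% sequence-number order, so it sees a prefix of that view's total order.
%%% The field names are invented.
%%%
%%%   #include <stdio.h>
%%%
%%%   struct message  { int view_id; int seqno; const char *payload; };
%%%   struct receiver { int view_id; int next_seqno; };
%%%
%%%   /* Deliver only messages sent in the receiver's current view, and only */
%%%   /* in sequence-number order, so delivery is a prefix of the per-view   */
%%%   /* total order.                                                        */
%%%   static int deliver(struct receiver *r, const struct message *m)
%%%   {
%%%       if (m->view_id != r->view_id)
%%%           return 0;                  /* sent in a different view       */
%%%       if (m->seqno != r->next_seqno)
%%%           return 0;                  /* would break prefix delivery    */
%%%       printf("deliver [view %d, #%d] %s\n", m->view_id, m->seqno, m->payload);
%%%       r->next_seqno++;
%%%       return 1;
%%%   }
%%%
%%%   int main(void)
%%%   {
%%%       struct receiver r = { 7, 0 };
%%%       struct message a = { 7, 0, "hello" };
%%%       struct message b = { 7, 2, "out of order" };
%%%       deliver(&r, &a);               /* delivered                      */
%%%       deliver(&r, &b);               /* held back: seqno 1 is missing  */
%%%       return 0;
%%%   }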

@Article{McNamee:2001:STT,
  author =       "Dylan McNamee and Jonathan Walpole and Calton Pu and
                 Crispin Cowan and Charles Krasic and Ashvin Goel and
                 Perry Wagle and Charles Consel and Gilles Muller and
                  Renaud Marlet",
  title =        "Specialization tools and techniques for systematic
                 optimization of system software",
  journal =      j-TOCS,
  volume =       "19",
  number =       "2",
  pages =        "217--251",
  year =         "2001",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jul 18 10:18:45 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/articles/journals/tocs/2001-19-2/p217-mcnamee/p217-mcnamee.pdf;
                 http://www.acm.org/pubs/citations/journals/tocs/2001-19-2/p217-mcnamee/",
  abstract =     "Specialization has been recognized as a powerful
                 technique for optimizing operating systems. However,
                 specialization has not been broadly applied beyond the
                 research community because current techniques based on
                 manual specialization, are time-consuming and
                 error-prone. The goal of the work described in this
                 paper is to help operating system tuners perform
                 specialization more easily. We have built a
                 specialization toolkit that assists the major tasks of
                 specializing operating systems. We demonstrate the
                 effectiveness of the toolkit by applying it to three
                 diverse operating system components. We show that using
                 tools to assist specialization enables significant
                 performance optimizations without error-prone manual
                 modifications. Our experience with the toolkit suggests
                 new ways of designing systems that combine high
                 performance and clean structure.",
  acknowledgement = ack-nhfb,
  generalterms = "Design; Performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "operating system specialization; optimization;
                 software architecture",
  subject =      "Software --- Operating Systems --- Organization and
                 Design (D.4.7)",
}

@Article{Mendelson:2001:ESC,
  author =       "Avi Mendelson and Freddy Gabbay",
  title =        "The effect of seance communication on multiprocessing
                 systems",
  journal =      j-TOCS,
  volume =       "19",
  number =       "2",
  pages =        "252--281",
  year =         "2001",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jul 18 10:18:45 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/articles/journals/tocs/2001-19-2/p252-mendelson/p252-mendelson.pdf;
                 http://www.acm.org/pubs/citations/journals/tocs/2001-19-2/p252-mendelson/",
  abstract =     "This paper introduces the seance communication
                 phenomenon and analyzes its effect on a multiprocessing
                 environment. Seance communication is an unnecessary
                 coherency-related activity that is associated with dead
                 cache information. Dead information may reside in the
                 cache for various reasons: task migration, context
                 switches, or working-set changes. Dead information does
                 not have a significant performance impact on a
                 single-processor system; however, it can dominate the
                  performance of a multicache environment. In order to
                 evaluate the overhead of seance communication, we
                 develop an analytical model that is based on the
                 fractal behavior of the memory references. So far, all
                 previous works that used the same modeling approach
                 extracted the fractal parameters of a program manually.
                 This paper provides an additional important
                 contribution by demonstrating how these parameters can
                 be automatically extracted from the program trace. Our
                  analysis indicates that seance communication may
                 severely reduce the overall system performance when
                 using write-update or write-invalidate cache coherency
                 protocols. In addition, we find that the performance of
                 write-update protocols is affected more severely than
                 write-invalidate protocols. The results that are
                 provided by our model are important for better
                 understanding of the coherency-related overhead in
                 multicache systems and for better development of
                 parallel applications and operating systems.",
  acknowledgement = ack-nhfb,
  generalterms = "Design; Experimentation; Measurement; Performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "cache coherency protocols; multicache systems;
                 performance analysis; seance communication",
  subject =      "Hardware --- Memory Structures (B.3); Hardware ---
                 Memory Structures --- Design Styles (B.3.2): {\bf Cache
                 memories}; Computer Systems Organization --- General
                 (C.0); Computer Systems Organization --- Processor
                 Architectures --- Multiple Data Stream Architectures
                 (Multiprocessors) (C.1.2): {\bf Interconnection
                 architectures}",
}

@Article{Arpaci-Dusseau:2001:ICC,
  author =       "Andrea Carol Arpaci-Dusseau",
  title =        "Implicit coscheduling: coordinated scheduling with
                 implicit information in distributed systems",
  journal =      j-TOCS,
  volume =       "19",
  number =       "3",
  pages =        "283--331",
  year =         "2001",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jul 18 10:18:45 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/2001-19-3/p283-arpaci-dusseau/",
  abstract =     "In modern distributed systems, coordinated
                 time-sharing is required for communicating processes to
                 leverage the performance of switch-based networks and
                 low-overhead protocols. Coordinated time-sharing has
                 traditionally been achieved with gang scheduling or
                 explicit coscheduling, implementations of which often
                 suffer from many deficiencies: multiple points of
                 failure, high context-switch overheads, and poor
                 interaction with client-server, interactive, and
                 I/O-intensive workloads. {\em Implicit coscheduling\/}
                 dynamically coordinates communicating processes across
                 distributed machines without these structural
                 deficiencies. In implicit coscheduling, no
                 communication is required across operating system
                 schedulers; instead, cooperating processes achieve
                 coordination by reacting to {\em implicit
                 information\/} carried by communication existing within
                 the parallel application. The implementation of this
                 approach is simple and allows participating nodes to
                 act autonomously. We introduce two key mechanisms in
                 implicit coscheduling. The first is {\em conditional
                 two-phase waiting}, a generalization of traditional
                 two-phase waiting in which spin-time may be increased
                 depending upon events occurring while the process
                 waits. The second is an extension to stride scheduling
                 that provides preemption and is fair to processes that
                 block. To demonstrate that implicit coscheduling
                 performs well, we show results from an extensive
                 set of simulation and implementation experiments. To
                 exercise the conditional two-phase waiting algorithm,
                 we examine three workloads: bulk-synchronous and
                 continuous-communication synthetic applications and
                 application kernels written in the Split-C language. To
                 exercise the local scheduler, we examine competing jobs
                 with different communication characteristics. We
                 demonstrate that our implementation scales well with
                 the number of jobs and workstations and is robust to
                 process placement. Our experiments show that implicit
                 coscheduling is effective and fair for a wide range of
                 workloads; most perform within 30\% of an idealized
                 model of gang scheduling.",
  acknowledgement = ack-nhfb,
  generalterms = "Algorithms; Design; Performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "clusters; coscheduling; gang scheduling; networks of
                 workstations; proportional-share scheduling; two-phase
                 waiting",
  subject =      "Software --- Operating Systems --- Process Management
                 (D.4.1): {\bf Scheduling}; Computer Systems
                 Organization --- Computer-Communication Networks ---
                 Distributed Systems (C.2.4): {\bf Network operating
                 systems}",
}

@Article{Carzaniga:2001:DEW,
  author =       "Antonio Carzaniga and David S. Rosenblum and Alexander
                 L. Wolf",
  title =        "Design and evaluation of a wide-area event
                 notification service",
  journal =      j-TOCS,
  volume =       "19",
  number =       "3",
  pages =        "332--383",
  year =         "2001",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jul 18 10:18:45 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/2001-19-3/p332-carzaniga/",
  abstract =     "The components of a loosely coupled system are
                 typically designed to operate by generating and
                 responding to asynchronous events. An {\em event
                 notification service\/} is an application-independent
                 infrastructure that supports the construction of
                 event-based systems, whereby generators of events
                 publish event notifications to the infrastructure and
                 consumers of events subscribe with the infrastructure
                 to receive relevant notifications. The two primary
                 services that should be provided to components by the
                 infrastructure are notification selection (i.e.,
                 determining which notifications match which
                 subscriptions) and notification delivery (i.e., routing
                 matching notifications from publishers to subscribers).
                 Numerous event notification services have been
                 developed for local-area networks, generally based on a
                 centralized server to select and deliver event
                 notifications. Therefore, they suffer from an inherent
                 inability to scale to wide-area networks, such as the
                 Internet, where the number and physical distribution of
                 the service's clients can quickly overwhelm a
                 centralized solution. The critical challenge in the
                 setting of a wide-area network is to maximize the
                 expressiveness in the selection mechanism without
                 sacrificing scalability in the delivery mechanism. This
                 paper presents Siena, an event notification service
                 that we have designed and implemented to exhibit both
                 expressiveness and scalability. We describe the
                 service's interface to applications, the algorithms
                 used by networks of servers to select and deliver event
                 notifications, and the strategies used to optimize
                 performance. We also present results of simulation
                 studies that examine the scalability and performance of
                 the service.",
  acknowledgement = ack-nhfb,
  generalterms = "Algorithms; Experimentation; Performance",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "content-based addressing and routing; event
                 notification; publish/subscribe",
  subject =      "Computer Systems Organization ---
                 Computer-Communication Networks --- Network
                 Architecture and Design (C.2.1): {\bf Distributed
                 networks}; Computer Systems Organization ---
                 Computer-Communication Networks --- Network Protocols
                 (C.2.2); Computer Systems Organization ---
                 Computer-Communication Networks --- Distributed Systems
                 (C.2.4): {\bf Distributed applications}; Computer
                 Systems Organization --- Computer-Communication
                 Networks --- Local and Wide-Area Networks (C.2.5): {\bf
                 Internet}; Computer Systems Organization ---
                 Computer-Communication Networks --- Internetworking
                 (C.2.6): {\bf Routers}; Computer Systems Organization
                 --- Performance of Systems (C.4): {\bf Design studies};
                 Computing Methodologies --- Simulation and Modeling ---
                 Applications (I.6.3); Computing Methodologies ---
                 Simulation and Modeling --- Model Validation and
                 Analysis (I.6.4); Computing Methodologies ---
                 Simulation and Modeling --- Types of Simulation
                 (I.6.8): {\bf Discrete event}",
}

@Article{Maxemchuk:2001:IMS,
  author =       "N. F. Maxemchuk and D. H. Shur",
  title =        "An {Internet} multicast system for the stock market",
  journal =      j-TOCS,
  volume =       "19",
  number =       "3",
  pages =        "384--412",
  year =         "2001",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jul 18 10:18:45 MDT 2001",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/tocs/2001-19-3/p384-maxemchuk/",
  abstract =     "We are moving toward an international, 24-hour,
                 distributed, electronic stock exchange. The exchange
                 will use the global internet, or Internet, technology.
                 This system is a natural application of multicast
                 because there are a large number of receivers that
                 should receive the same information simultaneously. The
                 data requirements for the stock exchange are discussed.
                 The current multicast protocols lack the reliability,
                 fairness, and scalability needed in this application.
                 We describe a distributed architecture and a timed
                 reliable multicast protocol, TRMP, that has the
                 appropriate characteristics. We consider three
                 applications: (1) A unified stock ticker of the
                 transactions that are being conducted on the various
                 physical and electronic exchanges. Our objective is to
                 deliver the same combined ticker reliably and
                 simultaneously to all receivers, anywhere in the world.
                 (2) A unified sequence of buy and sell offers that are
                 delivered to a single exchange or a collection of
                 exchanges. Our objective is to give all traders the
                 same fair access to an exchange independent of their
                 relative distances to the exchange or the delay and
                 loss characteristics of the international network. (3)
                 A distributed, electronic trading floor that can
                 replace the current exchanges. This application has the
                 fairness attributes of the first two applications and
                 uses TRMP to conduct irrefutable, distributed trades.",
  acknowledgement = ack-nhfb,
  generalterms = "Design; Performance; Theory",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "multicast",
  subject =      "Computer Systems Organization ---
                 Computer-Communication Networks --- Network
                 Architecture and Design (C.2.1); Computer Systems
                 Organization --- Computer-Communication Networks ---
                 Network Protocols (C.2.2); Computer Systems
                 Organization --- Computer-Communication Networks ---
                 Distributed Systems (C.2.4)",
}

@Article{Collins:2001:RIC,
  author =       "Jamison D. Collins and Dean M. Tullsen",
  title =        "Runtime identification of cache conflict misses: {The}
                 adaptive miss buffer",
  journal =      j-TOCS,
  volume =       "19",
  number =       "4",
  pages =        "413--439",
  month =        nov,
  year =         "2001",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Feb 19 15:24:55 MST 2002",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Waldvogel:2001:SHS,
  author =       "Marcel Waldvogel and George Varghese and Jon Turner
                 and Bernhard Plattner",
  title =        "Scalable high-speed prefix matching",
  journal =      j-TOCS,
  volume =       "19",
  number =       "4",
  pages =        "440--482",
  month =        nov,
  year =         "2001",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Feb 19 15:24:55 MST 2002",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Alvarez:2001:MAR,
  author =       "Guillermo A. Alvarez and Elizabeth Borowsky and Susie
                 Go and Theodore H. Romer and Ralph Becker-Szendy and
                 Richard Golding and Arif Merchant and Mirjana
                 Spasojevic and Alistair Veitch and John Wilkes",
  title =        "{Minerva}: An automated resource provisioning tool
                 for large-scale storage systems",
  journal =      j-TOCS,
  volume =       "19",
  number =       "4",
  pages =        "483--518",
  month =        nov,
  year =         "2001",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Feb 19 15:24:55 MST 2002",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Fu:2002:FSD,
  author =       "Kevin Fu and M. Frans Kaashoek and David
                 Mazi{\`e}res",
  title =        "Fast and secure distributed read-only file system",
  journal =      j-TOCS,
  volume =       "20",
  number =       "1",
  pages =        "1--24",
  month =        feb,
  year =         "2002",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:13:23 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Anderson:2002:IRR,
  author =       "Darrell C. Anderson and Jeffrey S. Chase and Amin M.
                 Vahdat",
  title =        "Interposed request routing for scalable network
                 storage",
  journal =      j-TOCS,
  volume =       "20",
  number =       "1",
  pages =        "25--48",
  month =        feb,
  year =         "2002",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:13:23 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Ganger:2002:FFA,
  author =       "Gregory R. Ganger and Dawson R. Engler and M. Frans
                 Kaashoek and H{\'e}ctor M. Brice{\~n}o and Russell Hunt
                 and Thomas Pinckney",
  title =        "Fast and flexible application-level networking on
                 exokernel systems",
  journal =      j-TOCS,
  volume =       "20",
  number =       "1",
  pages =        "49--83",
  month =        feb,
  year =         "2002",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:13:23 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Satyanarayanan:2002:EC,
  author =       "M. Satyanarayanan",
  title =        "The evolution of {Coda}",
  journal =      j-TOCS,
  volume =       "20",
  number =       "2",
  pages =        "85--124",
  month =        may,
  year =         "2002",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:13:24 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Burgess:2002:MSN,
  author =       "Mark Burgess and H{\aa}rek Haugerud and Sigmund
                 Straumsnes and Trond Reitan",
  title =        "Measuring system normality",
  journal =      j-TOCS,
  volume =       "20",
  number =       "2",
  pages =        "125--160",
  month =        may,
  year =         "2002",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:13:24 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Hu:2002:LCD,
  author =       "Zhigang Hu and Stefanos Kaxiras and Margaret
                 Martonosi",
  title =        "Let caches decay: reducing leakage energy via
                 exploitation of cache generational behavior",
  journal =      j-TOCS,
  volume =       "20",
  number =       "2",
  pages =        "161--190",
  month =        may,
  year =         "2002",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:13:24 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Keidar:2002:MGM,
  author =       "Idit Keidar and Jeremy Sussman and Keith Marzullo and
                 Danny Dolev",
  title =        "{Moshe}: a group membership service for {WANs}",
  journal =      j-TOCS,
  volume =       "20",
  number =       "3",
  pages =        "191--238",
  month =        aug,
  year =         "2002",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:17:48 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Yu:2002:DEC,
  author =       "Haifeng Yu and Amin Vahdat",
  title =        "Design and evaluation of a conit-based continuous
                 consistency model for replicated services",
  journal =      j-TOCS,
  volume =       "20",
  number =       "3",
  pages =        "239--282",
  month =        aug,
  year =         "2002",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:17:48 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "conit (consistency unit)",
}

@Article{Zdancewic:2002:SPP,
  author =       "Steve Zdancewic and Lantian Zheng and Nathaniel
                 Nystrom and Andrew C. Myers",
  title =        "Secure program partitioning",
  journal =      j-TOCS,
  volume =       "20",
  number =       "3",
  pages =        "283--328",
  month =        aug,
  year =         "2002",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:17:48 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Zhou:2002:CSD,
  author =       "Lidong Zhou and Fred B. Schneider and Robbert {Van
                 Renesse}",
  title =        "{COCA}: a secure distributed online certification
                 authority",
  journal =      j-TOCS,
  volume =       "20",
  number =       "4",
  pages =        "329--368",
  month =        nov,
  year =         "2002",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:13:24 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Jimenez:2002:NMD,
  author =       "Daniel A. Jim{\'e}nez and Calvin Lin",
  title =        "Neural methods for dynamic branch prediction",
  journal =      j-TOCS,
  volume =       "20",
  number =       "4",
  pages =        "369--397",
  month =        nov,
  year =         "2002",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:13:24 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Castro:2002:PBF,
  author =       "Miguel Castro and Barbara Liskov",
  title =        "Practical {Byzantine} fault tolerance and proactive
                 recovery",
  journal =      j-TOCS,
  volume =       "20",
  number =       "4",
  pages =        "398--461",
  month =        nov,
  year =         "2002",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:13:24 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Hu:2003:RTS,
  author =       "Y. Charlie Hu and Weimin Yu and Alan Cox and Dan
                 Wallach and Willy Zwaenepoel",
  title =        "Run-time support for distributed sharing in safe
                 languages",
  journal =      j-TOCS,
  volume =       "21",
  number =       "1",
  pages =        "1--35",
  month =        feb,
  year =         "2003",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:21:30 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Arpaci-Dusseau:2003:RTA,
  author =       "Remzi H. Arpaci-Dusseau",
  title =        "Run-time adaptation in {River}",
  journal =      j-TOCS,
  volume =       "21",
  number =       "1",
  pages =        "36--86",
  month =        feb,
  year =         "2003",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:21:30 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Nieh:2003:MTC,
  author =       "Jason Nieh and S. Jae Yang and Naomi Novik",
  title =        "Measuring thin-client performance using slow-motion
                 benchmarking",
  journal =      j-TOCS,
  volume =       "21",
  number =       "1",
  pages =        "87--115",
  month =        feb,
  year =         "2003",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:21:30 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Nieh:2003:SSM,
  author =       "Jason Nieh and Monica S. Lam",
  title =        "A {SMART} scheduler for multimedia applications",
  journal =      j-TOCS,
  volume =       "21",
  number =       "2",
  pages =        "117--163",
  month =        may,
  year =         "2003",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:13:25 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{VanRenesse:2003:ARS,
  author =       "Robbert {Van Renesse} and Kenneth P. Birman and Werner
                 Vogels",
  title =        "{Astrolabe}: a robust and scalable technology for
                 distributed system monitoring, management, and data
                 mining",
  journal =      j-TOCS,
  volume =       "21",
  number =       "2",
  pages =        "164--206",
  month =        may,
  year =         "2003",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:13:25 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Harchol-Balter:2003:SBS,
  author =       "Mor Harchol-Balter and Bianca Schroeder and Nikhil
                 Bansal and Mukesh Agrawal",
  title =        "Size-based scheduling to improve {Web} performance",
  journal =      j-TOCS,
  volume =       "21",
  number =       "2",
  pages =        "207--233",
  month =        may,
  year =         "2003",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:13:25 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Ellis:2003:E,
  author =       "Carla Schlatter Ellis",
  title =        "Editorial",
  journal =      j-TOCS,
  volume =       "21",
  number =       "3",
  pages =        "235--235",
  month =        aug,
  year =         "2003",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:13:26 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Castro:2003:BUA,
  author =       "Miguel Castro and Rodrigo Rodrigues and Barbara
                 Liskov",
  title =        "{BASE}: {Using} abstraction to improve fault
                 tolerance",
  journal =      j-TOCS,
  volume =       "21",
  number =       "3",
  pages =        "236--269",
  month =        aug,
  year =         "2003",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:13:26 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Estan:2003:NDT,
  author =       "Cristian Estan and George Varghese",
  title =        "New directions in traffic measurement and accounting:
                 {Focusing} on the elephants, ignoring the mice",
  journal =      j-TOCS,
  volume =       "21",
  number =       "3",
  pages =        "270--313",
  month =        aug,
  year =         "2003",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:13:26 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Swanson:2003:ESI,
  author =       "Steven Swanson and Luke K. McDowell and Michael M.
                 Swift and Susan J. Eggers and Henry M. Levy",
  title =        "An evaluation of speculative instruction execution on
                 simultaneous multithreaded processors",
  journal =      j-TOCS,
  volume =       "21",
  number =       "3",
  pages =        "314--340",
  month =        aug,
  year =         "2003",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:13:26 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Eugster:2003:LPB,
  author =       "P. Th. Eugster and R. Guerraoui and S. B. Handurukande
                 and P. Kouznetsov and A.-M. Kermarrec",
  title =        "Lightweight probabilistic broadcast",
  journal =      j-TOCS,
  volume =       "21",
  number =       "4",
  pages =        "341--374",
  month =        nov,
  year =         "2003",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Oct 31 06:17:27 MST 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Hadzic:2003:BPF,
  author =       "Ilija Had{\v{z}}i{\'c} and Jonathan M. Smith",
  title =        "Balancing performance and flexibility with hardware
                 support for network architectures",
  journal =      j-TOCS,
  volume =       "21",
  number =       "4",
  pages =        "375--411",
  month =        nov,
  year =         "2003",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Oct 31 06:17:27 MST 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Annavaram:2003:CGP,
  author =       "Murali Annavaram and Jignesh M. Patel and Edward S.
                 Davidson",
  title =        "Call graph prefetching for database applications",
  journal =      j-TOCS,
  volume =       "21",
  number =       "4",
  pages =        "412--444",
  month =        nov,
  year =         "2003",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Oct 31 06:17:27 MST 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Reumann:2004:SDI,
  author =       "John Reumann and Kang G. Shin",
  title =        "Stateful distributed interposition",
  journal =      j-TOCS,
  volume =       "22",
  number =       "1",
  pages =        "1--48",
  month =        feb,
  year =         "2004",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Feb 2 14:07:29 MST 2004",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Olshefski:2004:UCI,
  author =       "David Olshefski and Jason Nieh and Dakshi Agrawal",
  title =        "Using {Certes} to infer client response time at the
                 {Web} server",
  journal =      j-TOCS,
  volume =       "22",
  number =       "1",
  pages =        "49--93",
  month =        feb,
  year =         "2004",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Feb 2 14:07:29 MST 2004",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "Certes (CliEnt Response Time Estimated by the
                 Server)",
}

@Article{Adve:2004:PPP,
  author =       "Vikram S. Adve and Mary K. Vernon",
  title =        "Parallel program performance prediction using
                 deterministic task graph analysis",
  journal =      j-TOCS,
  volume =       "22",
  number =       "1",
  pages =        "94--136",
  month =        feb,
  year =         "2004",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Feb 2 14:07:29 MST 2004",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Flinn:2004:MBL,
  author =       "Jason Flinn and M. Satyanarayanan",
  title =        "Managing battery lifetime with energy-aware
                 adaptation",
  journal =      j-TOCS,
  volume =       "22",
  number =       "2",
  pages =        "137--179",
  month =        may,
  year =         "2004",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 4 08:16:45 MST 2004",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Ashok:2004:CCE,
  author =       "Raksit Ashok and Saurabh Chheda and Csaba Andras
                 Moritz",
  title =        "Coupling compiler-enabled and conventional memory
                 accessing for energy efficiency",
  journal =      j-TOCS,
  volume =       "22",
  number =       "2",
  pages =        "180--213",
  month =        may,
  year =         "2004",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 4 08:16:45 MST 2004",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Choi:2004:GFP,
  author =       "Seungryul Choi and Nicholas Kohout and Sumit Pamnani
                 and Dongkeun Kim and Donald Yeung",
  title =        "A general framework for prefetch scheduling in linked
                 data structures and its application to multi-chain
                 prefetching",
  journal =      j-TOCS,
  volume =       "22",
  number =       "2",
  pages =        "214--280",
  month =        may,
  year =         "2004",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 4 08:16:45 MST 2004",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Verstoep:2004:CCP,
  author =       "Kees Verstoep and Raoul A. F. Bhoedjang and Tim
                 R{\"u}hl and Henri E. Bal and Rutger F. H. Hofman",
  title =        "Cluster communication protocols for
                 parallel-programming systems",
  journal =      j-TOCS,
  volume =       "22",
  number =       "3",
  pages =        "281--325",
  month =        aug,
  year =         "2004",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 4 08:16:45 MST 2004",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Kim:2004:SSL,
  author =       "Dongkeun Kim and Donald Yeung",
  title =        "A study of source-level compiler algorithms for
                 automatic construction of pre-execution code",
  journal =      j-TOCS,
  volume =       "22",
  number =       "3",
  pages =        "326--379",
  month =        aug,
  year =         "2004",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 4 08:16:45 MST 2004",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Bartal:2004:FNF,
  author =       "Yair Bartal and Alain Mayer and Kobbi Nissim and
                 Avishai Wool",
  title =        "{{\em Firmato\/}}: a novel firewall management
                 toolkit",
  journal =      j-TOCS,
  volume =       "22",
  number =       "4",
  pages =        "381--420",
  month =        nov,
  year =         "2004",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Dec 2 05:29:12 MST 2004",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Grimm:2004:SSP,
  author =       "Robert Grimm and Janet Davis and Eric Lemar and Adam
                 Macbeth and Steven Swanson and Thomas Anderson and
                 Brian Bershad and Gaetano Borriello and Steven Gribble
                 and David Wetherall",
  title =        "System support for pervasive applications",
  journal =      j-TOCS,
  volume =       "22",
  number =       "4",
  pages =        "421--486",
  month =        nov,
  year =         "2004",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Dec 2 05:29:12 MST 2004",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Ellis:2005:E,
  author =       "Carla Schlatter Ellis",
  title =        "Editorial",
  journal =      j-TOCS,
  volume =       "23",
  number =       "1",
  pages =        "1--1",
  month =        feb,
  year =         "2005",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Apr 14 10:29:37 MDT 2005",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Maniatis:2005:LPP,
  author =       "Petros Maniatis and Mema Roussopoulos and T. J. Giuli
                 and David S. H. Rosenthal and Mary Baker",
  title =        "The {LOCKSS} peer-to-peer digital preservation
                 system",
  journal =      j-TOCS,
  volume =       "23",
  number =       "1",
  pages =        "2--50",
  month =        feb,
  year =         "2005",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Apr 14 10:29:37 MDT 2005",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{King:2005:BI,
  author =       "Samuel T. King and Peter M. Chen",
  title =        "Backtracking intrusions",
  journal =      j-TOCS,
  volume =       "23",
  number =       "1",
  pages =        "51--76",
  month =        feb,
  year =         "2005",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Apr 14 10:29:37 MDT 2005",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Swift:2005:IRC,
  author =       "Michael M. Swift and Brian N. Bershad and Henry M.
                 Levy",
  title =        "Improving the reliability of commodity operating
                 systems",
  journal =      j-TOCS,
  volume =       "23",
  number =       "1",
  pages =        "77--110",
  month =        feb,
  year =         "2005",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Apr 14 10:29:37 MDT 2005",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Gluhovsky:2005:CMC,
  author =       "Ilya Gluhovsky and Brian O'Krafka",
  title =        "Comprehensive multiprocessor cache miss rate
                 generation using multivariate models",
  journal =      j-TOCS,
  volume =       "23",
  number =       "2",
  pages =        "111--145",
  month =        may,
  year =         "2005",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon May 9 11:20:41 MDT 2005",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Herlihy:2005:NMM,
  author =       "Maurice Herlihy and Victor Luchangco and Paul Martin
                 and Mark Moir",
  title =        "Nonblocking memory management support for
                 dynamic-sized data structures",
  journal =      j-TOCS,
  volume =       "23",
  number =       "2",
  pages =        "146--196",
  month =        may,
  year =         "2005",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon May 9 11:20:41 MDT 2005",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Jimenez:2005:ILA,
  author =       "Daniel A. Jim{\'e}nez",
  title =        "Improved latency and accuracy for neural branch
                 prediction",
  journal =      j-TOCS,
  volume =       "23",
  number =       "2",
  pages =        "197--218",
  month =        may,
  year =         "2005",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon May 9 11:20:41 MDT 2005",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Jelasity:2005:GBA,
  author =       "M{\'a}rk Jelasity and Alberto Montresor and Ozalp
                 Babaoglu",
  title =        "{Gossip}-based aggregation in large dynamic networks",
  journal =      j-TOCS,
  volume =       "23",
  number =       "3",
  pages =        "219--252",
  month =        aug,
  year =         "2005",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Nov 18 08:19:50 MST 2005",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Steffan:2005:SAT,
  author =       "J. Gregory Steffan and Christopher Colohan and Antonia
                 Zhai and Todd C. Mowry",
  title =        "The {STAMPede} approach to thread-level speculation",
  journal =      j-TOCS,
  volume =       "23",
  number =       "3",
  pages =        "253--300",
  month =        aug,
  year =         "2005",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Nov 18 08:19:50 MST 2005",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Kontothanassis:2005:SMC,
  author =       "Leonidas Kontothanassis and Robert Stets and Galen
                 Hunt and Umit Rencuzogullari and Gautam Altekar and
                 Sandhya Dwarkadas and Michael L. Scott",
  title =        "Shared memory computing on clusters with symmetric
                 multiprocessors and system area networks",
  journal =      j-TOCS,
  volume =       "23",
  number =       "3",
  pages =        "301--335",
  month =        aug,
  year =         "2005",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Nov 18 08:19:50 MST 2005",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Anderson:2005:QFN,
  author =       "Eric Anderson and Susan Spence and Ram Swaminathan and
                 Mahesh Kallahalla and Qian Wang",
  title =        "Quickly finding near-optimal storage designs",
  journal =      j-TOCS,
  volume =       "23",
  number =       "4",
  pages =        "337--374",
  month =        nov,
  year =         "2005",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Feb 4 09:45:56 MST 2006",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Patino-Martinez:2005:MRC,
  author =       "Marta Pati{\~n}o-Martinez and Ricardo
                 Jim{\'e}nez-Peris and Bettina Kemme and Gustavo
                 Alonso",
  title =        "{MIDDLE-R}: {Consistent} database replication at the
                 middleware level",
  journal =      j-TOCS,
  volume =       "23",
  number =       "4",
  pages =        "375--423",
  month =        nov,
  year =         "2005",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Feb 4 09:45:56 MST 2006",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Hsu:2005:AIL,
  author =       "Windsor W. Hsu and Alan Jay Smith and Honesty C.
                 Young",
  title =        "The automatic improvement of locality in storage
                 systems",
  journal =      j-TOCS,
  volume =       "23",
  number =       "4",
  pages =        "424--473",
  month =        nov,
  year =         "2005",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Feb 4 09:45:56 MST 2006",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Keromytis:2006:COS,
  author =       "Angelos D. Keromytis and Jason L. Wright and Theo {De
                 Raadt} and Matthew Burnside",
  title =        "Cryptography as an operating system service: a case
                 study",
  journal =      j-TOCS,
  volume =       "24",
  number =       "1",
  pages =        "1--38",
  month =        feb,
  year =         "2006",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1124153.1124154",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Apr 7 08:15:08 MDT 2006",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Cryptographic transformations are a fundamental
                 building block in many security applications and
                 protocols. To improve performance, several vendors
                 market hardware accelerator cards. However, until now
                 no operating system provided a mechanism that allowed
                 both uniform and efficient use of this new type of
                 resource. We present the OpenBSD Cryptographic
                 Framework (OCF), a service virtualization layer
                 implemented inside the operating system kernel, that
                 provides uniform access to accelerator functionality by
                 hiding card-specific details behind a carefully
                 designed API. We evaluate the impact of the OCF in a
                 variety of benchmarks, measuring overall system
                 performance, application throughput and latency, and
                 aggregate throughput when multiple applications make
                 use of it. We conclude that the OCF is extremely
                 efficient in utilizing cryptographic accelerator
                 functionality, attaining 95\% of the theoretical peak
                 device performance and over 800 Mbps aggregate
                 throughput using 3DES. We believe that this validates
                 our decision to opt for ease of use by applications and
                 kernel components through a uniform API and for
                 seamless support for new accelerators. Furthermore, our
                 evaluation points to several bottlenecks in system and
                 operating system design: data copying between user and
                 kernel modes, PCI bus signaling inefficiency, protocols
                 that use small data units, and single-threaded
                 applications. We identify some of these limitations
                 through a set of measurements focusing on
                 application-layer cryptographic protocols such as SSL.
                 We offer several suggestions for improvements and
                 directions for future work. We provide experimental
                 evidence of the effectiveness of a new approach which
                 we call operating system shortcutting. Shortcutting can
                 improve the performance of application-layer
                 cryptographic protocols by 27\% with very small changes
                 to the kernel.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Coarfa:2006:PAT,
  author =       "Cristian Coarfa and Peter Druschel and Dan S.
                 Wallach",
  title =        "Performance analysis of {TLS Web} servers",
  journal =      j-TOCS,
  volume =       "24",
  number =       "1",
  pages =        "39--69",
  month =        feb,
  year =         "2006",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1124153.1124155",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Apr 7 08:15:08 MDT 2006",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Yu:2006:CLA,
  author =       "Haifeng Yu and Amin Vahdat",
  title =        "The costs and limits of availability for replicated
                 services",
  journal =      j-TOCS,
  volume =       "24",
  number =       "1",
  pages =        "70--113",
  month =        feb,
  year =         "2006",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1124153.1124156",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Apr 7 08:15:08 MDT 2006",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Moore:2006:IID,
  author =       "David Moore and Colleen Shannon and Douglas J. Brown
                 and Geoffrey M. Voelker and Stefan Savage",
  title =        "Inferring {Internet} denial-of-service activity",
  journal =      j-TOCS,
  volume =       "24",
  number =       "2",
  pages =        "115--139",
  month =        may,
  year =         "2006",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1132026.1132027",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu May 18 08:01:47 MDT 2006",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "In this article, we seek to address a simple question:
                 ``How prevalent are denial-of-service attacks in the
                 Internet?'' Our motivation is to quantitatively
                 understand the nature of the current threat as well as
                 to enable longer-term analyses of trends and recurring
                 patterns of attacks. We present a new technique, called
                 ``backscatter analysis,'' that provides a conservative
                 estimate of worldwide denial-of-service activity. We
                 use this approach on 22 traces (each covering a week or
                 more) gathered over three years from 2001 through 2004.
                 Across this corpus we quantitatively assess the number,
                 duration, and focus of attacks, and qualitatively
                 characterize their behavior. In total, we observed over
                 68,000 attacks directed at over 34,000 distinct victim
                 IP addresses---ranging from well-known e-commerce
                 companies such as Amazon and Hotmail to small foreign
                 ISPs and dial-up connections. We believe our technique
                 is the first to provide quantitative estimates of
                 Internet-wide denial-of-service activity and that this
                 article describes the most comprehensive public
                 measurements of such activity to date.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Holman:2006:LUP,
  author =       "Philip Holman and James H. Anderson",
  title =        "Locking under {Pfair} scheduling",
  journal =      j-TOCS,
  volume =       "24",
  number =       "2",
  pages =        "140--174",
  month =        may,
  year =         "2006",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1132026.1132028",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu May 18 08:01:47 MDT 2006",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "We present several locking synchronization protocols
                 for Pfair-scheduled multiprocessor systems. We focus on
                 two classes of protocols. The first class is only
                 applicable in systems in which all critical sections
                 are short relative to the length of the scheduling
                 quantum. In this case, efficient synchronization can be
                 achieved by ensuring that all locks have been released
                 before tasks are preempted. This is accomplished by
                 exploiting the quantum-based nature of Pfair
                 scheduling, which provides a priori knowledge of all
                 possible preemption points. The second and more general
                 protocol class is applicable to any system. For this
                 class, we consider the use of a client-server model. We
                 also discuss the viability of inheritance-based
                 protocols in Pfair-scheduled systems.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Lai:2006:PWA,
  author =       "Albert M. Lai and Jason Nieh",
  title =        "On the performance of wide-area thin-client
                 computing",
  journal =      j-TOCS,
  volume =       "24",
  number =       "2",
  pages =        "175--209",
  month =        may,
  year =         "2006",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1132026.1132029",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu May 18 08:01:47 MDT 2006",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "While many application service providers have proposed
                 using thin-client computing to deliver computational
                 services over the Internet, little work has been done
                 to evaluate the effectiveness of thin-client computing
                 in a wide-area network. To assess the potential of
                 thin-client computing in the context of future
                 commodity high-bandwidth Internet access, we have used
                 a novel, noninvasive slow-motion benchmarking technique
                 to evaluate the performance of several popular
                 thin-client computing platforms in delivering
                 computational services cross-country over Internet2.
                 Our results show that using thin-client computing in a
                 wide-area network environment can deliver acceptable
                 performance over Internet2, even when client and server
                 are located thousands of miles apart on opposite ends
                 of the country. However, performance varies widely
                 among thin-client platforms and not all platforms are
                 suitable for this environment. While many thin-client
                 systems are touted as being bandwidth efficient, we
                 show that network latency is often the key factor in
                 limiting wide-area thin-client performance.
                 Furthermore, we show that the same techniques used to
                 improve bandwidth efficiency often result in worse
                 overall performance in wide-area networks. We
                 characterize and analyze the different design choices
                 in the various thin-client platforms and explain which
                 of these choices should be selected for supporting
                 wide-area computing services.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Vachharajani:2006:LSE,
  author =       "Manish Vachharajani and Neil Vachharajani and David A.
                 Penry and Jason A. Blome and Sharad Malik and David I.
                 August",
  title =        "The {Liberty Simulation Environment}: a deliberate
                 approach to high-level system modeling",
  journal =      j-TOCS,
  volume =       "24",
  number =       "3",
  pages =        "211--249",
  month =        aug,
  year =         "2006",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Aug 29 05:29:09 MDT 2006",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Barr:2006:EAL,
  author =       "Kenneth C. Barr and Krste Asanovi{\'c}",
  title =        "Energy-aware lossless data compression",
  journal =      j-TOCS,
  volume =       "24",
  number =       "3",
  pages =        "250--291",
  month =        aug,
  year =         "2006",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Aug 29 05:29:09 MDT 2006",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Yuan:2006:EEC,
  author =       "Wanghong Yuan and Klara Nahrstedt",
  title =        "Energy-efficient {CPU} scheduling for multimedia
                 applications",
  journal =      j-TOCS,
  volume =       "24",
  number =       "3",
  pages =        "292--331",
  month =        aug,
  year =         "2006",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1151690.1151692",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Aug 29 05:29:09 MDT 2006",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Wireless transmission of a single bit can require over
                 1000 times more energy than a single computation. It
                 can therefore be beneficial to perform additional
                 computation to reduce the number of bits transmitted.
                 If the energy required to compress data is less than
                 the energy required to send it, there is a net energy
                 savings and an increase in battery life for portable
                 computers. This article presents a study of the energy
                 savings possible by losslessly compressing data prior
                 to transmission. A variety of algorithms were measured
                 on a StrongARM SA-110 processor. This work demonstrates
                 that, with several typical compression algorithms,
                  there is actually a net energy increase when
                 compression is applied before transmission. Reasons for
                 this increase are explained and suggestions are made to
                 avoid it. One such energy-aware suggestion is
                 asymmetric compression, the use of one compression
                 algorithm on the transmit side and a different
                 algorithm for the receive path. By choosing the
                 lowest-energy compressor and decompressor on the test
                 platform, overall energy to send and receive data can
                 be reduced by 11\% compared with a well-chosen
                 symmetric pair, or up to 57\% over the default
                 symmetric {\tt zlib} scheme.",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}
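
The abstract above turns on a simple break-even comparison: compressing before transmission saves energy only if the compression work costs less than the transmission it avoids. The sketch below, with entirely hypothetical per-bit energy figures, is a minimal illustration of that comparison, not a model of the measurements reported in the article.

    # Illustrative break-even check (hypothetical constants, not measured data).
    def net_energy_saving_nj(bits_raw, compression_ratio,
                             tx_energy_per_bit_nj, comp_energy_per_bit_nj):
        """Net energy saved by compressing before sending; negative means
        compression costs more energy than it saves on the radio."""
        bits_sent = bits_raw * compression_ratio
        tx_saving = (bits_raw - bits_sent) * tx_energy_per_bit_nj
        comp_cost = bits_raw * comp_energy_per_bit_nj
        return tx_saving - comp_cost

    if __name__ == "__main__":
        saving = net_energy_saving_nj(8_000_000, 0.6, 1.0, 0.3)
        print("compress before sending" if saving > 0 else "send uncompressed")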

@Article{Swift:2006:RDD,
  author =       "Michael M. Swift and Muthukaruppan Annamalai and Brian
                 N. Bershad and Henry M. Levy",
  title =        "Recovering device drivers",
  journal =      j-TOCS,
  volume =       "24",
  number =       "4",
  pages =        "333--360",
  month =        nov,
  year =         "2006",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 29 16:06:54 MST 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Nightingale:2006:SED,
  author =       "Edmund B. Nightingale and Peter M. Chen and Jason
                 Flinn",
  title =        "Speculative execution in a distributed file system",
  journal =      j-TOCS,
  volume =       "24",
  number =       "4",
  pages =        "361--392",
  month =        nov,
  year =         "2006",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 29 16:06:54 MST 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Yang:2006:UMC,
  author =       "Junfeng Yang and Paul Twohey and Dawson Engler and
                 Madanlal Musuvathi",
  title =        "Using model checking to find serious file system
                 errors",
  journal =      j-TOCS,
  volume =       "24",
  number =       "4",
  pages =        "393--423",
  month =        nov,
  year =         "2006",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 29 16:06:54 MST 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Higham:2007:SMC,
  author =       "Lisa Higham and Lillanne Jackson and Jalal Kawash",
  title =        "Specifying memory consistency of write buffer
                 multiprocessors",
  journal =      j-TOCS,
  volume =       "25",
  number =       "1",
  pages =        "1:1--1:??",
  month =        feb,
  year =         "2007",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 29 16:06:55 MST 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Gluhovsky:2007:CME,
  author =       "Ilya Gluhovsky and David Vengerov and Brian O'Krafka",
  title =        "Comprehensive multivariate extrapolation modeling of
                 multiprocessor cache miss rates",
  journal =      j-TOCS,
  volume =       "25",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2007",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 29 16:06:55 MST 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Behar:2007:TCS,
  author =       "Michael Behar and Avi Mendelson and Avinoam Kolodny",
  title =        "Trace cache sampling filter",
  journal =      j-TOCS,
  volume =       "25",
  number =       "1",
  pages =        "3:1--3:??",
  month =        feb,
  year =         "2007",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 29 16:06:55 MST 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Swanson:2007:WA,
  author =       "Steven Swanson and Andrew Schwerin and Martha Mercaldi
                 and Andrew Petersen and Andrew Putnam and Ken Michelson
                 and Mark Oskin and Susan J. Eggers",
  title =        "The {WaveScalar} architecture",
  journal =      j-TOCS,
  volume =       "25",
  number =       "2",
  pages =        "4:1--4:??",
  month =        may,
  year =         "2007",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 29 16:06:56 MST 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Fraser:2007:CPL,
  author =       "Keir Fraser and Tim Harris",
  title =        "Concurrent programming without locks",
  journal =      j-TOCS,
  volume =       "25",
  number =       "2",
  pages =        "5:1--5:??",
  month =        may,
  year =         "2007",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 29 16:06:56 MST 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Appavoo:2007:EDO,
  author =       "Jonathan Appavoo and Dilma {Da Silva} and Orran
                 Krieger and Marc Auslander and Michal Ostrowski and
                 Bryan Rosenburg and Amos Waterland and Robert W.
                 Wisniewski and Jimi Xenidis and Michael Stumm and Livio
                 Soares",
  title =        "Experience distributing objects in an {SMMP OS}",
  journal =      j-TOCS,
  volume =       "25",
  number =       "3",
  pages =        "6:1--6:??",
  month =        aug,
  year =         "2007",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 29 16:06:57 MST 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Qin:2007:RTB,
  author =       "Feng Qin and Joseph Tucek and Yuanyuan Zhou and
                 Jagadeesan Sundaresan",
  title =        "Rx: {Treating} bugs as allergies---a safe method to
                 survive software failures",
  journal =      j-TOCS,
  volume =       "25",
  number =       "3",
  pages =        "7:1--7:??",
  month =        aug,
  year =         "2007",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 29 16:06:57 MST 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Jelasity:2007:GBP,
  author =       "M{\'a}rk Jelasity and Spyros Voulgaris and Rachid
                 Guerraoui and Anne-Marie Kermarrec and Maarten van
                 Steen",
  title =        "Gossip-based peer sampling",
  journal =      j-TOCS,
  volume =       "25",
  number =       "3",
  pages =        "8:1--8:??",
  month =        aug,
  year =         "2007",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 29 16:06:57 MST 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Xu:2007:MEE,
  author =       "Ruibin Xu and Daniel Moss{\'e} and Rami Melhem",
  title =        "Minimizing expected energy consumption in real-time
                 systems through dynamic voltage scaling",
  journal =      j-TOCS,
  volume =       "25",
  number =       "4",
  pages =        "9:1--9:??",
  month =        dec,
  year =         "2007",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1314299.1314300",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jun 16 17:52:15 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Many real-time systems, such as battery-operated
                 embedded devices, are energy constrained. A common
                 problem for these systems is how to reduce energy
                 consumption in the system as much as possible while
                 still meeting the deadlines; a commonly used power
                 management mechanism by these systems is dynamic
                 voltage scaling (DVS). Usually, the workloads executed
                 by these systems are variable and, more often than not,
                 unpredictable. Because of the unpredictability of the
                 workloads, one cannot guarantee to minimize the energy
                 consumption in the system. However, if the variability
                 of the workloads can be captured by the probability
                 distribution of the computational requirement of each
                 task in the system, it is possible to achieve the goal
                 of minimizing the expected energy consumption in the
                 system. In this paper, we investigate DVS schemes that
                 aim at minimizing expected energy consumption for
                 frame-based hard real-time systems. Our investigation
                 considers various DVS strategies (i.e., intra-task DVS,
                 inter-task DVS, and hybrid DVS) and both an ideal
                 system model (i.e., assuming unrestricted continuous
                 frequency, well-defined power-frequency relation, and
                 no speed change overhead) and a realistic system model
                 (i.e., the processor provides a set of discrete speeds,
                 no assumption is made on power-frequency relation, and
                 speed change overhead is considered). The highlights of
                 the investigation are two practical DVS schemes:
                 Practical PACE (PPACE) for a single task and Practical
                 Inter-Task DVS (PITDVS2) for general frame-based
                 systems. Evaluation results show that our proposed
                 schemes outperform and achieve significant energy
                 savings over existing schemes.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "dynamic voltage scaling; power management; processor
                 acceleration to conserve energy; real-time",
}
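
As a much-simplified companion to the abstract above (it is not the article's PPACE or PITDVS2 scheme), the sketch below picks, from a discrete frequency set, the single speed that minimizes expected energy for one task whose cycle demand follows a known distribution, while the worst-case demand still meets the deadline. The power model and all numbers are hypothetical.

    # Simplified expected-energy frequency selection (illustrative only).
    def expected_energy_j(freq_ghz, cycle_pmf, power_w):
        """E[energy] = sum over outcomes of P(cycles) * power(f) * cycles / f."""
        return sum(p * power_w(freq_ghz) * cycles / (freq_ghz * 1e9)
                   for cycles, p in cycle_pmf.items())

    def best_frequency(freqs_ghz, cycle_pmf, deadline_s, power_w):
        wcec = max(cycle_pmf)                      # worst-case execution cycles
        feasible = [f for f in freqs_ghz if wcec / (f * 1e9) <= deadline_s]
        return min(feasible, key=lambda f: expected_energy_j(f, cycle_pmf, power_w))

    if __name__ == "__main__":
        pmf = {2_000_000: 0.7, 8_000_000: 0.2, 20_000_000: 0.1}  # cycles -> prob
        power = lambda f_ghz: 0.2 + 1.5 * f_ghz ** 3             # watts, hypothetical
        print(best_frequency([0.3, 0.6, 1.0], pmf, deadline_s=0.04, power_w=power))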

@Article{Hur:2007:MSM,
  author =       "Ibrahim Hur and Calvin Lin",
  title =        "Memory scheduling for modern microprocessors",
  journal =      j-TOCS,
  volume =       "25",
  number =       "4",
  pages =        "10:1--10:??",
  month =        dec,
  year =         "2007",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1314299.1314301",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jun 16 17:52:15 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "The need to carefully schedule memory operations has
                 increased as memory performance has become increasingly
                 important to overall system performance. This article
                 describes the adaptive history-based (AHB) scheduler,
                 which uses the history of recently scheduled operations
                 to provide three conceptual benefits: (1) it allows the
                 scheduler to better reason about the delays associated
                 with its scheduling decisions, (2) it provides a
                 mechanism for combining multiple constraints, which is
                 important for increasingly complex DRAM structures, and
                 (3) it allows the scheduler to select operations so
                 that they match the program's mixture of Reads and
                 Writes, thereby avoiding certain bottlenecks within the
                 memory controller.\par

                 We have previously evaluated this scheduler in the
                 context of the IBM Power5. When compared with the state
                 of the art, this scheduler improves performance by
                  15.6\%, 9.9\%, and 7.6\% for the Stream, NAS, and
                 commercial benchmarks, respectively. This article
                 expands our understanding of the AHB scheduler in a
                 variety of ways. Looking backwards, we describe the
                 scheduler in the context of prior work that focused
                 exclusively on avoiding bank conflicts, and we show
                 that the AHB scheduler is superior for the IBM Power5,
                 which we argue will be representative of future
                 microprocessor memory controllers. Looking forwards, we
                 evaluate this scheduler in the context of future
                 systems by varying a number of microarchitectural
                 features and hardware parameters. For example, we show
                 that the benefit of this scheduler increases as we move
                 to multithreaded environments.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "adaptive history-based scheduling; memory scheduling;
                 memory system performance",
}

@Article{Vandebogart:2007:LEP,
  author =       "Steve Vandebogart and Petros Efstathopoulos and Eddie
                 Kohler and Maxwell Krohn and Cliff Frey and David
                 Ziegler and Frans Kaashoek and Robert Morris and David
                 Mazi{\`e}res",
  title =        "Labels and event processes in the {Asbestos} operating
                 system",
  journal =      j-TOCS,
  volume =       "25",
  number =       "4",
  pages =        "11:1--11:??",
  month =        dec,
  year =         "2007",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1314299.1314302",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jun 16 17:52:15 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Asbestos, a new operating system, provides novel
                 labeling and isolation mechanisms that help contain the
                 effects of exploitable software flaws. Applications can
                 express a wide range of policies with Asbestos's
                 kernel-enforced labels, including controls on
                 interprocess communication and system-wide information
                 flow. A new event process abstraction defines
                 lightweight, isolated contexts within a single process,
                 allowing one process to act on behalf of multiple users
                 while preventing it from leaking any single user's data
                 to others. A Web server demonstration application uses
                 these primitives to isolate private user data. Since
                 the untrusted workers that respond to client requests
                 are constrained by labels, exploited workers cannot
                 directly expose user data except as allowed by
                 application policy. The server application requires 1.4
                 memory pages per user for up to 145,000 users and
                 achieves connection rates similar to Apache,
                 demonstrating that additional security can come at an
                 acceptable cost.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "information flow; labels; mandatory access control;
                 process abstractions; secure Web servers",
}

@Article{Coulson:2008:GCM,
  author =       "Geoff Coulson and Gordon Blair and Paul Grace and
                 Fran{\c{c}}ois Taiani and Ackbar Joolia and Kevin Lee
                 and Jo Ueyama and Thirunavukkarasu Sivaharan",
  title =        "A generic component model for building systems
                 software",
  journal =      j-TOCS,
  volume =       "26",
  number =       "1",
  pages =        "1:1--1:??",
  month =        feb,
  year =         "2008",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1328671.1328672",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jun 16 17:52:22 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Component-based software structuring principles are
                 now commonplace at the application level; but
                 componentization is far less established when it comes
                 to building low-level systems software. Although there
                 have been pioneering efforts in applying
                 componentization to systems-building, these efforts
                 have tended to target specific application domains
                 (e.g., embedded systems, operating systems,
                 communications systems, programmable networking
                 environments, or middleware platforms). They also tend
                 to be targeted at specific deployment environments
                 (e.g., standard personal computer (PC) environments,
                 network processors, or microcontrollers). The
                 disadvantage of this narrow targeting is that it fails
                 to maximize the genericity and abstraction potential of
                 the component approach. In this article, we argue for
                 the benefits and feasibility of a generic yet
                 tailorable approach to component-based systems-building
                 that offers a uniform programming model that is
                 applicable in a wide range of systems-oriented target
                 domains and deployment environments. The component
                 model, called OpenCom, is supported by a reflective
                 runtime architecture that is itself built from
                 components. After describing OpenCom and evaluating its
                 performance and overhead characteristics, we present
                 and evaluate two case studies of systems we have built
                 using OpenCom technology, thus illustrating its
                 benefits and its general applicability.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "component-based software; computer systems
                 implementation",
}

@Article{Colohan:2008:IPD,
  author =       "Christopher B. Colohan and Anastassia Ailamaki and J.
                 Gregory Steffan and Todd C. Mowry",
  title =        "Incrementally parallelizing database transactions with
                 thread-level speculation",
  journal =      j-TOCS,
  volume =       "26",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2008",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1328671.1328673",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jun 16 17:52:22 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "With the advent of chip multiprocessors, exploiting
                 intratransaction parallelism in database systems is an
                 attractive way of improving transaction performance.
                 However, exploiting intratransaction parallelism is
                 difficult for two reasons: first, significant changes
                 are required to avoid races or conflicts within the
                 DBMS; and second, adding threads to transactions
                 requires a high level of sophistication from
                 transaction programmers. In this article we show how
                 dividing a transaction into speculative threads solves
                 both problems --- it minimizes the changes required to
                 the DBMS, and the details of parallelization are hidden
                 from the transaction programmer. Our technique requires
                 a limited number of small, localized changes to a
                 subset of the low-level data structures in the DBMS.
                 Through this method of incrementally parallelizing
                 transactions, we can dramatically improve performance:
                 on a simulated four-processor chip-multiprocessor, we
                  improve the response time by 44--66\% for three of the
                 five TPC-C transactions, assuming the availability of
                 idle processors.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "chip-multiprocessing; incremental parallelization;
                 optimistic concurrency; thread-level speculation",
}

@Article{Kostic:2008:HBD,
  author =       "Dejan Kosti{\'c} and Alex C. Snoeren and Amin Vahdat
                 and Ryan Braud and Charles Killian and James W.
                 Anderson and Jeannie Albrecht and Adolfo Rodriguez and
                 Erik Vandekieft",
  title =        "High-bandwidth data dissemination for large-scale
                 distributed systems",
  journal =      j-TOCS,
  volume =       "26",
  number =       "1",
  pages =        "3:1--3:??",
  month =        feb,
  year =         "2008",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1328671.1328674",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jun 16 17:52:22 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "This article focuses on the multireceiver data
                 dissemination problem. Initially, IP multicast formed
                 the basis for efficiently supporting such distribution.
                 More recently, overlay networks have emerged to support
                 point-to-multipoint communication. Both techniques
                 focus on constructing trees rooted at the source to
                 distribute content among all interested receivers. We
                 argue, however, that trees have two fundamental
                 limitations for data dissemination. First, since all
                 data comes from a single parent, participants must
                 often continuously probe in search of a parent with an
                 acceptable level of bandwidth. Second, due to packet
                 losses and failures, available bandwidth is
                 monotonically decreasing down the tree.\par

                 To address these limitations, we present Bullet, a data
                 dissemination mesh that takes advantage of the
                 computational and storage capabilities of end hosts to
                 create a distribution structure where a node receives
                 data in parallel from multiple peers. For the mesh to
                 deliver improved bandwidth and reliability, we need to
                 solve several key problems: (i) disseminating disjoint
                 data over the mesh, (ii) locating missing content,
                 (iii) finding who to peer with (peering strategy), (iv)
                 retrieving data at the right rate from all peers (flow
                 control), and (v) recovering from failures and adapting
                 to dynamically changing network conditions.
                 Additionally, the system should be self-adjusting and
                 should have few user-adjustable parameter settings. We
                 describe our approach to addressing all of these
                 problems in a working, deployed system across the
                 Internet. Bullet outperforms state-of-the-art systems,
                  including BitTorrent, by 25--70\% and exhibits strong
                 performance and reliability in a range of deployment
                 settings. In addition, we find that, relative to
                 tree-based solutions, Bullet reduces the need to
                 perform expensive bandwidth probing.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "bandwidth; overlays; peer-to-peer",
}

@Article{Chang:2008:BDS,
  author =       "Fay Chang and Jeffrey Dean and Sanjay Ghemawat and
                 Wilson C. Hsieh and Deborah A. Wallach and Mike Burrows
                 and Tushar Chandra and Andrew Fikes and Robert E.
                 Gruber",
  title =        "{Bigtable}: a distributed storage system for
                 structured data",
  journal =      j-TOCS,
  volume =       "26",
  number =       "2",
  pages =        "4:1--4:??",
  month =        jun,
  year =         "2008",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1365815.1365816",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jun 16 17:52:30 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Bigtable is a distributed storage system for managing
                 structured data that is designed to scale to a very
                 large size: petabytes of data across thousands of
                 commodity servers. Many projects at Google store data
                 in Bigtable, including web indexing, Google Earth, and
                 Google Finance. These applications place very different
                 demands on Bigtable, both in terms of data size (from
                 URLs to web pages to satellite imagery) and latency
                 requirements (from backend bulk processing to real-time
                 data serving). Despite these varied demands, Bigtable
                 has successfully provided a flexible, high-performance
                 solution for all of these Google products. In this
                 article, we describe the simple data model provided by
                 Bigtable, which gives clients dynamic control over data
                 layout and format, and we describe the design and
                 implementation of Bigtable.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "large-scale distributed storage",
}
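
For readers unfamiliar with the data model the abstract mentions, the paper describes Bigtable as a sparse, distributed, persistent multidimensional sorted map indexed by row key, column, and timestamp. The toy class below is only a shape-of-the-interface sketch of that addressing scheme (single-process and unsorted, nothing like the real system); the row and column names are made up.

    # Toy (row, column, timestamp) -> value map; illustrative only.
    class TinyTable:
        def __init__(self):
            self.cells = {}                 # (row, column) -> {timestamp: value}

        def put(self, row, column, value, timestamp):
            self.cells.setdefault((row, column), {})[timestamp] = value

        def get(self, row, column):
            versions = self.cells.get((row, column), {})
            return versions[max(versions)] if versions else None  # newest version

    if __name__ == "__main__":
        t = TinyTable()
        t.put("com.example.www", "contents:html", "<html>v1</html>", timestamp=1)
        t.put("com.example.www", "contents:html", "<html>v2</html>", timestamp=2)
        print(t.get("com.example.www", "contents:html"))   # -> <html>v2</html>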

@Article{Bar-Yossef:2008:RRW,
  author =       "Ziv Bar-Yossef and Roy Friedman and Gabriel Kliot",
  title =        "{RaWMS} --- {Random Walk Based Lightweight Membership
                 Service} for Wireless Ad Hoc Networks",
  journal =      j-TOCS,
  volume =       "26",
  number =       "2",
  pages =        "5:1--5:??",
  month =        jun,
  year =         "2008",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1365815.1365817",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jun 16 17:52:30 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "This article presents RaWMS, a novel lightweight
                 random membership service for ad hoc networks. The
                 service provides each node with a partial uniformly
                 chosen view of network nodes. Such a membership service
                 is useful, for example, in data dissemination
                 algorithms, lookup and discovery services, peer
                 sampling services, and complete membership
                 construction. The design of RaWMS is based on a novel
                 reverse random walk (RW) sampling technique. The
                 article includes a formal analysis of both the reverse
                 RW sampling technique and RaWMS and verifies it through
                 a detailed simulation study. In addition, RaWMS is
                 compared both analytically and by simulations with a
                 number of other known methods such as flooding and
                 gossip-based techniques.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "ad hoc networks; membership service; random walk",
}
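
The abstract above builds on random-walk sampling of network nodes. The fragment below shows only the generic forward random walk over an adjacency list (RaWMS itself is built on a reverse random-walk technique analyzed in the article); the graph and walk length are hypothetical.

    # Generic random-walk node sampling (illustration, not the RaWMS protocol).
    import random

    def random_walk_sample(adjacency, start, steps, rng=random):
        """Return the node reached after `steps` uniform neighbor hops."""
        node = start
        for _ in range(steps):
            node = rng.choice(adjacency[node])
        return node

    if __name__ == "__main__":
        graph = {"a": ["b", "c"], "b": ["a", "c"], "c": ["a", "b", "d"], "d": ["c"]}
        random.seed(0)
        print(random_walk_sample(graph, "a", steps=10))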

@Article{Nightingale:2008:RS,
  author =       "Edmund B. Nightingale and Kaushik Veeraraghavan and
                 Peter M. Chen and Jason Flinn",
  title =        "Rethink the sync",
  journal =      j-TOCS,
  volume =       "26",
  number =       "3",
  pages =        "6:1--6:26",
  month =        sep,
  year =         "2008",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1394441.1394442",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Sep 17 14:28:13 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "We introduce {\em external synchrony}, a new model for
                 local file I/O that provides the reliability and
                 simplicity of synchronous I/O, yet also closely
                 approximates the performance of asynchronous I/O. An
                 external observer cannot distinguish the output of a
                 computer with an externally synchronous file system
                 from the output of a computer with a synchronous file
                 system. No application modification is required to use
                 an externally synchronous file system. In fact,
                 application developers can program to the simpler
                 synchronous I/O abstraction and still receive excellent
                 performance. We have implemented an externally
                 synchronous file system for Linux, called xsyncfs.
                 Xsyncfs provides the same durability and
                  ordering guarantees as those provided by a {\em
                 synchronously\/} mounted ext3 file system. Yet even for
                 I/O-intensive benchmarks, xsyncfs performance is within
                 7\% of ext3 mounted {\em asynchronously}. Compared to
                 ext3 mounted synchronously, xsyncfs is up to two orders
                 of magnitude faster.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "causality; file systems; speculative execution;
                 synchronous I/O",
}

@Article{Agrawal:2008:AWS,
  author =       "Kunal Agrawal and Charles E. Leiserson and Yuxiong He
                 and Wen Jing Hsu",
  title =        "Adaptive work-stealing with parallelism feedback",
  journal =      j-TOCS,
  volume =       "26",
  number =       "3",
  pages =        "7:1--7:32",
  month =        sep,
  year =         "2008",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1394441.1394443",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Sep 17 14:28:13 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Multiprocessor scheduling in a shared multiprogramming
                 environment can be structured as two-level scheduling,
                 where a kernel-level job scheduler allots processors to
                 jobs and a user-level thread scheduler schedules the
                 work of a job on its allotted processors. We present a
                 randomized work-stealing thread scheduler for fork-join
                 multithreaded jobs that provides continual parallelism
                 feedback to the job scheduler in the form of requests
                 for processors. Our A-STEAL algorithm is appropriate
                 for large parallel servers where many jobs share a
                 common multiprocessor resource and in which the number
                 of processors available to a particular job may vary
                 during the job's execution. Assuming that the job
                 scheduler never allots a job more processors than
                 requested by the job's thread scheduler, A-STEAL
                 guarantees that the job completes in near-optimal time
                 while utilizing at least a constant fraction of the
                 allotted processors.\par

                 We model the job scheduler as the thread scheduler's
                 adversary, challenging the thread scheduler to be
                 robust to the operating environment as well as to the
                 job scheduler's administrative policies. For example,
                 the job scheduler might make a large number of
                 processors available exactly when the job has little
                 use for them. To analyze the performance of our
                 adaptive thread scheduler under this stringent
                 adversarial assumption, we introduce a new technique
                 called {\em trim analysis,\/} which allows us to prove
                 that our thread scheduler performs poorly on no more
                 than a small number of time steps, exhibiting
                 near-optimal behavior on the vast majority.\par

                 More precisely, suppose that a job has work $T_1$ and
                 span $T_\infty$. On a machine with $P$ processors,
                 A-STEAL completes the job in an expected duration of
                 $O(T_1 / \tilde{P} + T_\infty + L \lg P)$ time steps,
                 where $L$ is the length of a scheduling quantum, and
                 $\tilde{P}$ denotes the $O(T_\infty + L \lg P)$-trimmed
                 availability. This quantity is the average of the
                 processor availability over all time steps except the
                 $O(T_\infty + L \lg P)$ time steps that have the
                 highest processor availability. When the job's
                 parallelism dominates the trimmed availability, that
                 is, $\tilde{P} \ll T_1 / T_\infty$, the job achieves
                 nearly perfect linear speedup. Conversely, when the
                 trimmed mean dominates the parallelism, the asymptotic
                 running time of the job is nearly the length of its
                 span, which is optimal.\par

                 We measured the performance of A-STEAL on a simulated
                 multiprocessor system using synthetic workloads. For
                 jobs with sufficient parallelism, our experiments
                 confirm that A-STEAL provides almost perfect linear
                 speedup across a variety of processor availability
                 profiles. We compared A-STEAL with the ABP algorithm,
                 an adaptive work-stealing thread scheduler developed by
                 Arora et al. [1998] which does not employ parallelism
                 feedback. On moderately to heavily loaded machines with
                 large numbers of processors, A-STEAL typically
                 completed jobs more than twice as quickly as ABP,
                 despite being allotted the same number or fewer
                 processors on every step, while wasting only 10\% of
                 the processor cycles wasted by ABP.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "adaptive scheduling; adversary; instantaneous
                 parallelism; job scheduling; multiprocessing;
                 multiprogramming; parallel computation; parallelism
                 feedback; processor allocation; randomized algorithm;
                 space sharing; span; thread scheduling; trim analysis;
                 two-level scheduling; work; work-stealing",
}
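
The bound quoted in the abstract above uses the $O(T_\infty + L \lg P)$-trimmed availability, defined there as the average processor availability over all time steps except the steps with the highest availability. The helper below simply evaluates that definition numerically on a hypothetical availability trace.

    # Numeric evaluation of trimmed availability as defined in the abstract.
    def trimmed_availability(availability, r):
        """Average availability after discarding the r largest time steps."""
        kept = sorted(availability)[:max(len(availability) - r, 0)]
        return sum(kept) / len(kept) if kept else 0.0

    if __name__ == "__main__":
        trace = [4, 4, 64, 4, 8, 4, 32, 4]       # processors available per step
        print(trimmed_availability(trace, r=2))  # drop the two best steps, about 4.67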

@Article{Shieh:2008:SAC,
  author =       "Alan Shieh and Andrew C. Myers and Emin G{\"u}n
                 Sirer",
  title =        "A stateless approach to connection-oriented
                 protocols",
  journal =      j-TOCS,
  volume =       "26",
  number =       "3",
  pages =        "8:1--8:50",
  month =        sep,
  year =         "2008",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1394441.1394444",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Sep 17 14:28:13 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Traditional operating system interfaces and network
                 protocol implementations force some system state to be
                 kept on both sides of a connection. This state ties the
                 connection to its endpoints, impedes transparent
                 failover, permits denial-of-service attacks, and limits
                 scalability. This article introduces a novel TCP-like
                 transport protocol and a new interface to replace
                 sockets that together enable all state to be kept on
                 one endpoint, allowing the other endpoint, typically
                 the server, to operate without any per-connection
                 state. Called {\em Trickles}, this approach enables
                 servers to scale well with increasing numbers of
                 clients, consume fewer resources, and better resist
                 denial-of-service attacks. Measurements on a full
                 implementation in Linux indicate that Trickles achieves
                 performance comparable to TCP/IP, interacts well with
                 other flows, and scales well. Trickles also enables
                 qualitatively different kinds of networked services.
                 Services can be geographically replicated and contacted
                 through an anycast primitive for improved availability
                 and performance. Widely-deployed practices that
                 currently have client-observable side effects, such as
                 periodic server reboots, connection redirection, and
                 failover, can be made transparent, and perform well,
                 under Trickles. The protocol is secure against
                 tampering and replay attacks, and the client interface
                 is backward-compatible, requiring no changes to
                 sockets-based client applications.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "stateless interfaces; stateless protocols",
}

@Article{Costa:2008:VEE,
  author =       "Manuel Costa and Jon Crowcroft and Miguel Castro and
                 Antony Rowstron and Lidong Zhou and Lintao Zhang and
                 Paul Barham",
  title =        "{Vigilante}: End-to-end containment of {Internet} worm
                 epidemics",
  journal =      j-TOCS,
  volume =       "26",
  number =       "4",
  pages =        "9:1--9:??",
  month =        dec,
  year =         "2008",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1455258.1455259",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Dec 23 13:36:21 MST 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Worm containment must be automatic because worms can
                 spread too fast for humans to respond. Recent work
                 proposed network-level techniques to automate worm
                 containment; these techniques have limitations because
                 there is no information about the vulnerabilities
                 exploited by worms at the network level. We propose
                 Vigilante, a new end-to-end architecture to contain
                 worms automatically that addresses these
                 limitations.\par

                 In Vigilante, hosts detect worms by instrumenting
                 vulnerable programs to analyze infection attempts. We
                 introduce {\em dynamic data-flow analysis\/}: a
                 broad-coverage host-based algorithm that can detect
                 unknown worms by tracking the flow of data from network
                 messages and disallowing unsafe uses of this data. We
                 also show how to integrate other host-based detection
                 mechanisms into the Vigilante architecture. Upon
                 detection, hosts generate {\em self-certifying
                 alerts\/} (SCAs), a new type of security alert that can
                 be inexpensively verified by any vulnerable host. Using
                 SCAs, hosts can cooperate to contain an outbreak,
                 without having to trust each other. Vigilante
                 broadcasts SCAs over an overlay network that propagates
                 alerts rapidly and resiliently. Hosts receiving an SCA
                 protect themselves by generating filters with {\em
                 vulnerability condition slicing\/}: an algorithm that
                 performs dynamic analysis of the vulnerable program to
                 identify control-flow conditions that lead to
                 successful attacks. These filters block the worm attack
                 and all its polymorphic mutations that follow the
                 execution path identified by the SCA.\par

                 Our results show that Vigilante can contain
                 fast-spreading worms that exploit unknown
                 vulnerabilities, and that Vigilante's filters introduce
                 a negligible performance overhead. Vigilante does not
                 require any changes to hardware, compilers, operating
                 systems, or the source code of vulnerable programs;
                 therefore, it can be used to protect current software
                 binaries.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "dynamic data-flow analysis; program analysis;
                 self-certifying alerts; vulnerability condition
                 slicing; Worm containment",
}
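
As a toy companion to the dynamic data-flow analysis the abstract describes (vastly simpler than Vigilante's detector), the sketch below tags values derived from network input as tainted, propagates the tag through copies, and refuses to use a tainted value as a control-flow target. All names are made up for illustration.

    # Toy taint tracking: block control flow derived from network data.
    class Value:
        def __init__(self, data, tainted=False):
            self.data, self.tainted = data, tainted

    def from_network(data):
        return Value(data, tainted=True)         # everything off the wire is tainted

    def copy(value):
        return Value(value.data, value.tainted)  # taint follows data movement

    def jump_to(target):
        if target.tainted:
            raise RuntimeError("blocked: jump target derived from network input")
        return target.data

    if __name__ == "__main__":
        try:
            jump_to(copy(from_network(0xDEADBEEF)))
        except RuntimeError as err:
            print(err)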

@Article{Qiao:2008:IPP,
  author =       "Yi Qiao and Fabi{\'a}n E. Bustamante and Peter A.
                 Dinda and Stefan Birrer and Dong Lu",
  title =        "Improving peer-to-peer performance through server-side
                 scheduling",
  journal =      j-TOCS,
  volume =       "26",
  number =       "4",
  pages =        "10:1--10:??",
  month =        dec,
  year =         "2008",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1455258.1455260",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Dec 23 13:36:21 MST 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "We show how to significantly improve the mean response
                 time seen by both uploaders and downloaders in
                 peer-to-peer data-sharing systems. Our work is
                 motivated by the observation that response times are
                 largely determined by the performance of the peers
                 serving the requested objects, that is, by the peers in
                 their capacity as servers. With this in mind, we take a
                 close look at this {\em server side\/} of peers,
                 characterizing its workload by collecting and examining
                 an extensive set of traces. Using trace-driven
                 simulation, we demonstrate the promise and potential
                 problems with scheduling policies based on
                 shortest-remaining-processing-time (SRPT), the
                 algorithm known to be optimal for minimizing mean
                 response time. The key challenge to using SRPT in this
                 context is determining request service times. In
                 addressing this challenge, we introduce two new
                 estimators that enable {\em predictive\/} SRPT
                 scheduling policies that closely approach the
                 performance of ideal SRPT. We evaluate our approach
                 through extensive single-server and system-level
                 simulation coupled with real Internet deployment and
                 experimentation.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "Peer-to-peer; scheduling; server-side; size-based
                 scheduling; SRPT",
}
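
The scheduling rule at the heart of the abstract above is SRPT: always serve the request with the shortest remaining processing time. The sketch below shows only that core rule on a static request set (no arrivals, no preemption, and none of the article's service-time estimators); the request sizes are hypothetical.

    # Core SRPT ordering on a static request set (illustration only).
    import heapq

    def srpt_order(remaining_seconds):
        """Yield request ids in the order SRPT serves them, absent new arrivals."""
        heap = [(t, rid) for rid, t in remaining_seconds.items()]
        heapq.heapify(heap)
        while heap:
            _, rid = heapq.heappop(heap)
            yield rid

    if __name__ == "__main__":
        print(list(srpt_order({"req-a": 12.0, "req-b": 3.5, "req-c": 7.0})))
        # -> ['req-b', 'req-c', 'req-a']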

@Article{Choi:2009:HCS,
  author =       "Seungryul Choi and Donald Yeung",
  title =        "Hill-climbing {SMT} processor resource distribution",
  journal =      j-TOCS,
  volume =       "27",
  number =       "1",
  pages =        "1:1--1:??",
  month =        feb,
  year =         "2009",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Feb 13 18:30:25 MST 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "The key to high performance in Simultaneous
                 MultiThreaded (SMT) processors lies in optimizing the
                 distribution of shared resources to active threads.
                 Existing resource distribution techniques optimize
                 performance only indirectly. They infer potential
                 performance bottlenecks by observing indicators, like
                 instruction occupancy or cache miss counts, and take
                 actions to try to alleviate them. While the corrective
                 actions are designed to improve performance, their
                 actual performance impact is not known since end
                 performance is never monitored. Consequently, potential
                 performance gains are lost whenever the corrective
                 actions do not effectively address the actual
                 bottlenecks occurring in the pipeline.\par

                 We propose a different approach to SMT resource
                 distribution that optimizes end performance directly.
                 Our approach observes the impact that resource
                 distribution decisions have on performance at runtime,
                 and feeds this information back to the resource
                 distribution mechanisms to improve future decisions. By
                 evaluating many different resource distributions, our
                 approach tries to learn the best distribution over
                 time. Because we perform learning online, learning time
                 is crucial. We develop a hill-climbing algorithm that
                 quickly learns the best distribution of resources by
                 following the performance gradient within the resource
                 distribution space. We also develop several ideal
                 learning algorithms to enable deeper insights through
                 limit studies.\par

                 This article conducts an in-depth investigation of
                 hill-climbing SMT resource distribution using a
                 comprehensive suite of 63 multiprogrammed workloads.
                 Our results show hill-climbing outperforms ICOUNT,
                 FLUSH, and DCRA (three existing SMT techniques) by
                 11.4\%, 11.5\%, and 2.8\%, respectively, under the
                 weighted IPC metric. A limit study conducted using our
                 ideal learning algorithms shows our approach can
                 potentially outperform the same techniques by 19.2\%,
                 18.0\%, and 7.6\%, respectively, thus demonstrating
                 additional room exists for further improvement. Using
                 our ideal algorithms, we also identify three
                 bottlenecks that limit online learning speed: local
                 maxima, phased behavior, and interepoch jitter. We
                 define metrics to quantify these learning bottlenecks,
                 and characterize the extent to which they occur in our
                 workloads. Finally, we conduct a sensitivity study, and
                 investigate several extensions to improve our
                 hill-climbing technique.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}
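
%%% A minimal sketch of the hill-climbing idea summarized above, under
%%% simplifying assumptions: a single shared resource (say, issue-queue
%%% entries) is partitioned between two threads, and a hypothetical
%%% measure_ipc(share) callback runs one epoch with the given partition
%%% and reports end performance, so the partition can follow the
%%% observed performance gradient.
%%%
%%%    def hill_climb(total_entries, measure_ipc, epochs=50, step=2):
%%%        share = total_entries // 2        # start from an even split
%%%        best_ipc = measure_ipc(share)
%%%        for _ in range(epochs):
%%%            # Probe one step in each direction and keep whichever
%%%            # trial epoch improved measured end performance.
%%%            for trial in (max(step, share - step),
%%%                          min(total_entries - step, share + step)):
%%%                ipc = measure_ipc(trial)
%%%                if ipc > best_ipc:
%%%                    best_ipc, share = ipc, trial
%%%        return share, best_ipc
%%%
%%%    # toy performance model with a single optimum at 40 of 64 entries
%%%    print(hill_climb(64, lambda s: 1.0 - abs(s - 40) / 64.0))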

@Article{Ntarmos:2009:DHS,
  author =       "N. Ntarmos and P. Triantafillou and G. Weikum",
  title =        "Distributed hash sketches: {Scalable}, efficient, and
                 accurate cardinality estimation for distributed
                 multisets",
  journal =      j-TOCS,
  volume =       "27",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2009",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Feb 13 18:30:25 MST 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Counting items in a distributed system, and estimating
                 the cardinality of multisets in particular, is
                 important for a large variety of applications and a
                 fundamental building block for emerging Internet-scale
                 information systems. Examples of such applications
                 range from optimizing query access plans in
                 peer-to-peer data sharing, to computing the
                 significance (rank/score) of data items in distributed
                 information retrieval. The general formal problem
                 addressed in this article is computing the network-wide
                 distinct number of items with some property (e.g.,
                 distinct files with file name containing ``spiderman'')
                 where each node in the network holds an arbitrary
                 subset, possibly overlapping the subsets of other
                 nodes. The key requirements that a viable approach must
                 satisfy are: (1) scalability towards very large network
                 size, (2) efficiency regarding messaging overhead, (3)
                 load balance of storage and access, (4) accuracy of the
                 cardinality estimation, and (5) simplicity and easy
                 integration in applications. This article contributes
                 the DHS (Distributed Hash Sketches) method for this
                 problem setting: a distributed, scalable, efficient,
                 and accurate multiset cardinality estimator. DHS is
                 based on hash sketches for probabilistic counting, but
                 distributes the bits of each counter across network
                 nodes in a judicious manner based on principles of
                 Distributed Hash Tables, paying careful attention to
                 fast access and aggregation as well as update costs.
                 The article discusses various design choices,
                 exhibiting tunable trade-offs between estimation
                 accuracy, hop-count efficiency, and load distribution
                 fairness. We further contribute a full-fledged,
                 publicly available, open-source implementation of all
                 our methods, and a comprehensive experimental
                 evaluation for various settings.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}
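
%%% A minimal sketch of the hash-sketch (Flajolet--Martin style)
%%% counting that DHS builds on, shown for a single node; DHS itself
%%% distributes the individual bits of such counters across DHT nodes,
%%% which is not reproduced here.
%%%
%%%    import hashlib
%%%
%%%    def _rho(item):
%%%        # Index of the least-significant 1 bit of the item's hash.
%%%        h = int(hashlib.sha1(item.encode()).hexdigest(), 16)
%%%        return (h & -h).bit_length() - 1
%%%
%%%    def estimate_cardinality(items, bits=32):
%%%        sketch = [0] * bits
%%%        for item in items:
%%%            sketch[min(_rho(item), bits - 1)] = 1   # duplicates hit the same bit
%%%        r = next((i for i, b in enumerate(sketch) if b == 0), bits)
%%%        return int(2 ** r / 0.77351)                # Flajolet--Martin correction
%%%
%%%    # duplicates do not inflate the estimate
%%%    items = ["file-%d" % (i % 10000) for i in range(30000)]
%%%    print(estimate_cardinality(items))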

@Article{Eyerman:2009:MPM,
  author =       "Stijn Eyerman and Lieven Eeckhout and Tejas Karkhanis
                 and James E. Smith",
  title =        "A mechanistic performance model for superscalar
                 out-of-order processors",
  journal =      j-TOCS,
  volume =       "27",
  number =       "2",
  pages =        "3:1--3:??",
  month =        may,
  year =         "2009",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1534909.1534910",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed May 27 15:56:17 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "A mechanistic model for out-of-order superscalar
                 processors is developed and then applied to the study
                 of microarchitecture resource scaling. The model
                 divides execution time into intervals separated by
                 disruptive miss events such as branch mispredictions
                 and cache misses. Each type of miss event results in
                 characterizable performance behavior for the execution
                 time interval. By considering an interval's type and
                 length (measured in instructions), execution time can
                 be predicted for the interval. Overall execution time
                 is then determined by aggregating the execution time
                 over all intervals. The mechanistic model provides
                 several advantages over prior modeling approaches, and,
                 when estimating performance, it differs from detailed
                 simulation of a 4-wide out-of-order processor by an
                 average of 7\%.\par

                 The mechanistic model is applied to the general problem
                 of resource scaling in out-of-order superscalar
                 processors. First, we use the model to determine size
                 relationships among microarchitecture structures in a
                 balanced processor design. Second, we use the
                 mechanistic model to study scaling of both pipeline
                 depth and width in balanced processor designs. We
                 corroborate previous results in this area and provide
                 new results. For example, we show that at optimal
                 design points, the pipeline depth times the square root
                 of the processor width is nearly constant. Finally, we
                 consider the behavior of unbalanced, overprovisioned
                 processor designs based on insight gained from the
                 mechanistic model. We show that in certain situations
                 an overprovisioned processor may lead to improved
                 overall performance. Designs where a processor's
                 dispatch width is wider than its issue width are of
                 particular interest.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "analytical modeling; balanced processor design;
                 mechanistic modeling; overprovisioned processor design;
                 performance modeling; pipeline depth; pipeline width;
                 resource scaling; Superscalar out-of-order processor;
                 wide front-end dispatch processors",
}
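
%%% A minimal sketch of the interval-based accounting the abstract
%%% describes, with assumed inputs: each interval is a pair
%%% (instructions, miss-event penalty in cycles), and a width-wide
%%% processor issues `width` instructions per cycle between miss
%%% events; the published model is considerably more refined.
%%%
%%%    import math
%%%
%%%    def interval_model_cycles(intervals, width):
%%%        # Instructions issue at the steady-state rate of `width` per
%%%        # cycle, followed by the penalty of the miss event (branch
%%%        # misprediction, cache miss, ...) that ends the interval.
%%%        return sum(math.ceil(instr / width) + penalty
%%%                   for instr, penalty in intervals)
%%%
%%%    # toy trace on a 4-wide machine: (instructions, penalty) pairs
%%%    intervals = [(400, 0), (120, 14), (2000, 200)]
%%%    print(interval_model_cycles(intervals, width=4))   # 844 cycles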

@Article{Zagorodnov:2009:PLO,
  author =       "Dmitrii Zagorodnov and Keith Marzullo and Lorenzo
                 Alvisi and Thomas C. Bressoud",
  title =        "Practical and low-overhead masking of failures of
                 {TCP}-based servers",
  journal =      j-TOCS,
  volume =       "27",
  number =       "2",
  pages =        "4:1--4:??",
  month =        may,
  year =         "2009",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1534909.1534911",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed May 27 15:56:17 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "This article describes an architecture that allows a
                 replicated service to survive crashes without breaking
                 its TCP connections. Our approach does not require
                 modifications to the TCP protocol, to the operating
                 system on the server, or to any of the software running
                 on the clients. Furthermore, it runs on commodity
                 hardware. We compare two implementations of this
                 architecture (one based on primary/backup replication
                 and another based on message logging) focusing on
                 scalability, failover time, and application
                 transparency. We evaluate three types of services: a
                 file server, a Web server, and a multimedia streaming
                 server. Our experiments suggest that the approach
                 incurs low overhead on throughput, scales well as the
                 number of clients increases, and allows recovery of the
                 service in near-optimal time.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "Fault-tolerant computing system; primary/backup
                 approach; TCP/IP",
}

@Article{Aguilera:2009:SNP,
  author =       "Marcos K. Aguilera and Arif Merchant and Mehul Shah
                 and Alistair Veitch and Christos Karamanolis",
  title =        "{Sinfonia}: a new paradigm for building scalable
                 distributed systems",
  journal =      j-TOCS,
  volume =       "27",
  number =       "3",
  pages =        "5:1--5:48",
  month =        nov,
  year =         "2009",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1629087.1629088",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Mar 15 09:06:12 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "We propose a new paradigm for building scalable
                 distributed systems. Our approach does not require
                 dealing with message-passing protocols, a major
                 complication in existing distributed systems. Instead,
                 developers just design and manipulate data structures
                 within our service called Sinfonia. Sinfonia keeps data
                 for applications on a set of memory nodes, each
                 exporting a linear address space. At the core of
                 Sinfonia is a new minitransaction primitive that
                 enables efficient and consistent access to data, while
                 hiding the complexities that arise from concurrency and
                 failures. Using Sinfonia, we implemented two very
                 different and complex applications in a few months: a
                 cluster file system and a group communication service.
                 Our implementations perform well and scale to hundreds
                 of machines.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "Distributed systems; fault tolerance; scalability;
                 shared memory; transactions; two-phase commit",
}
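
%%% A minimal sketch, not Sinfonia's actual API, of the minitransaction
%%% shape the abstract describes: compare, read, and write items over
%%% (memory-node, address, length) ranges that either all take effect
%%% atomically or not at all. The in-memory MemoryNode class below is
%%% purely illustrative.
%%%
%%%    class MemoryNode:
%%%        """Stand-in for one memory node's linear address space."""
%%%        def __init__(self, size):
%%%            self.mem = bytearray(size)
%%%        def read(self, addr, length):
%%%            return bytes(self.mem[addr:addr + length])
%%%        def write(self, addr, data):
%%%            self.mem[addr:addr + len(data)] = data
%%%
%%%    class Minitransaction:
%%%        def __init__(self):
%%%            self.compares, self.reads, self.writes = [], [], []
%%%        def cmp(self, node, addr, expected):
%%%            self.compares.append((node, addr, expected))
%%%        def read(self, node, addr, length):
%%%            self.reads.append((node, addr, length))
%%%        def write(self, node, addr, data):
%%%            self.writes.append((node, addr, data))
%%%        def exec_and_commit(self, nodes):
%%%            # Writes apply only if every compare item still matches;
%%%            # the real system runs this as a two-phase protocol
%%%            # piggybacked on the minitransaction messages.
%%%            if not all(nodes[n].read(a, len(e)) == e
%%%                       for n, a, e in self.compares):
%%%                return False, None
%%%            results = [nodes[n].read(a, l) for n, a, l in self.reads]
%%%            for n, a, d in self.writes:
%%%                nodes[n].write(a, d)
%%%            return True, results
%%%
%%%    nodes = {0: MemoryNode(1024)}
%%%    t = Minitransaction()
%%%    t.cmp(0, 0, bytes(8))        # only if the slot is still empty
%%%    t.write(0, 0, b"record-1")
%%%    print(t.exec_and_commit(nodes))    # (True, [])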

@Article{Cherkasova:2009:AAD,
  author =       "Ludmila Cherkasova and Kivanc Ozonat and Ningfang Mi
                 and Julie Symons and Evgenia Smirni",
  title =        "Automated anomaly detection and performance modeling
                 of enterprise applications",
  journal =      j-TOCS,
  volume =       "27",
  number =       "3",
  pages =        "6:1--6:32",
  month =        nov,
  year =         "2009",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1629087.1629089",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Mar 15 09:06:12 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Automated tools for understanding application behavior
                 and its changes during the application lifecycle are
                 essential for many performance analysis and debugging
                 tasks. Application performance issues have an immediate
                 impact on customer experience and satisfaction. A
                  sudden slowdown of an enterprise-wide application can
                  affect a large population of customers, lead to delayed
                 projects, and ultimately can result in company
                 financial loss. Significantly shortened time between
                 new software releases further exacerbates the problem
                 of thoroughly evaluating the performance of an updated
                 application. Our thesis is that online performance
                 modeling should be a part of routine application
                 monitoring. Early, informative warnings on significant
                  changes in application performance should help service
                  providers identify and prevent performance problems,
                  and their negative impact on the service, in a timely
                  manner. We
                 propose a novel framework for automated anomaly
                 detection and application change analysis. It is based
                 on integration of two complementary techniques: (i) a
                 regression-based transaction model that reflects a
                 resource consumption model of the application, and (ii)
                 an application performance signature that provides a
                 compact model of runtime behavior of the application.
                 The proposed integrated framework provides a simple and
                 powerful solution for anomaly detection and analysis of
                 essential performance changes in application behavior.
                 An additional benefit of the proposed approach is its
                 simplicity: It is not intrusive and is based on
                 monitoring data that is typically available in
                 enterprise production environments. The introduced
                 solution further enables the automation of capacity
                 planning and resource provisioning tasks of multitier
                 applications in rapidly evolving IT environments.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "Anomaly detection; capacity planning; multitier
                 applications; online algorithms; performance modeling",
}
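
%%% A minimal sketch of the regression-based transaction model described
%%% above, under the common assumption that observed CPU utilization is
%%% roughly a linear combination of per-transaction-type rates; the
%%% fitted coefficients act as per-type CPU costs, and persistent large
%%% residuals on fresh monitoring windows flag an anomaly or an
%%% application change. Uses ordinary least squares from numpy.
%%%
%%%    import numpy as np
%%%
%%%    def fit_transaction_model(tx_rates, cpu_util):
%%%        # tx_rates: windows x transaction types (tx/sec);
%%%        # cpu_util: observed utilization per window.
%%%        costs, *_ = np.linalg.lstsq(tx_rates, cpu_util, rcond=None)
%%%        return costs
%%%
%%%    def anomaly_score(costs, tx_rates, cpu_util):
%%%        predicted = tx_rates @ costs
%%%        return np.abs(predicted - cpu_util) / np.maximum(cpu_util, 1e-9)
%%%
%%%    # toy data: two transaction types costing ~2 ms and ~5 ms of CPU
%%%    rates = np.array([[100.0, 20.0], [80.0, 50.0], [120.0, 10.0]])
%%%    util = rates @ np.array([0.2, 0.5])        # percent CPU
%%%    costs = fit_transaction_model(rates, util)
%%%    print(anomaly_score(costs, rates, util))   # ~0 on training data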

@Article{Kotla:2009:ZSB,
  author =       "Ramakrishna Kotla and Lorenzo Alvisi and Mike Dahlin
                 and Allen Clement and Edmund Wong",
  title =        "{Zyzzyva}: {Speculative Byzantine} fault tolerance",
  journal =      j-TOCS,
  volume =       "27",
  number =       "4",
  pages =        "7:1--7:39",
  month =        dec,
  year =         "2009",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1658357.1658358",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Mar 15 09:06:46 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "A longstanding vision in distributed systems is to
                 build reliable systems from unreliable components. An
                 enticing formulation of this vision is Byzantine
                 Fault-Tolerant (BFT) state machine replication, in
                 which a group of servers collectively act as a correct
                 server even if some of the servers misbehave or
                 malfunction in arbitrary (``Byzantine'') ways. Despite
                 this promise, practitioners hesitate to deploy BFT
                 systems, at least partly because of the perception that
                 BFT must impose high overheads.\par

                 In this article, we present Zyzzyva, a protocol that
                 uses speculation to reduce the cost of BFT replication.
                 In Zyzzyva, replicas reply to a client's request
                 without first running an expensive three-phase commit
                 protocol to agree on the order to process requests.
                 Instead, they optimistically adopt the order proposed
                 by a primary server, process the request, and reply
                 immediately to the client. If the primary is faulty,
                 replicas can become temporarily inconsistent with one
                 another, but clients detect inconsistencies, help
                 correct replicas converge on a single total ordering of
                 requests, and only rely on responses that are
                 consistent with this total order. This approach allows
                 Zyzzyva to reduce replication overheads to near their
                 theoretical minima and to achieve throughputs of tens
                 of thousands of requests per second, making BFT
                 replication practical for a broad range of demanding
                 services.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "Byzantine fault tolerance; output commit; replication;
                 speculative execution",
}
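
%%% A minimal sketch of the client-side speculation check the abstract
%%% describes, assuming 3f+1 replicas: the client completes a request
%%% immediately when all 3f+1 speculative replies match, and falls back
%%% to the protocol's slower commit path (not shown) when only 2f+1
%%% matching replies arrive.
%%%
%%%    from collections import Counter
%%%
%%%    def classify_replies(replies, f):
%%%        """replies: list of (history_digest, result) pairs."""
%%%        top = Counter(replies).most_common(1)
%%%        matching = top[0][1] if top else 0
%%%        if matching == 3 * f + 1:
%%%            return "complete"            # fast path, no commit phase
%%%        if matching >= 2 * f + 1:
%%%            return "needs-commit-phase"  # gather a commit certificate
%%%        return "wait-or-retransmit"      # possibly a faulty primary
%%%
%%%    f = 1
%%%    replies = [("h1", "ok")] * 4
%%%    print(classify_replies(replies, f))        # complete
%%%    print(classify_replies(replies[:3], f))    # needs-commit-phase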

@Article{Vera:2009:SRL,
  author =       "Xavier Vera and Jaume Abella and Javier Carretero and
                 Antonio Gonz{\'a}lez",
  title =        "Selective replication: a lightweight technique for
                 soft errors",
  journal =      j-TOCS,
  volume =       "27",
  number =       "4",
  pages =        "8:1--8:30",
  month =        dec,
  year =         "2009",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1658357.1658359",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Mar 15 09:06:46 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Soft errors are an important challenge in contemporary
                 microprocessors. Modern processors have caches and
                 large memory arrays protected by parity or error
                 detection and correction codes. However, today's
                 failure rate is dominated by flip flops, latches, and
                 the increasing sensitivity of combinational logic to
                 particle strikes. Moreover, as Chip Multi-Processors
                 (CMPs) become ubiquitous, meeting the FIT budget for
                 new designs is becoming a major
                 challenge.\par

                 Solutions based on replicating threads have been
                 explored deeply; however, their high cost in
                 performance and energy make them unsuitable for current
                 designs. Moreover, our studies based on a typical
                 configuration for a modern processor show that focusing
                 on the top 5 most vulnerable structures can provide up
                 to 70\% reduction in FIT rate. Therefore, full
                 replication may overprotect the chip by reducing the
                 FIT much below budget.\par

                 We propose {\em Selective Replication}, a
                 lightweight-reconfigurable mechanism that achieves a
                 high FIT reduction by protecting the most vulnerable
                 instructions with minimal performance and energy
                 impact. Low performance degradation is achieved by not
                  requiring additional issue slots and by reissuing
                  instructions only during the time window between when
                  they become retirable and when they actually retire.
                  Coverage
                 can be reconfigured online by replicating only a subset
                 of the instructions (the most vulnerable ones).
                 Instructions' vulnerability is estimated based on the
                 area they occupy and the time they spend in the issue
                 queue. By changing the vulnerability threshold, we can
                 adjust the trade-off between coverage and performance
                 loss.\par

                 Results for an out-of-order processor configured
                 similarly to Intel{\reg} Core\TM{} Micro-Architecture
                 show that our scheme can achieve over 65\% FIT
                 reduction with less than 4\% performance degradation
                 with small area and complexity overhead.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "AVF prediction; FIT reduction; redundant
                 multithreading; Soft errors",
}
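
%%% A minimal sketch of the reconfigurable replication filter the
%%% abstract outlines, with assumed units: an instruction's
%%% vulnerability is approximated by the area it occupies times its
%%% residency in the issue queue, and only instructions above a tunable
%%% threshold are reissued for checking.
%%%
%%%    def should_replicate(instr, threshold):
%%%        vulnerability = instr["area"] * instr["issue_queue_cycles"]
%%%        return vulnerability >= threshold
%%%
%%%    window = [
%%%        {"pc": 0x400a, "area": 3.0, "issue_queue_cycles": 40},
%%%        {"pc": 0x400e, "area": 1.0, "issue_queue_cycles": 2},
%%%    ]
%%%    # Raising the threshold trades coverage for performance loss.
%%%    print([hex(i["pc"]) for i in window if should_replicate(i, 50)])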

@Article{Chen:2010:E,
  author =       "Peter M. Chen",
  title =        "Editorial",
  journal =      j-TOCS,
  volume =       "28",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2010",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1731060.1731061",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Apr 5 12:44:43 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Yabandeh:2010:PPI,
  author =       "Maysam Yabandeh and Nikola Kne{\v{z}}evi{\'c} and
                 Dejan Kosti{\'c} and Viktor Kuncak",
  title =        "Predicting and preventing inconsistencies in deployed
                 distributed systems",
  journal =      j-TOCS,
  volume =       "28",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2010",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1731060.1731062",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Apr 5 12:44:43 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "We propose a new approach for developing and deploying
                 distributed systems, in which nodes predict distributed
                 consequences of their actions and use this information
                 to detect and avoid errors. Each node continuously runs
                 a state exploration algorithm on a recent consistent
                 snapshot of its neighborhood and predicts possible
                 future violations of specified safety properties. We
                 describe a new state exploration algorithm, consequence
                 prediction, which explores causally related chains of
                 events that lead to property violation.\par

                 This article describes the design and implementation of
                 this approach, termed CrystalBall. We evaluate
                 CrystalBall on RandTree, BulletPrime, Paxos, and Chord
                 distributed system implementations. We identified new
                 bugs in mature Mace implementations of three systems.
                 Furthermore, we show that if the bug is not corrected
                 during system development, CrystalBall is effective in
                 steering the execution away from inconsistent states at
                 runtime.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "consequence prediction; Distributed systems; enforcing
                 safety properties; execution steering; reliability",
}

@Article{Walfish:2010:DDO,
  author =       "Michael Walfish and Mythili Vutukuru and Hari
                 Balakrishnan and David Karger and Scott Shenker",
  title =        "{DDoS} defense by offense",
  journal =      j-TOCS,
  volume =       "28",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2010",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1731060.1731063",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Apr 5 12:44:43 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "This article presents the design, implementation,
                 analysis, and experimental evaluation of {\em
                 speak-up}, a defense against {\em application-level\/}
                 distributed denial-of-service (DDoS), in which
                 attackers cripple a server by sending
                 legitimate-looking requests that consume computational
                 resources (e.g., CPU cycles, disk). With speak-up, a
                 victimized server encourages all clients, resources
                 permitting, {\em to automatically send higher volumes
                 of traffic}. We suppose that attackers are already
                  using most of their upload bandwidth, so they cannot
                  react to the encouragement. Good clients, however,
                  have spare upload bandwidth, so they can react to the
                  encouragement with
                 drastically higher volumes of traffic. The intended
                 outcome of this traffic inflation is that the good
                 clients crowd out the bad ones, thereby capturing a
                 much larger fraction of the server's resources than
                 before. We experiment under various conditions and find
                 that speak-up causes the server to spend resources on a
                 group of clients in rough proportion to their aggregate
                 upload bandwidths, which is the intended result.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "bandwidth; currency; DoS attack",
}
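
%%% A minimal calculation of speak-up's intended outcome, assuming (as
%%% the abstract states) that the server's resources end up divided in
%%% rough proportion to clients' upload bandwidths once every client is
%%% encouraged to send as much traffic as it can.
%%%
%%%    def expected_share(upload_bw, server_capacity):
%%%        total = sum(upload_bw.values())
%%%        return {c: server_capacity * bw / total
%%%                for c, bw in upload_bw.items()}
%%%
%%%    # attackers are already saturated, so only good clients can
%%%    # inflate their traffic in response to the encouragement
%%%    clients = {"good-1": 5.0, "good-2": 5.0, "attacker": 10.0}
%%%    print(expected_share(clients, server_capacity=1000.0))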

@Article{Roeder:2010:PO,
  author =       "Tom Roeder and Fred B. Schneider",
  title =        "Proactive obfuscation",
  journal =      j-TOCS,
  volume =       "28",
  number =       "2",
  pages =        "4:1--4:??",
  month =        jul,
  year =         "2010",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1813654.1813655",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jul 22 12:42:28 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "{\em Proactive obfuscation\/} is a new method for
                 creating server replicas that are likely to have fewer
                 shared vulnerabilities. It uses semantics-preserving
                 code transformations to generate diverse executables,
                 periodically restarting servers with these fresh
                 versions. The periodic restarts help bound the number
                 of compromised replicas that a service ever
                 concurrently runs, and therefore proactive obfuscation
                 makes an adversary's job harder. Proactive obfuscation
                 was used in implementing two prototypes: a distributed
                 firewall based on state-machine replication and a
                 distributed storage service based on quorum systems.
                 Costs intrinsic to supporting proactive obfuscation in
                 replicated systems were evaluated by measuring the
                 performance of these prototypes. The results show that
                 employing proactive obfuscation adds little to the cost
                 of replica-management protocols.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "Byzantine fault tolerance; distributed systems;
                 proactive recovery; quorum systems; state machine
                 approach",
}

@Article{Guerraoui:2010:TOT,
  author =       "Rachid Guerraoui and Ron R. Levy and Bastian Pochon
                 and Vivien Qu{\'e}ma",
  title =        "Throughput optimal total order broadcast for cluster
                 environments",
  journal =      j-TOCS,
  volume =       "28",
  number =       "2",
  pages =        "5:1--5:??",
  month =        jul,
  year =         "2010",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1813654.1813656",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jul 22 12:42:28 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Total order broadcast is a fundamental communication
                 primitive that plays a central role in bringing cheap
                 software-based high availability to a wide range of
                 services. This article studies the practical
                 performance of such a primitive on a cluster of
                 homogeneous machines.\par

                 We present LCR, the first throughput optimal uniform
                 total order broadcast protocol. LCR is based on a ring
                 topology. It only relies on point-to-point
                 inter-process communication and has a linear latency
                 with respect to the number of processes. LCR is also
                 fair in the sense that each process has an equal
                 opportunity of having its messages delivered by all
                 processes.\par

                 We benchmark a C implementation of LCR against Spread
                 and JGroups, two of the most widely used group
                 communication packages. LCR provides higher throughput
                 than the alternatives, over a large number of
                 scenarios.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "cluster computing; replication; software
                 fault-tolerance; total order broadcast",
}
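
%%% A minimal back-of-the-envelope comparison, not the LCR protocol
%%% itself, of why a ring can be throughput optimal: each broadcast
%%% message crosses every link exactly once, so aggregate throughput
%%% approaches a single link's bandwidth regardless of group size,
%%% whereas a sender that unicasts to the n-1 others divides its own
%%% link bandwidth by n-1.
%%%
%%%    def ring_broadcast_throughput(link_bw_mbps, n):
%%%        # every link carries each message once, independent of n
%%%        return link_bw_mbps
%%%
%%%    def naive_unicast_throughput(link_bw_mbps, n):
%%%        # the sender pushes n-1 copies through its own link
%%%        return link_bw_mbps / (n - 1)
%%%
%%%    for n in (4, 8, 16):
%%%        print(n, ring_broadcast_throughput(1000, n),
%%%              round(naive_unicast_throughput(1000, n), 1))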

@Article{Amir:2010:SWM,
  author =       "Yair Amir and Claudiu Danilov and Raluca
                 Musu{\~a}loiu-Elefteri and Nilo Rivera",
  title =        "The {SMesh} wireless mesh network",
  journal =      j-TOCS,
  volume =       "28",
  number =       "3",
  pages =        "6:1--6:??",
  month =        sep,
  year =         "2010",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1841313.1841314",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Sep 30 09:01:34 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Wireless mesh networks extend the connectivity range
                 of mobile devices by using multiple access points, some
                 of them connected to the Internet, to create a mesh
                 topology and forward packets over multiple wireless
                 hops. However, the quality of service provided by the
                 mesh is impaired by the delays and disconnections
                 caused by handoffs, as clients move within the area
                 covered by multiple access points. We present the
                 architecture and protocols of SMesh, the first
                 transparent wireless mesh system that offers seamless,
                 fast handoff, supporting real-time applications such as
                 interactive VoIP. The handoff and routing logic is done
                 solely by the access points, and therefore connectivity
                 is attainable by any 802.11 device. In SMesh, the
                 entire mesh network is seen by the mobile clients as a
                 single, omnipresent access point, giving the mobile
                 clients the illusion that they are stationary. We use
                  multicast for access-point coordination and, during
                 handoff transitions, we use more than one access point
                 to handle the moving client. SMesh provides a hybrid
                 routing protocol that optimizes routes over wireless
                 and wired links in a multihomed environment.
                 Experimental results on a fully deployed mesh network
                 demonstrate the effectiveness of the SMesh architecture
                 and its intra-domain and inter-domain handoff
                 protocols.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "fast handoff; inter-domain; intra-domain;
                 micromobility; Wireless mesh networks",
}

@Article{Friedman:2010:PQS,
  author =       "Roy Friedman and Gabriel Kliot and Chen Avin",
  title =        "Probabilistic quorum systems in wireless {Ad Hoc}
                 networks",
  journal =      j-TOCS,
  volume =       "28",
  number =       "3",
  pages =        "7:1--7:??",
  month =        sep,
  year =         "2010",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1841313.1841315",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Sep 30 09:01:34 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Quorums are a basic construct in solving many
                 fundamental distributed computing problems. One of the
                 known ways of making quorums scalable and efficient is
                 by weakening their intersection guarantee to being
                 probabilistic. This article explores several access
                 strategies for implementing probabilistic quorums in ad
                 hoc networks. In particular, we present the first
                 detailed study of asymmetric probabilistic biquorum
                  systems, which allow mixing different access strategies
                  and different quorum sizes while guaranteeing the
                 desired intersection probability. We show the
                 advantages of asymmetric probabilistic biquorum systems
                 in ad hoc networks. Such an asymmetric construction is
                 also useful for other types of networks with nonuniform
                 access costs (e.g., peer-to-peer networks). The article
                 includes a formal analysis of these approaches backed
                 up by an extensive simulation-based study. The study
                 explores the impact of various parameters such as
                 network size, network density, mobility speed, and
                 churn. In particular, we show that one of the
                 strategies that uses random walks exhibits the smallest
                 communication overhead, thus being very attractive for
                 ad hoc networks.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
  keywords =     "Distributed middleware; location service; quorums
                 systems; random walks; wireless ad hoc networks",
}
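
%%% A minimal calculation, assuming uniformly random quorum selection,
%%% of the intersection guarantee discussed above: two independently
%%% chosen quorums of sizes q1 and q2 out of n nodes fail to intersect
%%% with probability about exp(-q1*q2/n), so sizes near l*sqrt(n) give
%%% a failure probability near exp(-l^2); an asymmetric biquorum keeps
%%% q1*q2/n fixed while letting q1 and q2 differ.
%%%
%%%    import math
%%%
%%%    def non_intersection_prob(n, q1, q2):
%%%        # exact: C(n-q1, q2) / C(n, q2); exp(-q1*q2/n) is a proxy
%%%        return math.comb(n - q1, q2) / math.comb(n, q2)
%%%
%%%    n = 400
%%%    for q1, q2 in ((40, 40), (20, 80)):      # symmetric, asymmetric
%%%        print(q1, q2, round(non_intersection_prob(n, q1, q2), 4),
%%%              round(math.exp(-q1 * q2 / n), 4))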

@Article{Blagodurov:2010:CAS,
  author =       "Sergey Blagodurov and Sergey Zhuravlev and Alexandra
                 Fedorova",
  title =        "Contention-Aware Scheduling on Multicore Systems",
  journal =      j-TOCS,
  volume =       "28",
  number =       "4",
  pages =        "8:1--8:??",
  month =        dec,
  year =         "2010",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1880018.1880019",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Dec 23 17:06:32 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Contention for shared resources on multicore
                 processors remains an unsolved problem in existing
                  systems despite significant research efforts dedicated
                  to it in the past. Previous solutions focused
                 primarily on hardware techniques and software page
                 coloring to mitigate this problem. Our goal is to
                 investigate how and to what extent contention for
                 shared resource can be mitigated via thread scheduling.
                 Scheduling is an attractive tool, because it does not
                 require extra hardware and is relatively easy to
                 integrate into the system. Our study is the first to
                 provide a comprehensive analysis of
                 contention-mitigating techniques that use only
                 scheduling. The most difficult part of the problem is
                 to find a classification scheme for threads, which
                 would determine how they affect each other when
                 competing for shared resources.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Cheung:2010:LBC,
  author =       "Alex King Yeung Cheung and Hans-Arno Jacobsen",
  title =        "Load Balancing Content-Based Publish\slash Subscribe
                 Systems",
  journal =      j-TOCS,
  volume =       "28",
  number =       "4",
  pages =        "9:1--9:??",
  month =        dec,
  year =         "2010",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1880018.1880020",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Dec 23 17:06:32 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Distributed content-based publish/subscribe systems
                 suffer from performance degradation and poor
                 scalability caused by uneven load distributions typical
                 in real-world applications. The reason for this
                 shortcoming is the lack of a load balancing scheme.
                 This article proposes a load balancing solution
                 specifically tailored to the needs of content-based
                  publish/subscribe systems that is distributed, dynamic,
                  adaptive, and transparent, and that accommodates
                  heterogeneity.
                 The solution consists of three key contributions: a
                 load balancing framework, a novel load estimation
                 algorithm, and three offload strategies. A working
                 prototype of our solution is built on an open-sourced
                 content-based publish/subscribe system and evaluated on
                 PlanetLab, a cluster testbed, and in simulations.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Kim:2011:SSE,
  author =       "Changhoon Kim and Matthew Caesar and Jennifer
                 Rexford",
  title =        "{SEATTLE}: a {Scalable Ethernet Architecture for Large
                 Enterprises}",
  journal =      j-TOCS,
  volume =       "29",
  number =       "1",
  pages =        "1:1--1:35",
  month =        feb,
  year =         "2011",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1925109.1925110",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Feb 28 16:17:43 MST 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "IP networks today require massive effort to configure
                 and manage. Ethernet is vastly simpler to manage, but
                 does not scale beyond small local area networks. This
                 article describes an alternative network architecture
                 called SEATTLE that achieves the best of both worlds:
                 The scalability of IP combined with the simplicity of
                 Ethernet. SEATTLE provides plug-and-play functionality
                 via flat addressing, while ensuring scalability and
                 efficiency through shortest-path routing and hash-based
                 resolution of host information. In contrast to previous
                 work on identity-based routing, SEATTLE ensures path
                 predictability, controllability, and stability, thus
                 simplifying key network-management operations, such as
                 capacity planning, traffic engineering, and
                 troubleshooting. We performed a simulation study driven
                 by real-world traffic traces and network topologies,
                 and used Emulab to evaluate a prototype of our design
                 based on the Click and XORP open-source routing
                 platforms.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}
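
%%% A minimal sketch of the hash-based host-information resolution the
%%% abstract mentions, assuming plain consistent hashing: each host
%%% address is hashed onto a ring of switch identifiers, and the next
%%% switch clockwise stores and answers lookups for that host's
%%% location; SEATTLE's actual resolution runs over its link-state
%%% switch topology.
%%%
%%%    import bisect
%%%    import hashlib
%%%
%%%    def _h(key):
%%%        return int(hashlib.sha1(key.encode()).hexdigest(), 16)
%%%
%%%    class ResolverRing:
%%%        """Map a host (e.g., a MAC address) to the switch that
%%%        stores its current location."""
%%%        def __init__(self, switch_ids):
%%%            self._ring = sorted((_h(s), s) for s in switch_ids)
%%%        def resolver_for(self, host_mac):
%%%            i = bisect.bisect(self._ring, (_h(host_mac), ""))
%%%            return self._ring[i % len(self._ring)][1]
%%%
%%%    ring = ResolverRing(["sw-1", "sw-2", "sw-3", "sw-4"])
%%%    print(ring.resolver_for("00:1a:2b:3c:4d:5e"))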

@Article{Lagar-Cavilla:2011:SVM,
  author =       "H. Andr{\'e}s Lagar-Cavilla and Joseph A. Whitney and
                 Roy Bryant and Philip Patchin and Michael Brudno and
                 Eyal de Lara and Stephen M. Rumble and M.
                 Satyanarayanan and Adin Scannell",
  title =        "{SnowFlock}: Virtual Machine Cloning as a First-Class
                 Cloud Primitive",
  journal =      j-TOCS,
  volume =       "29",
  number =       "1",
  pages =        "2:1--2:45",
  month =        feb,
  year =         "2011",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1925109.1925111",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Feb 28 16:17:43 MST 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "A basic building block of cloud computing is
                 virtualization. Virtual machines (VMs) encapsulate a
                 user's computing environment and efficiently isolate it
                 from that of other users. VMs, however, are large
                 entities, and no clear APIs exist yet to provide users
                  with programmatic, fine-grained control on short time
                 scales. We present SnowFlock, a paradigm and system for
                 cloud computing that introduces VM cloning as a
                 first-class cloud abstraction. VM cloning exploits the
                 well-understood and effective semantics of UNIX fork.
                 We demonstrate multiple usage models of VM cloning:
                 users can incorporate the primitive in their code, can
                 wrap around existing toolchains via scripting, can
                 encapsulate the API within a parallel programming
                 framework, or can use it to load-balance and self-scale
                 clustered servers.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Meisner:2011:PSA,
  author =       "David Meisner and Brian T. Gold and Thomas F.
                 Wenisch",
  title =        "The {PowerNap} Server Architecture",
  journal =      j-TOCS,
  volume =       "29",
  number =       "1",
  pages =        "3:1--3:24",
  month =        feb,
  year =         "2011",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1925109.1925112",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Feb 28 16:17:43 MST 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Data center power consumption is growing to
                 unprecedented levels: the EPA estimates U.S. data
                 centers will consume 100 billion kilowatt hours
                 annually by 2011. Much of this energy is wasted in idle
                 systems: in typical deployments, server utilization is
                 below 30\%, but idle servers still consume 60\% of
                 their peak power draw. Typical idle periods---though
                 frequent---last seconds or less, confounding simple
                 energy-conservation approaches. In this article, we
                 propose PowerNap, an energy-conservation approach where
                 the entire system transitions rapidly between a
                 high-performance active state and a near-zero-power
                 idle state in response to instantaneous load. Rather
                 than requiring fine-grained power-performance states
                 and complex load-proportional operation from individual
                 system components, PowerNap instead calls for
                 minimizing idle power and transition time, which are
                 simpler optimization goals.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}
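
%%% A minimal sketch of the energy argument in the abstract, with
%%% assumed numbers: a server drawing 300 W at peak and 180 W idle
%%% (60% of peak) at 30% utilization, versus the same server napping
%%% at a near-zero 10 W when idle; transition costs are ignored, which
%%% is what fast transitions are meant to justify.
%%%
%%%    def average_power(p_active, p_idle, utilization):
%%%        return utilization * p_active + (1 - utilization) * p_idle
%%%
%%%    u = 0.30                                  # typical utilization
%%%    conventional = average_power(300.0, 180.0, u)
%%%    powernap = average_power(300.0, 10.0, u)
%%%    print(conventional, powernap, 1 - powernap / conventional)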

@Article{Gupta:2011:DTD,
  author =       "Diwaker Gupta and Kashi Venkatesh Vishwanath and
                 Marvin McNett and Amin Vahdat and Ken Yocum and Alex
                 Snoeren and Geoffrey M. Voelker",
  title =        "{DieCast}: Testing Distributed Systems with an
                 Accurate Scale Model",
  journal =      j-TOCS,
  volume =       "29",
  number =       "2",
  pages =        "4:1--4:??",
  month =        may,
  year =         "2011",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1963559.1963560",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon May 9 16:05:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Large-scale network services can consist of tens of
                 thousands of machines running thousands of unique
                 software configurations spread across hundreds of
                 physical networks. Testing such services for complex
                 performance problems and configuration errors remains a
                 difficult problem. Existing testing techniques, such as
                 simulation or running smaller instances of a service,
                 have limitations in predicting overall service behavior
                 at such scales. Testing large services should ideally
                 be done at the same scale and configuration as the
                 target deployment, which can be technically and
                 economically infeasible. We present DieCast, an
                 approach to scaling network services in which we
                 multiplex all of the nodes in a given service
                 configuration as virtual machines across a much smaller
                 number of physical machines in a test harness.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Yadgar:2011:MMM,
  author =       "Gala Yadgar and Michael Factor and Kai Li and Assaf
                 Schuster",
  title =        "Management of Multilevel, Multiclient Cache
                 Hierarchies with Application Hints",
  journal =      j-TOCS,
  volume =       "29",
  number =       "2",
  pages =        "5:1--5:??",
  month =        may,
  year =         "2011",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1963559.1963561",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon May 9 16:05:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Multilevel caching, common in many storage
                 configurations, introduces new challenges to
                 traditional cache management: data must be kept in the
                 appropriate cache and replication avoided across the
                 various cache levels. Additional challenges are
                 introduced when the lower levels of the hierarchy are
                 shared by multiple clients. Sharing can have both
                 positive and negative effects. While data fetched by
                 one client can be used by another client without
                 incurring additional delays, clients competing for
                 cache buffers can evict each other's blocks and
                 interfere with exclusive caching schemes. We present a
                 global noncentralized, dynamic and informed management
                 policy for multiple levels of cache, accessed by
                 multiple clients.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{deBruijn:2011:ATS,
  author =       "Willem de Bruijn and Herbert Bos and Henri Bal",
  title =        "Application-Tailored {I/O} with {Streamline}",
  journal =      j-TOCS,
  volume =       "29",
  number =       "2",
  pages =        "6:1--6:??",
  month =        may,
  year =         "2011",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/1963559.1963562",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon May 9 16:05:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Streamline is a stream-based OS communication
                 subsystem that spans from peripheral hardware to
                 userspace processes. It improves performance of
                 I/O-bound applications (such as webservers and
                 streaming media applications) by constructing
                 tailor-made I/O paths through the operating system for
                 each application at runtime. Path optimization removes
                 unnecessary copying, context switching and cache
                 replacement and integrates specialized hardware.
                 Streamline automates optimization and only presents
                 users a clear, concise job control language based on
                  Unix pipelines. For backward compatibility, Streamline
                  also presents the well-known file, pipe, and socket
                  abstractions.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Ayari:2011:DPR,
  author =       "Brahim Ayari and Abdelmajid Khelil and Neeraj Suri",
  title =        "On the design of perturbation-resilient atomic commit
                 protocols for mobile transactions",
  journal =      j-TOCS,
  volume =       "29",
  number =       "3",
  pages =        "7:1--7:??",
  month =        aug,
  year =         "2011",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2003690.2003691",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Aug 24 18:08:12 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Distributed mobile transactions utilize commit
                 protocols to achieve atomicity and consistent
                 decisions. This is challenging, as mobile environments
                 are typically characterized by frequent perturbations
                 such as network disconnections and node failures. On
                 one hand environmental constraints on mobile
                 participants and wireless links may increase the
                 resource blocking time of fixed participants. On the
                 other hand frequent node and link failures complicate
                 the design of atomic commit protocols by increasing
                 both the transaction abort rate and resource blocking
                 time. Hence, the deployment of classical commit
                 protocols (such as two-phase commit) does not
                 reasonably extend to distributed infrastructure-based
                  mobile environments, driving the need for
                 perturbation-resilient commit protocols.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Kalibera:2011:SRT,
  author =       "Tomas Kalibera and Filip Pizlo and Antony L. Hosking
                 and Jan Vitek",
  title =        "Scheduling real-time garbage collection on
                 uniprocessors",
  journal =      j-TOCS,
  volume =       "29",
  number =       "3",
  pages =        "8:1--8:??",
  month =        aug,
  year =         "2011",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2003690.2003692",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Aug 24 18:08:12 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Managed languages such as Java and C\# are
                 increasingly being considered for hard real-time
                 applications because of their productivity and software
                 engineering advantages. Automatic memory management, or
                 garbage collection, is a key enabler for robust,
                 reusable libraries, yet remains a challenge for
                 analysis and implementation of real-time execution
                 environments. This article comprehensively compares
                 leading approaches to hard real-time garbage
                 collection. There are many design decisions involved in
                 selecting a real-time garbage collection algorithm. For
                 time-based garbage collectors on uniprocessors one must
                 choose whether to use periodic, slack-based or hybrid
                 scheduling. A significant impediment to valid
                 experimental comparison of such choices is that
                 commercial implementations use completely different
                 proprietary infrastructures.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Reddi:2011:MPE,
  author =       "Vijay Janapa Reddi and Benjamin C. Lee and Trishul
                 Chilimbi and Kushagra Vaid",
  title =        "Mobile processors for energy-efficient web search",
  journal =      j-TOCS,
  volume =       "29",
  number =       "3",
  pages =        "9:1--9:??",
  month =        aug,
  year =         "2011",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2003690.2003693",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Aug 24 18:08:12 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "As cloud and utility computing spreads, computer
                 architects must ensure continued capability growth for
                 the data centers that comprise the cloud. Given
                  megawatt-scale power budgets, increasing data center
                 capability requires increasing computing hardware
                 energy efficiency. To increase the data center's
                 capability for work, the work done per Joule must
                 increase. We pursue this efficiency even as the nature
                 of data center applications evolves. Unlike traditional
                 enterprise workloads, which are typically memory or I/O
                 bound, big data computation and analytics exhibit
                 greater compute intensity. This article examines the
                 efficiency of mobile processors as a means for data
                 center capability. In particular, we compare and
                 contrast the performance and efficiency of the
                 Microsoft Bing search engine executing on the
                 mobile-class Atom processor and the server-class Xeon
                 processor.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Srivatsa:2011:ESA,
  author =       "Mudhakar Srivatsa and Ling Liu and Arun Iyengar",
  title =        "{EventGuard}: a System Architecture for Securing
                 Publish--Subscribe Networks",
  journal =      j-TOCS,
  volume =       "29",
  number =       "4",
  pages =        "10:1--10:??",
  month =        dec,
  year =         "2011",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2063509.2063510",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Dec 30 17:52:02 MST 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Publish-subscribe (pub-sub) is an emerging paradigm
                 for building a large number of distributed systems. A
                 wide area pub-sub system is usually implemented on an
                 overlay network infrastructure to enable information
                 dissemination from publishers to subscribers. Using an
                  open overlay network raises several security concerns
                  such as confidentiality and integrity, authentication,
                  authorization, and Denial-of-Service (DoS) attacks. In
                  this article we present EventGuard, a framework for
                  building secure wide-area pub-sub systems. The
                  EventGuard architecture comprises three key
                  components: (1) a suite of security guards that can be
                  seamlessly plugged into a content-based pub-sub system,
                 (2) a scalable key management algorithm to enforce
                 access control on subscribers, and (3) a resilient
                 pub-sub network design that is capable of scalable
                 routing, handling message dropping-based DoS attacks,
                 and node failures.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Marinescu:2011:ETR,
  author =       "Paul D. Marinescu and George Candea",
  title =        "Efficient Testing of Recovery Code Using Fault
                 Injection",
  journal =      j-TOCS,
  volume =       "29",
  number =       "4",
  pages =        "11:1--11:??",
  month =        dec,
  year =         "2011",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2063509.2063511",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Dec 30 17:52:02 MST 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "A critical part of developing a reliable software
                 system is testing its recovery code. This code is
                 traditionally difficult to test in the lab, and, in the
                 field, it rarely gets to run; yet, when it does run, it
                 must execute flawlessly in order to recover the system
                 from failure. In this article, we present a
                 library-level fault injection engine that enables the
                 productive use of fault injection for software testing.
                 We describe automated techniques for reliably
                 identifying errors that applications may encounter when
                 interacting with their environment, for automatically
                 identifying high-value injection targets in program
                 binaries, and for producing efficient injection test
                 scenarios.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Mahajan:2011:DCS,
  author =       "Prince Mahajan and Srinath Setty and Sangmin Lee and
                 Allen Clement and Lorenzo Alvisi and Mike Dahlin and
                 Michael Walfish",
  title =        "{Depot}: Cloud Storage with Minimal Trust",
  journal =      j-TOCS,
  volume =       "29",
  number =       "4",
  pages =        "12:1--12:??",
  month =        dec,
  year =         "2011",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2063509.2063512",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Dec 30 17:52:02 MST 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "This article describes the design, implementation, and
                 evaluation of Depot, a cloud storage system that
                 minimizes trust assumptions. Depot tolerates buggy or
                 malicious behavior by any number of clients or servers,
                 yet it provides safety and liveness guarantees to
                 correct clients. Depot provides these guarantees using
                 a two-layer architecture. First, Depot ensures that the
                 updates observed by correct nodes are consistently
                 ordered under Fork-Join-Causal consistency (FJC). FJC
                 is a slight weakening of causal consistency that can be
                 both safe and live despite faulty nodes. Second, Depot
                 implements protocols that use this consistent ordering
                 of updates to provide other desirable consistency,
                 staleness, durability, and recovery properties.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Mowry:2012:ISI,
  author =       "Todd C. Mowry",
  title =        "Introduction to Special Issue {ASPLOS 2011}",
  journal =      j-TOCS,
  volume =       "30",
  number =       "1",
  pages =        "1:1--1:??",
  month =        feb,
  year =         "2012",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2110356.2110357",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Mar 1 16:31:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Chipounov:2012:SPD,
  author =       "Vitaly Chipounov and Volodymyr Kuznetsov and George
                 Candea",
  title =        "The {S2E} Platform: Design, Implementation, and
                 Applications",
  journal =      j-TOCS,
  volume =       "30",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2012",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2110356.2110358",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Mar 1 16:31:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "This article presents S2E, a platform for analyzing
                 the properties and behavior of software systems, along
                 with its use in developing tools for comprehensive
                 performance profiling, reverse engineering of
                 proprietary software, and automated testing of
                 kernel-mode and user-mode binaries. Conceptually, S2E
                 is an automated path explorer with modular path
                 analyzers: the explorer uses a symbolic execution
                 engine to drive the target system down all execution
                 paths of interest, while analyzers measure and/or check
                 properties of each such path. S2E users can either
                 combine existing analyzers to build custom analysis
                 tools, or they can directly use S2E's APIs. S2E's
                 strength is the ability to scale to large systems, such
                 as a full Windows stack, using two new ideas: selective
                 symbolic execution, a way to automatically minimize the
                 amount of code that has to be executed symbolically
                 given a target analysis, and execution consistency
                 models, a way to make principled performance/accuracy
                 trade-offs \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Veeraraghavan:2012:DPS,
  author =       "Kaushik Veeraraghavan and Dongyoon Lee and Benjamin
                 Wester and Jessica Ouyang and Peter M. Chen and Jason
                 Flinn and Satish Narayanasamy",
  title =        "{DoublePlay}: Parallelizing Sequential Logging and
                 Replay",
  journal =      j-TOCS,
  volume =       "30",
  number =       "1",
  pages =        "3:1--3:??",
  month =        feb,
  year =         "2012",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2110356.2110359",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Mar 1 16:31:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Deterministic replay systems record and reproduce the
                 execution of a hardware or software system. In contrast
                 to replaying execution on uniprocessors, deterministic
                 replay on multiprocessors is very challenging to
                 implement efficiently because of the need to reproduce
                 the order of or the values read by shared memory
                 operations performed by multiple threads. In this
                 paper, we present DoublePlay, a new way to efficiently
                 guarantee replay on commodity multiprocessors. Our key
                 insight is that one can use the simpler and faster
                 mechanisms of single-processor record and replay, yet
                 still achieve the scalability offered by multiple
                 cores, by using an additional execution to parallelize
                 the record and replay of an application.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Yuan:2012:ISD,
  author =       "Ding Yuan and Jing Zheng and Soyeon Park and Yuanyuan
                 Zhou and Stefan Savage",
  title =        "Improving Software Diagnosability via Log
                 Enhancement",
  journal =      j-TOCS,
  volume =       "30",
  number =       "1",
  pages =        "4:1--4:??",
  month =        feb,
  year =         "2012",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2110356.2110360",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Mar 1 16:31:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Diagnosing software failures in the field is
                 notoriously difficult, in part due to the fundamental
                 complexity of troubleshooting any complex software
                 system, but further exacerbated by the paucity of
                 information that is typically available in the
                 production setting. Indeed, for reasons of both
                 overhead and privacy, it is common that only the
                 run-time log generated by a system (e.g., syslog) can
                 be shared with the developers. Unfortunately, the
                  ad-hoc nature of such reports is frequently
                  insufficient for detailed failure diagnosis. This paper
                  seeks to improve this situation within the rubric of
                  existing practice. We describe a tool, LogEnhancer, that
                  automatically ``enhances'' existing logging code to aid
                 in future post-failure debugging.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Schupbach:2012:DLA,
  author =       "Adrian Sch{\"u}pbach and Andrew Baumann and Timothy
                 Roscoe and Simon Peter",
  title =        "A Declarative Language Approach to Device
                 Configuration",
  journal =      j-TOCS,
  volume =       "30",
  number =       "1",
  pages =        "5:1--5:??",
  month =        feb,
  year =         "2012",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2110356.2110361",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Mar 1 16:31:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "C remains the language of choice for hardware
                 programming (device drivers, bus configuration, etc.):
                 it is fast, allows low-level access, and is trusted by
                 OS developers. However, the algorithms required to
                 configure and reconfigure hardware devices and
                 interconnects are becoming more complex and diverse,
                 with the added burden of legacy support, ``quirks,''
                 and hardware bugs to work around. Even programming PCI
                 bridges in a modern PC is a surprisingly complex
                 problem, and is getting worse as new functionality such
                 as hotplug appears. Existing approaches use relatively
                 simple algorithms, hard-coded in C and closely coupled
                 with low-level register access code, generally leading
                 to suboptimal configurations.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Saez:2012:LCS,
  author =       "Juan Carlos Saez and Alexandra Fedorova and David
                 Koufaty and Manuel Prieto",
  title =        "Leveraging Core Specialization via {OS} Scheduling to
                 Improve Performance on Asymmetric Multicore Systems",
  journal =      j-TOCS,
  volume =       "30",
  number =       "2",
  pages =        "6:1--6:??",
  month =        apr,
  year =         "2012",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2166879.2166880",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Apr 27 12:10:22 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Asymmetric multicore processors (AMPs) consist of
                 cores with the same ISA (instruction-set architecture),
                 but different microarchitectural features, speed, and
                 power consumption. Because cores with more complex
                 features and higher speed typically use more area and
                 consume more energy relative to simpler and slower
                 cores, we must use these cores for running applications
                 that experience significant performance improvements
                 from using those features. Having cores of different
                 types in a single system allows optimizing the
                 performance/energy trade-off. To deliver this potential
                 to unmodified applications, the OS scheduler must map
                 threads to cores in consideration of the properties of
                 both. Our work describes a Comprehensive scheduler for
                 Asymmetric Multicore Processors (CAMP) that addresses
                 shortcomings of previous asymmetry-aware schedulers.
                  First, previous schedulers catered to only one of the
                  workload properties that are crucial for scheduling on
                  AMPs: either efficiency or thread-level parallelism
                  (TLP), but not both. CAMP overcomes this limitation,
                  showing how using both efficiency and TLP in synergy in
                  a single scheduling algorithm can improve performance.
                  Second, most existing schedulers relying on models for
                  estimating how much faster a thread executes on a
                  ``fast'' vs. ``slow'' core (i.e., the speedup factor)
                 were specifically designed for AMP systems where cores
                 differ only in clock frequency. However, more realistic
                 AMP systems include cores that differ more
                 significantly in their features. To demonstrate the
                 effectiveness of CAMP on more realistic scenarios, we
                 augmented the CAMP scheduler with a model that predicts
                 the speedup factor on a real AMP prototype that closely
                 matches future asymmetric systems.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Ebrahimi:2012:FST,
  author =       "Eiman Ebrahimi and Chang Joo Lee and Onur Mutlu and
                 Yale N. Patt",
  title =        "Fairness via Source Throttling: a Configurable and
                 High-Performance Fairness Substrate for Multicore
                 Memory Systems",
  journal =      j-TOCS,
  volume =       "30",
  number =       "2",
  pages =        "7:1--7:??",
  month =        apr,
  year =         "2012",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2166879.2166881",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Apr 27 12:10:22 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Cores in chip-multiprocessors (CMPs) share multiple
                 memory subsystem resources. If resource sharing is
                 unfair, some applications can be delayed significantly
                 while others are unfairly prioritized. Previous
                 research proposed separate fairness mechanisms for each
                 resource. Such resource-based fairness mechanisms
                 implemented independently in each resource can make
                 contradictory decisions, leading to low fairness and
                 performance loss. Therefore, a coordinated mechanism
                 that provides fairness in the entire shared memory
                 system is desirable. This article proposes a new
                 approach that provides fairness in the entire shared
                 memory system, thereby eliminating the need for and
                 complexity of developing fairness mechanisms for each
                 resource. Our technique, Fairness via Source Throttling
                 (FST), estimates unfairness in the entire memory
                 system. If unfairness is above a system-software-set
                 threshold, FST throttles down cores causing unfairness
                 by limiting the number of requests they create and the
                  frequency at which they do so. As such, our source-based
                 fairness control ensures fairness decisions are made in
                 tandem in the entire memory system. FST enforces thread
                 priorities/weights, and enables system-software to
                 enforce different fairness objectives in the memory
                 system. Our evaluations show that FST provides the best
                 system fairness and performance compared to three
                 systems with state-of-the-art fairness mechanisms
                 implemented in both shared caches and memory
                 controllers.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Gebhart:2012:HTS,
  author =       "Mark Gebhart and Daniel R. Johnson and David Tarjan
                 and Stephen W. Keckler and William J. Dally and Erik
                 Lindholm and Kevin Skadron",
  title =        "A Hierarchical Thread Scheduler and Register File for
                 Energy-Efficient Throughput Processors",
  journal =      j-TOCS,
  volume =       "30",
  number =       "2",
  pages =        "8:1--8:??",
  month =        apr,
  year =         "2012",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2166879.2166882",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Apr 27 12:10:22 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Modern graphics processing units (GPUs) employ a large
                 number of hardware threads to hide both function unit
                 and memory access latency. Extreme multithreading
                 requires a complex thread scheduler as well as a large
                 register file, which is expensive to access both in
                 terms of energy and latency. We present two
                 complementary techniques for reducing energy on
                 massively-threaded processors such as GPUs. First, we
                 investigate a two-level thread scheduler that maintains
                 a small set of active threads to hide ALU and local
                 memory access latency and a larger set of pending
                 threads to hide main memory latency. Reducing the
                 number of threads that the scheduler must consider each
                 cycle improves the scheduler's energy efficiency.
                 Second, we propose replacing the monolithic register
                 file found on modern designs with a hierarchical
                 register file. We explore various trade-offs for the
                 hierarchy including the number of levels in the
                 hierarchy and the number of entries at each level. We
                 consider both a hardware-managed caching scheme and a
                 software-managed scheme, where the compiler is
                 responsible for orchestrating all data movement within
                 the register file hierarchy. Combined with a
                 hierarchical register file, our two-level thread
                 scheduler provides a further reduction in energy by
                 only allocating entries in the upper levels of the
                 register file hierarchy for active threads. Averaging
                 across a variety of real world graphics and compute
                 workloads, the active thread count can be reduced by a
                 factor of 4 with minimal impact on performance and our
                 most efficient three-level software-managed register
                 file hierarchy reduces register file energy by 54\%.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Dall:2012:DIE,
  author =       "Christoffer Dall and Jeremy Andrus and Alexander Van't
                 Hof and Oren Laadan and Jason Nieh",
  title =        "The Design, Implementation, and Evaluation of {Cells}:
                  a Virtual {Smartphone} Architecture",
  journal =      j-TOCS,
  volume =       "30",
  number =       "3",
  pages =        "9:1--9:??",
  month =        aug,
  year =         "2012",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2324876.2324877",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Aug 20 16:33:58 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Smartphones are increasingly ubiquitous, and many
                 users carry multiple phones to accommodate work,
                 personal, and geographic mobility needs. We present
                 Cells, a virtualization architecture for enabling
                 multiple virtual smartphones to run simultaneously on
                 the same physical cellphone in an isolated, secure
                 manner. Cells introduces a usage model of having one
                 foreground virtual phone and multiple background
                 virtual phones. This model enables a new device
                 namespace mechanism and novel device proxies that
                 integrate with lightweight operating system
                 virtualization to multiplex phone hardware across
                 multiple virtual phones while providing native hardware
                 device performance. Cells virtual phone features
                 include fully accelerated 3D graphics, complete power
                 management features, and full telephony functionality
                 with separately assignable telephone numbers and caller
                 ID support. We have implemented a prototype of Cells
                 that supports multiple Android virtual phones on the
                 same phone. Our performance results demonstrate that
                 Cells imposes only modest runtime and memory overhead,
                 works seamlessly across multiple hardware devices
                 including Google Nexus 1 and Nexus S phones, and
                 transparently runs Android applications at native speed
                 without any modifications.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Harter:2012:FFU,
  author =       "Tyler Harter and Chris Dragga and Michael Vaughn and
                 Andrea C. Arpaci-Dusseau and Remzi H. Arpaci-Dusseau",
  title =        "A File Is Not a File: Understanding the {I/O} Behavior
                 of {Apple} Desktop Applications",
  journal =      j-TOCS,
  volume =       "30",
  number =       "3",
  pages =        "10:1--10:??",
  month =        aug,
  year =         "2012",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2324876.2324878",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Aug 20 16:33:58 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "We analyze the I/O behavior of iBench, a new
                 collection of productivity and multimedia application
                 workloads. Our analysis reveals a number of differences
                 between iBench and typical file-system workload
                 studies, including the complex organization of modern
                 files, the lack of pure sequential access, the
                 influence of underlying frameworks on I/O patterns, the
                 widespread use of file synchronization and atomic
                 operations, and the prevalence of threads. Our results
                 have strong ramifications for the design of next
                 generation local and cloud-based storage systems.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Esmaeilzadeh:2012:PLD,
  author =       "Hadi Esmaeilzadeh and Emily Blem and Ren{\'e}e {St.
                 Amant} and Karthikeyan Sankaralingam and Doug Burger",
  title =        "Power Limitations and Dark Silicon Challenge the
                 Future of Multicore",
  journal =      j-TOCS,
  volume =       "30",
  number =       "3",
  pages =        "11:1--11:??",
  month =        aug,
  year =         "2012",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2324876.2324879",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Aug 20 16:33:58 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Since 2004, processor designers have increased core
                 counts to exploit Moore's Law scaling, rather than
                 focusing on single-core performance. The failure of
                 Dennard scaling, to which the shift to multicore parts
                 is partially a response, may soon limit multicore
                 scaling just as single-core scaling has been curtailed.
                 This paper models multicore scaling limits by combining
                 device scaling, single-core scaling, and multicore
                 scaling to measure the speedup potential for a set of
                 parallel workloads for the next five technology
                 generations. For device scaling, we use both the ITRS
                 projections and a set of more conservative device
                 scaling parameters. To model single-core scaling, we
                 combine measurements from over 150 processors to derive
                 Pareto-optimal frontiers for area/performance and
                 power/performance. Finally, to model multicore scaling,
                 we build a detailed performance model of upper-bound
                 performance and lower-bound core power. The multicore
                 designs we study include single-threaded CPU-like and
                 massively threaded GPU-like multicore chip
                 organizations with symmetric, asymmetric, dynamic, and
                 composed topologies. The study shows that regardless of
                 chip organization and topology, multicore scaling is
                 power limited to a degree not widely appreciated by the
                 computing community. Even at 22 nm (just one year from
                 now), 21\% of a fixed-size chip must be powered off,
                 and at 8 nm, this number grows to more than 50\%.
                 Through 2024, only 7.9$\times$ average speedup is
                 possible across commonly used parallel workloads for
                 the topologies we study, leaving a nearly 24-fold gap
                 from a target of doubled performance per generation.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Bugnion:2012:BVX,
  author =       "Edouard Bugnion and Scott Devine and Mendel Rosenblum
                 and Jeremy Sugerman and Edward Y. Wang",
  title =        "Bringing Virtualization to the x86 Architecture with
                 the Original {VMware} Workstation",
  journal =      j-TOCS,
  volume =       "30",
  number =       "4",
  pages =        "12:1--12:51",
  month =        nov,
  year =         "2012",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2382553.2382554",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 29 19:34:49 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "This article describes the historical context,
                 technical challenges, and main implementation
                 techniques used by VMware Workstation to bring
                 virtualization to the x86 architecture in 1999.
                 Although virtual machine monitors (VMMs) had been
                 around for decades, they were traditionally designed as
                 part of monolithic, single-vendor architectures with
                 explicit support for virtualization. In contrast, the
                 x86 architecture lacked virtualization support, and the
                 industry around it had disaggregated into an ecosystem,
                 with different vendors controlling the computers, CPUs,
                 peripherals, operating systems, and applications, none
                 of them asking for virtualization. We chose to build
                 our solution independently of these vendors. As a
                 result, VMware Workstation had to deal with new
                 challenges associated with (i) the lack of
                 virtualization support in the x86 architecture, (ii)
                 the daunting complexity of the architecture itself,
                 (iii) the need to support a broad combination of
                 peripherals, and (iv) the need to offer a simple user
                 experience within existing environments. These new
                 challenges led us to a novel combination of well-known
                 virtualization techniques, techniques from other
                 domains, and new techniques. VMware Workstation
                 combined a hosted architecture with a VMM. The hosted
                 architecture enabled a simple user experience and
                 offered broad hardware compatibility. Rather than
                 exposing I/O diversity to the virtual machines, VMware
                 Workstation also relied on software emulation of I/O
                 devices. The VMM combined a trap-and-emulate direct
                 execution engine with a system-level dynamic binary
                 translator to efficiently virtualize the x86
                 architecture and support most commodity operating
                 systems. By relying on x86 hardware segmentation as a
                 protection mechanism, the binary translator could
                 execute translated code at near hardware speeds. The
                 binary translator also relied on partial evaluation and
                 adaptive retranslation to reduce the overall overheads
                 of virtualization. Written with the benefit of
                 hindsight, this article shares the key lessons we
                 learned from building the original system and from its
                 later evolution.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Erlingsson:2012:FED,
  author =       "{\'U}lfar Erlingsson and Marcus Peinado and Simon
                 Peter and Mihai Budiu and Gloria Mainar-Ruiz",
  title =        "{Fay}: Extensible Distributed Tracing from Kernels to
                 Clusters",
  journal =      j-TOCS,
  volume =       "30",
  number =       "4",
  pages =        "13:1--13:??",
  month =        nov,
  year =         "2012",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2382553.2382555",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 29 19:34:49 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Fay is a flexible platform for the efficient
                 collection, processing, and analysis of software
                 execution traces. Fay provides dynamic tracing through
                 use of runtime instrumentation and distributed
                 aggregation within machines and across clusters. At the
                 lowest level, Fay can be safely extended with new
                 tracing primitives, including even untrusted, fully
                 optimized machine code, and Fay can be applied to
                 running user-mode or kernel-mode software without
                 compromising system stability. At the highest level,
                 Fay provides a unified, declarative means of specifying
                 what events to trace, as well as the aggregation,
                 processing, and analysis of those events. We have
                 implemented the Fay tracing platform for Windows and
                 integrated it with two powerful, expressive systems for
                 distributed programming. Our implementation is easy to
                 use, can be applied to unmodified production systems,
                 and provides primitives that allow the overhead of
                 tracing to be greatly reduced, compared to previous
                 dynamic tracing platforms. To show the generality of
                 Fay tracing, we reimplement, in experiments, a range of
                 tracing strategies and several custom mechanisms from
                 existing tracing frameworks. Fay shows that modern
                 techniques for high-level querying and data-parallel
                  processing of disaggregated data streams are well
                 suited to comprehensive monitoring of software
                 execution in distributed systems. Revisiting a lesson
                 from the late 1960s [Deutsch and Grant 1971], Fay also
                 demonstrates the efficiency and extensibility benefits
                 of using safe, statically verified machine code as the
                 basis for low-level execution tracing. Finally, Fay
                 establishes that, by automatically deriving optimized
                 query plans and code for safe extensions, the
                 expressiveness and performance of high-level tracing
                 queries can equal or even surpass that of specialized
                 monitoring tools.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Gandhi:2012:ADR,
  author =       "Anshul Gandhi and Mor Harchol-Balter and Ram
                 Raghunathan and Michael A. Kozuch",
  title =        "{AutoScale}: Dynamic, Robust Capacity Management for
                 Multi-Tier Data Centers",
  journal =      j-TOCS,
  volume =       "30",
  number =       "4",
  pages =        "14:1--14:??",
  month =        nov,
  year =         "2012",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2382553.2382556",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 29 19:34:49 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Energy costs for data centers continue to rise,
                  already exceeding \$15 billion yearly. Sadly, much of
                 this power is wasted. Servers are only busy 10--30\% of
                 the time on average, but they are often left on, while
                 idle, utilizing 60\% or more of peak power when in the
                 idle state. We introduce a dynamic capacity management
                 policy, AutoScale, that greatly reduces the number of
                 servers needed in data centers driven by unpredictable,
                 time-varying load, while meeting response time SLAs.
                 AutoScale scales the data center capacity, adding or
                 removing servers as needed. AutoScale has two key
                 features: (i) it autonomically maintains just the right
                 amount of spare capacity to handle bursts in the
                 request rate; and (ii) it is robust not just to changes
                 in the request rate of real-world traces, but also
                 request size and server efficiency. We evaluate our
                 dynamic capacity management approach via implementation
                 on a 38-server multi-tier data center, serving a web
                 site of the type seen in Facebook or Amazon, with a
                 key-value store workload. We demonstrate that AutoScale
                 vastly improves upon existing dynamic capacity
                 management policies with respect to meeting SLAs and
                 robustness.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Ferdman:2012:QMB,
  author =       "Michael Ferdman and Almutaz Adileh and Onur Kocberber
                 and Stavros Volos and Mohammad Alisafaee and Djordje
                 Jevdjic and Cansu Kaynak and Adrian Daniel Popescu and
                 Anastasia Ailamaki and Babak Falsafi",
  title =        "Quantifying the Mismatch between Emerging Scale-Out
                 Applications and Modern Processors",
  journal =      j-TOCS,
  volume =       "30",
  number =       "4",
  pages =        "15:1--15:??",
  month =        nov,
  year =         "2012",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2382553.2382557",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Nov 29 19:34:49 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Emerging scale-out workloads require extensive amounts
                 of computational resources. However, data centers using
                 modern server hardware face physical constraints in
                 space and power, limiting further expansion and calling
                 for improvements in the computational density per
                 server and in the per-operation energy. Continuing to
                 improve the computational resources of the cloud while
                 staying within physical constraints mandates optimizing
                 server efficiency to ensure that server hardware
                 closely matches the needs of scale-out workloads. In
                 this work, we introduce CloudSuite, a benchmark suite
                 of emerging scale-out workloads. We use performance
                 counters on modern servers to study scale-out
                 workloads, finding that today's predominant processor
                 microarchitecture is inefficient for running these
                 workloads. We find that inefficiency comes from the
                 mismatch between the workload needs and modern
                 processors, particularly in the organization of
                 instruction and data memory systems and the processor
                 core microarchitecture. Moreover, while today's
                 predominant microarchitecture is inefficient when
                 executing scale-out workloads, we find that continuing
                 the current trends will further exacerbate the
                 inefficiency in the future. In this work, we identify
                 the key microarchitectural needs of scale-out
                 workloads, calling for a change in the trajectory of
                 server processors that would lead to improved
                 computational density and power efficiency in data
                 centers.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Wu:2013:ERD,
  author =       "Meng-Ju Wu and Donald Yeung",
  title =        "Efficient Reuse Distance Analysis of Multicore Scaling
                 for Loop-Based Parallel Programs",
  journal =      j-TOCS,
  volume =       "31",
  number =       "1",
  pages =        "1:1--1:??",
  month =        feb,
  year =         "2013",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2427631.2427632",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Feb 23 06:37:57 MST 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Reuse Distance (RD) analysis is a powerful memory
                 analysis tool that can potentially help architects
                 study multicore processor scaling. One key obstacle,
                 however, is that multicore RD analysis requires
                 measuring Concurrent Reuse Distance (CRD) and
                 Private-LRU-stack Reuse Distance (PRD) profiles across
                 thread-interleaved memory reference streams.
                 Sensitivity to memory interleaving makes CRD and PRD
                 profiles architecture dependent, preventing them from
                 analyzing different processor configurations. For
                 loop-based parallel programs, CRD and PRD profiles
                 shift coherently across RD values with core count
                 scaling because interleaving threads are symmetric.
                 Simple techniques can predict such shifting, making the
                 analysis of numerous multicore configurations from a
                 small set of CRD and PRD profiles feasible. Given the
                 ubiquity of parallel loops, such techniques will be
                 extremely valuable for studying future large multicore
                 designs. This article investigates using RD analysis to
                 efficiently analyze multicore cache performance for
                 loop-based parallel programs, making several
                 contributions. First, we provide an in-depth analysis
                 on how CRD and PRD profiles change with core count
                 scaling. Second, we develop techniques to predict CRD
                 and PRD profile scaling, in particular employing
                 reference groups [Zhong et al. 2003] to predict
                 coherent shift, demonstrating 90\% or greater
                 prediction accuracy. Third, our CRD and PRD profile
                 analyses define two application parameters with
                 architectural implications: C$_{core}$ is the minimum
                 shared cache capacity that ``contains'' locality
                 degradation due to core count scaling, and C$_{share}$
                 is the capacity at which shared caches begin to provide
                 a cache-miss reduction compared to private caches. And
                 fourth, we apply CRD and PRD profiles to analyze
                 multicore cache performance. When combined with
                 existing problem scaling prediction, our techniques can
                 predict shared LLC MPKI (private L2 cache MPKI) to
                  within 10.7\% (13.9\%) of simulation across 1,728
                 (1,440) configurations using only 36 measured CRD (PRD)
                 profiles.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Govindan:2013:ADP,
  author =       "Sriram Govindan and Di Wang and Anand Sivasubramaniam
                 and Bhuvan Urgaonkar",
  title =        "Aggressive Datacenter Power Provisioning with
                 Batteries",
  journal =      j-TOCS,
  volume =       "31",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2013",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2427631.2427633",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Feb 23 06:37:57 MST 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Datacenters spend \$10--\$25 per watt in
                 provisioning their power infrastructure, regardless of
                 the watts actually consumed. Since peak power needs
                 arise rarely, provisioning power infrastructure for
                 them can be expensive. One can, thus, aggressively
                 underprovision infrastructure assuming that
                 simultaneous peak draw across all equipment will happen
                 rarely. The resulting nonzero probability of emergency
                 events where power needs exceed provisioned capacity,
                 however small, mandates graceful reaction mechanisms to
                 cap the power draw instead of leaving it to disruptive
                 circuit breakers/fuses. Existing strategies for power
                 capping use temporal knobs local to a server that
                 throttle the rate of execution (using power modes),
                 and/or spatial knobs that redirect/migrate excess load
                 to regions of the datacenter with more power headroom.
                 We show these mechanisms to have performance degrading
                 ramifications, and propose an entirely orthogonal
                 solution that leverages existing UPS batteries to
                 temporarily augment the utility supply during
                  emergencies. We build an experimental prototype to
                 demonstrate such power capping on a cluster of 8
                 servers, each with an individual battery, and implement
                 several online heuristics in the context of different
                 datacenter workloads to evaluate their effectiveness in
                 handling power emergencies. We show that our
                 battery-based solution can: (i) handle emergencies of
                 short durations on its own, (ii) supplement existing
                 reaction mechanisms to enhance their efficacy for
                 longer emergencies, and (iii) create more slack for
                 shifting applications temporarily to nonpeak
                 durations.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Rasmussen:2013:TBE,
  author =       "Alexander Rasmussen and George Porter and Michael
                 Conley and Harsha V. Madhyastha and Radhika Niranjan
                 Mysore and Alexander Pucher and Amin Vahdat",
  title =        "{TritonSort}: a Balanced and Energy-Efficient
                 Large-Scale Sorting System",
  journal =      j-TOCS,
  volume =       "31",
  number =       "1",
  pages =        "3:1--3:??",
  month =        feb,
  year =         "2013",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2427631.2427634",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Feb 23 06:37:57 MST 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "We present TritonSort, a highly efficient, scalable
                 sorting system. It is designed to process large
                 datasets, and has been evaluated against as much as
                 100TB of input data spread across 832 disks in 52 nodes
                 at a rate of 0.938TB/min. When evaluated against the
                 annual Indy GraySort sorting benchmark, TritonSort is
                 66\% better in absolute performance and has over six
                 times the per-node throughput of the previous record
                 holder. When evaluated against the 100TB Indy JouleSort
                 benchmark, TritonSort sorted 9703 records/Joule. In
                 this article, we describe the hardware and software
                 architecture necessary to operate TritonSort at this
                 level of efficiency. Through careful management of
                 system resources to ensure cross-resource balance, we
                 are able to sort data at approximately 80\% of the
                 disks' aggregate sequential write speed. We believe the
                 work holds a number of lessons for balanced system
                 design and for scale-out architectures in general.
                 While many interesting systems are able to scale
                 linearly with additional servers, per-server
                 performance can lag behind per-server capacity by more
                 than an order of magnitude. Bridging the gap between
                 high scalability and high performance would enable
                 either significantly less expensive systems that are
                 able to do the same work or provide the ability to
                 address significantly larger problem sets with the same
                 infrastructure.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Jayaram:2013:PCB,
  author =       "K. R. Jayaram and Patrick Eugster and Chamikara
                 Jayalath",
  title =        "Parametric Content-Based Publish\slash Subscribe",
  journal =      j-TOCS,
  volume =       "31",
  number =       "2",
  pages =        "4:1--4:??",
  month =        may,
  year =         "2013",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2465346.2465347",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Jun 1 11:24:04 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Content-based publish/subscribe (CPS) is an appealing
                 abstraction for building scalable distributed systems,
                 e.g., message boards, intrusion detectors, or
                 algorithmic stock trading platforms. Recently, CPS
                 extensions have been proposed for location-based
                 services like vehicular networks, mobile social
                 networking, and so on. Although current CPS middleware
                 systems are dynamic in the way they support the joining
                 and leaving of publishers and subscribers, they fall
                 short in supporting subscription adaptations. These are
                 becoming increasingly important across many CPS
                 applications. In algorithmic high frequency trading,
                 for instance, stock price thresholds that are of
                 interest to a trader change rapidly, and gains directly
                 hinge on the reaction time to relevant fluctuations
                 rather than fixed values. In location-aware
                 applications, a subscription is a function of the
                 subscriber location (e.g. GPS coordinates), which
                 inherently changes during motion. The common solution
                 for adapting a subscription consists of a
                 resubscription, where a new subscription is issued and
                 the superseded one canceled. This incurs substantial
                 overhead in CPS middleware systems, and leads to missed
                 or duplicated events during the transition. In this
                 article, we explore the concept of parametric
                 subscriptions for capturing subscription adaptations.
                 We discuss desirable and feasible guarantees for
                 corresponding support, and propose novel algorithms for
                 updating routing mechanisms effectively and efficiently
                 in classic decentralized CPS broker overlay networks.
                 Compared to resubscriptions, our algorithms
                 significantly improve the reaction time to subscription
                 updates without hampering throughput or latency under
                 high update rates. We also propose and evaluate
                 approximation techniques to detect and mitigate
                 pathological cases of high frequency subscription
                 oscillations, which could significantly decrease the
                 throughput of CPS systems thereby affecting other
                 subscribers. We analyze the benefits of our support
                 through implementations of our algorithms in two CPS
                 systems, and by evaluating our algorithms on two
                 different application scenarios.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Smaldone:2013:OSP,
  author =       "Stephen Smaldone and Benjamin Gilbert and Jan Harkes
                 and Liviu Iftode and Mahadev Satyanarayanan",
  title =        "Optimizing Storage Performance for {VM}-Based Mobile
                 Computing",
  journal =      j-TOCS,
  volume =       "31",
  number =       "2",
  pages =        "5:1--5:??",
  month =        may,
  year =         "2013",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2465346.2465348",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Jun 1 11:24:04 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "This article investigates the transient use of free
                 local storage for improving performance in VM-based
                 mobile computing systems implemented as thick clients
                 on host PCs. We use the term TransientPC systems to
                 refer to these types of systems. The solution we
                 propose, called TransPart, uses the higher-performing
                 local storage of host hardware to speed up
                 performance-critical operations. Our solution
                 constructs a virtual storage device on demand (which we
                 call transient storage) by borrowing free disk blocks
                 from the host's storage. In this article, we present
                 the design, implementation, and evaluation of a
                 TransPart prototype, which requires no modifications to
                 the software or hardware of a host computer.
                 Experimental results confirm that TransPart offers low
                 overhead and startup cost, while improving user
                 experience.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Lee:2013:ETB,
  author =       "Yunsup Lee and Rimas Avizienis and Alex Bishara and
                 Richard Xia and Derek Lockhart and Christopher Batten
                 and Krste Asanovi{\'c}",
  title =        "Exploring the Tradeoffs between Programmability and
                 Efficiency in Data-Parallel Accelerators",
  journal =      j-TOCS,
  volume =       "31",
  number =       "3",
  pages =        "6:1--6:??",
  month =        aug,
  year =         "2013",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2491464",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Aug 28 17:03:36 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "We present a taxonomy and modular implementation
                 approach for data-parallel accelerators, including the
                 MIMD, vector-SIMD, subword-SIMD, SIMT, and
                 vector-thread (VT) architectural design patterns. We
                 introduce Maven, a new VT microarchitecture based on
                 the traditional vector-SIMD microarchitecture, that is
                 considerably simpler to implement and easier to program
                 than previous VT designs. Using an extensive
                 design-space exploration of full VLSI implementations
                 of many accelerator design points, we evaluate the
                 varying tradeoffs between programmability and
                 implementation efficiency among the MIMD, vector-SIMD,
                 and VT patterns on a workload of compiled
                 microbenchmarks and application kernels. We find the
                 vector cores provide greater efficiency than the MIMD
                 cores, even on fairly irregular kernels. Our results
                 suggest that the Maven VT microarchitecture is superior
                 to the traditional vector-SIMD architecture, providing
                 both greater efficiency and easier programmability.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Gamage:2013:PRO,
  author =       "Sahan Gamage and Ramana Rao Kompella and Dongyan Xu
                 and Ardalan Kangarlou",
  title =        "Protocol Responsibility Offloading to Improve {TCP}
                 Throughput in Virtualized Environments",
  journal =      j-TOCS,
  volume =       "31",
  number =       "3",
  pages =        "7:1--7:??",
  month =        aug,
  year =         "2013",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2491463",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Aug 28 17:03:36 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Virtualization is a key technology that powers cloud
                 computing platforms such as Amazon EC2. Virtual machine
                 (VM) consolidation, where multiple VMs share a physical
                 host, has seen rapid adoption in practice, with
                 increasingly large numbers of VMs per machine and per
                 CPU core. Our investigations, however, suggest that the
                 increasing degree of VM consolidation has serious
                 negative effects on the VMs' TCP performance. As
                 multiple VMs share a given CPU, the scheduling
                 latencies, which can be in the order of tens of
                 milliseconds, substantially increase the typically
                 submillisecond round-trip times (RTTs) for TCP
                 connections in a datacenter, causing significant
                 degradation in throughput. In this article, we propose
                 a lightweight solution, called vPRO, that (a) offloads
                 the VM's TCP congestion control function to the driver
                 domain to improve TCP transmit performance; and (b)
                 offloads TCP acknowledgment functionality to the driver
                 domain to improve the TCP receive performance. Our
                 evaluation of a vPRO prototype on Xen suggests that
                 vPRO substantially improves TCP receive and transmit
                 throughputs with minimal per-packet CPU overhead. We
                 further show that the higher TCP throughput leads to
                 improvement in application-level performance, via
                 experiments with Apache Olio, a Web 2.0 cloud
                 application, and Intel MPI benchmark.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Corbett:2013:SGG,
  author =       "James C. Corbett and Jeffrey Dean and Michael Epstein
                 and Andrew Fikes and Christopher Frost and J. J. Furman
                 and Sanjay Ghemawat and Andrey Gubarev and Christopher
                 Heiser and Peter Hochschild and Wilson Hsieh and
                 Sebastian Kanthak and Eugene Kogan and Hongyi Li and
                 Alexander Lloyd and Sergey Melnik and David Mwaura and
                 David Nagle and Sean Quinlan and Rajesh Rao and Lindsay
                 Rolig and Yasushi Saito and Michal Szymaniak and
                 Christopher Taylor and Ruth Wang and Dale Woodford",
  title =        "{Spanner}: {Google}'s Globally Distributed Database",
  journal =      j-TOCS,
  volume =       "31",
  number =       "3",
  pages =        "8:1--8:??",
  month =        aug,
  year =         "2013",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2491245",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Aug 28 17:03:36 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Spanner is Google's scalable, multiversion, globally
                 distributed, and synchronously replicated database. It
                 is the first system to distribute data at global scale
                 and support externally-consistent distributed
                 transactions. This article describes how Spanner is
                 structured, its feature set, the rationale underlying
                 various design decisions, and a novel time API that
                 exposes clock uncertainty. This API and its
                 implementation are critical to supporting external
                 consistency and a variety of powerful features:
                 nonblocking reads in the past, lock-free snapshot
                 transactions, and atomic schema changes, across all of
                 Spanner.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Mowry:2013:E,
  author =       "Todd C. Mowry",
  title =        "Editorial",
  journal =      j-TOCS,
  volume =       "31",
  number =       "4",
  pages =        "9:1--9:??",
  month =        dec,
  year =         "2013",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2542150.2542151",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Dec 17 17:17:06 MST 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Balakrishnan:2013:CDS,
  author =       "Mahesh Balakrishnan and Dahlia Malkhi and John D.
                 Davis and Vijayan Prabhakaran and Michael Wei and Ted
                 Wobber",
  title =        "{CORFU}: a distributed shared log",
  journal =      j-TOCS,
  volume =       "31",
  number =       "4",
  pages =        "10:1--10:??",
  month =        dec,
  year =         "2013",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2535930",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Dec 17 17:17:06 MST 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "CORFU is a global log which clients can append-to and
                 read-from over a network. Internally, CORFU is
                 distributed over a cluster of machines in such a way
                 that there is no single I/O bottleneck to either
                 appends or reads. Data is fully replicated for fault
                 tolerance, and a modest cluster of about 16--32
                 machines with SSD drives can sustain 1 million 4-KByte
                 operations per second. The CORFU log enabled the
                 construction of a variety of distributed applications
                 that require strong consistency at high speeds, such as
                 databases, transactional key-value stores, replicated
                 state machines, and metadata services.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Bojnordi:2013:PMC,
  author =       "Mahdi Nazm Bojnordi and Engin Ipek",
  title =        "A programmable memory controller for the {DDRx}
                 interfacing standards",
  journal =      j-TOCS,
  volume =       "31",
  number =       "4",
  pages =        "11:1--11:??",
  month =        dec,
  year =         "2013",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2534845",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Dec 17 17:17:06 MST 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Modern memory controllers employ sophisticated address
                 mapping, command scheduling, and power management
                 optimizations to alleviate the adverse effects of DRAM
                 timing and resource constraints on system performance.
                 A promising way of improving the versatility and
                 efficiency of these controllers is to make them
                 programmable --- a proven technique that has seen wide use
                 in other control tasks, ranging from DMA scheduling to
                 NAND Flash and directory control. Unfortunately, the
                 stringent latency and throughput requirements of modern
                 DDRx devices have rendered such programmability largely
                 impractical, confining DDRx controllers to
                 fixed-function hardware. This article presents the
                 instruction set architecture (ISA) and hardware
                 implementation of PARDIS, a programmable memory
                 controller that can meet the performance requirements
                 of a high-speed DDRx interface. The proposed controller
                 is evaluated by mapping previously proposed DRAM
                 scheduling, address mapping, refresh scheduling, and
                 power management algorithms onto PARDIS. Simulation
                 results show that the average performance of PARDIS
                 comes within 8\% of fixed-function hardware for each of
                 these techniques; moreover, by enabling
                 application-specific optimizations, PARDIS improves
                 system performance by 6 to 17\% and reduces DRAM energy
                 by 9 to 22\% over four existing memory controllers.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Delimitrou:2013:QAS,
  author =       "Christina Delimitrou and Christos Kozyrakis",
  title =        "{QoS}-Aware scheduling in heterogeneous datacenters
                 with {Paragon}",
  journal =      j-TOCS,
  volume =       "31",
  number =       "4",
  pages =        "12:1--12:??",
  month =        dec,
  year =         "2013",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2556583",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Dec 17 17:17:06 MST 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Large-scale datacenters (DCs) host tens of thousands
                 of diverse applications each day. However, interference
                 between colocated workloads and the difficulty of
                 matching applications to one of the many hardware
                 platforms available can degrade performance, violating
                 the quality of service (QoS) guarantees that many cloud
                 workloads require. While previous work has identified
                 the impact of heterogeneity and interference, existing
                 solutions are computationally intensive, cannot be
                 applied online, and do not scale beyond a few
                 applications. We present Paragon, an online and
                 scalable DC scheduler that is heterogeneity- and
                 interference-aware. Paragon is derived from robust
                 analytical methods, and instead of profiling each
                 application in detail, it leverages information the
                 system already has about applications it has previously
                 seen. It uses collaborative filtering techniques to
                 quickly and accurately classify an unknown incoming
                 workload with respect to heterogeneity and interference
                 in multiple shared resources. It does so by identifying
                 similarities to previously scheduled applications. The
                 classification allows Paragon to greedily schedule
                 applications in a manner that minimizes interference
                 and maximizes server utilization. After the initial
                 application placement, Paragon monitors application
                 behavior and adjusts the scheduling decisions at
                 runtime to avoid performance degradations.
                 Additionally, we design ARQ, a multiclass admission
                 control protocol that constrains application waiting
                 time. ARQ queues applications in separate classes based
                 on the type of resources they need and avoids long
                 queueing delays for easy-to-satisfy workloads in
                 highly-loaded scenarios. Paragon scales to tens of
                 thousands of servers and applications with marginal
                 scheduling overheads in terms of time or state. We
                 evaluate Paragon with a wide range of workload
                 scenarios, on both small and large-scale systems,
                 including 1,000 servers on EC2. For a 2,500-workload
                 scenario, Paragon enforces performance guarantees for
                 91\% of applications, while significantly improving
                 utilization. In comparison, heterogeneity-oblivious,
                 interference-oblivious, and least-loaded schedulers
                 only provide similar guarantees for 14\%, 11\%, and 3\%
                 of workloads. The differences are more striking in
                 oversubscribed scenarios where resource efficiency is
                 more critical.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Silberstein:2014:GIF,
  author =       "Mark Silberstein and Bryan Ford and Idit Keidar and
                 Emmett Witchel",
  title =        "{GPUfs}: Integrating a file system with {GPUs}",
  journal =      j-TOCS,
  volume =       "32",
  number =       "1",
  pages =        "1:1--1:??",
  month =        feb,
  year =         "2014",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2553081",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Feb 27 12:15:46 MST 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "As GPU hardware becomes increasingly general-purpose,
                 it is quickly outgrowing the traditional, constrained
                 GPU-as-coprocessor programming model. This article
                 advocates for extending standard operating system
                 services and abstractions to GPUs in order to
                 facilitate program development and enable harmonious
                 integration of GPUs in computing systems. As an
                 example, we describe the design and implementation of
                 GPUfs, a software layer which provides operating system
                 support for accessing host files directly from GPU
                 programs. GPUfs provides a POSIX-like API, exploits GPU
                 parallelism for efficiency, and optimizes GPU file
                 access by extending the host CPU's buffer cache into
                 GPU memory. Our experiments, based on a set of real
                 benchmarks adapted to use our file system, demonstrate
                 the feasibility and benefits of the GPUfs approach. For
                 example, a self-contained GPU program that searches for
                 a set of strings throughout the Linux kernel source
                 tree runs over seven times faster than on an eight-core
                 CPU.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Klein:2014:CFV,
  author =       "Gerwin Klein and June Andronick and Kevin Elphinstone
                 and Toby Murray and Thomas Sewell and Rafal Kolanski
                 and Gernot Heiser",
  title =        "Comprehensive formal verification of an {OS}
                 microkernel",
  journal =      j-TOCS,
  volume =       "32",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2014",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2560537",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Feb 27 12:15:46 MST 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "We present an in-depth coverage of the comprehensive
                 machine-checked formal verification of seL4, a
                 general-purpose operating system microkernel. We
                 discuss the kernel design we used to make its
                 verification tractable. We then describe the functional
                 correctness proof of the kernel's C implementation and
                 we cover further steps that transform this result into
                 a comprehensive formal verification of the kernel: a
                 formally verified IPC fastpath, a proof that the binary
                 code of the kernel correctly implements the C
                 semantics, a proof of correct access-control
                 enforcement, a proof of information-flow
                 noninterference, a sound worst-case execution time
                 analysis of the binary, and an automatic initialiser
                 for user-level systems that connects kernel-level
                 access-control enforcement with reasoning about system
                 behaviour. We summarise these results and show how they
                 integrate to form a coherent overall analysis, backed
                 by machine-checked, end-to-end theorems. The seL4
                 microkernel is currently not just the only
                 general-purpose operating system kernel that is fully
                 formally verified to this degree. It is also the only
                 example of formal proof of this scale that is kept
                 current as the requirements, design and implementation
                 of the system evolve over almost a decade. We report on
                 our experience in maintaining this evolving formally
                 verified code base.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Guevara:2014:MMM,
  author =       "Marisabel Guevara and Benjamin Lubin and Benjamin C.
                 Lee",
  title =        "Market mechanisms for managing datacenters with
                 heterogeneous microarchitectures",
  journal =      j-TOCS,
  volume =       "32",
  number =       "1",
  pages =        "3:1--3:??",
  month =        feb,
  year =         "2014",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2541258",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Feb 27 12:15:46 MST 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Specialization of datacenter resources brings
                 performance and energy improvements in response to the
                 growing scale and diversity of cloud applications. Yet
                 heterogeneous hardware adds complexity and volatility
                 to latency-sensitive applications. A resource
                 allocation mechanism that leverages architectural
                 principles can overcome both of these obstacles. We
                 integrate research in heterogeneous architectures with
                 recent advances in multi-agent systems. Embedding
                 architectural insight into proxies that bid on behalf
                 of applications, a market effectively allocates
                 hardware to applications with diverse preferences and
                 valuations. Exploring a space of heterogeneous
                 datacenter configurations, which mix server-class Xeon
                 and mobile-class Atom processors, we find an optimal
                 heterogeneous balance that improves both welfare and
                 energy-efficiency. We further design and evaluate
                 twelve design points along the Xeon-to-Atom spectrum,
                 and find that a mix of three processor architectures
                 achieves a $ 12 \times $ reduction in response time
                 violations relative to equal-power homogeneous
                 systems.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Palix:2014:FL,
  author =       "Nicolas Palix and Gael Thomas and Suman Saha and
                 Christophe Calv{\`e}s and Gilles Muller and Julia
                 Lawall",
  title =        "Faults in {Linux 2.6}",
  journal =      j-TOCS,
  volume =       "32",
  number =       "2",
  pages =        "4:1--4:??",
  month =        jun,
  year =         "2014",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2619090",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jul 7 16:54:52 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/linux.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib;
                 https://www.math.utah.edu/pub/tex/bib/unix.bib",
  abstract =     "In August 2011, Linux entered its third decade. Ten
                 years before, Chou et al. published a study of faults
                 found by applying a static analyzer to Linux versions
                 1.0 through 2.4.1. A major result of their work was
                 that the drivers directory contained up to 7 times more
                 of certain kinds of faults than other directories. This
                 result inspired numerous efforts on improving the
                 reliability of driver code. Today, Linux is used in a
                 wider range of environments, provides a wider range of
                 services, and has adopted a new development and release
                 model. What has been the impact of these changes on
                 code quality? To answer this question, we have
                 transported Chou et al.'s experiments to all versions
                 of Linux 2.6 released between 2003 and 2011. We find
                 that Linux has more than doubled in size during this
                 period, but the number of faults per line of code has
                 been decreasing. Moreover, the fault rate of drivers is
                 now below that of other directories, such as arch.
                 These results can guide further development and
                 research efforts for the decade to come. To allow
                 updating these results as Linux evolves, we define our
                 experimental protocol and make our checkers
                 available.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Enck:2014:TIF,
  author =       "William Enck and Peter Gilbert and Seungyeop Han and
                 Vasant Tendulkar and Byung-Gon Chun and Landon P. Cox
                 and Jaeyeon Jung and Patrick McDaniel and Anmol N.
                 Sheth",
  title =        "{TaintDroid}: an Information-Flow Tracking System for
                 Realtime Privacy Monitoring on {Smartphones}",
  journal =      j-TOCS,
  volume =       "32",
  number =       "2",
  pages =        "5:1--5:??",
  month =        jun,
  year =         "2014",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2619091",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jul 7 16:54:52 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Today's smartphone operating systems frequently fail
                 to provide users with visibility into how third-party
                 applications collect and share their private data. We
                 address these shortcomings with TaintDroid, an
                 efficient, system-wide dynamic taint tracking and
                 analysis system capable of simultaneously tracking
                 multiple sources of sensitive data. TaintDroid enables
                 realtime analysis by leveraging Android's virtualized
                 execution environment. TaintDroid incurs only 32\%
                 performance overhead on a CPU-bound microbenchmark and
                 imposes negligible overhead on interactive third-party
                 applications. Using TaintDroid to monitor the behavior
                 of 30 popular third-party Android applications, in our
                 2010 study we found 20 applications potentially misused
                 users' private information; so did a similar fraction
                 of the tested applications in our 2012 study.
                 Monitoring the flow of privacy-sensitive data with
                 TaintDroid provides valuable input for smartphone users
                 and security service firms seeking to identify
                 misbehaving applications.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Yu:2014:OBS,
  author =       "Young Jin Yu and Dong In Shin and Woong Shin and Nae
                 Young Song and Jae Woo Choi and Hyeong Seog Kim and
                 Hyeonsang Eom and Heon Young Yeom",
  title =        "Optimizing the Block {I/O} Subsystem for Fast Storage
                 Devices",
  journal =      j-TOCS,
  volume =       "32",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jun,
  year =         "2014",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2619092",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jul 7 16:54:52 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Fast storage devices are an emerging solution to
                 satisfy data-intensive applications. They provide high
                 transaction rates for DBMS, low response times for Web
                 servers, instant on-demand paging for applications with
                 large memory footprints, and many similar advantages
                 for performance-hungry applications. In spite of the
                 benefits promised by fast hardware, modern operating
                 systems are not yet structured to take advantage of the
                 hardware's full potential. The software overhead caused
                 by an OS, negligible in the past, adversely impacts
                 application performance, lessening the advantage of
                 using such hardware. Our analysis demonstrates that the
                 overheads from the traditional storage-stack design are
                 significant and cannot easily be overcome without
                 modifying the hardware interface and adding new
                 capabilities to the operating system. In this article,
                 we propose six optimizations that enable an OS to fully
                 exploit the performance characteristics of fast storage
                 devices. With the support of new hardware interfaces,
                 our optimizations minimize per-request latency by
                 streamlining the I/O path and amortize per-request
                 latency by maximizing parallelism inside the device. We
                 demonstrate the impact on application performance
                 through well-known storage benchmarks run against a
                 Linux kernel with a customized SSD. We find that
                 eliminating context switches in the I/O path decreases
                 the software overhead of an I/O request from 20
                 microseconds to 5 microseconds and a new request merge
                 scheme called Temporal Merge enables the OS to achieve
                 87\% to 100\% of peak device performance, regardless of
                 request access patterns or types. Although the
                 performance improvement by these optimizations on a
                 standard SATA-based SSD is marginal (because of its
                 limited interface and relatively high response times),
                 our sensitivity analysis suggests that future SSDs with
                 lower response times will benefit from these changes.
                 The effectiveness of our optimizations encourages
                 discussion between the OS community and storage vendors
                 about future device interfaces for fast storage
                 devices.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Samadi:2014:SPS,
  author =       "Mehrzad Samadi and Janghaeng Lee and D. Anoushe
                 Jamshidi and Scott Mahlke and Amir Hormati",
  title =        "Scaling Performance via Self-Tuning Approximation for
                 Graphics Engines",
  journal =      j-TOCS,
  volume =       "32",
  number =       "3",
  pages =        "7:1--7:??",
  month =        sep,
  year =         "2014",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2631913",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 21 07:18:28 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Approximate computing, where computation accuracy is
                 traded off for better performance or higher data
                 throughput, is one solution that can help data
                 processing keep pace with the current and growing
                 abundance of information. For particular domains, such
                 as multimedia and learning algorithms, approximation is
                 commonly used today. We consider automation to be
                 essential to provide transparent approximation, and we
                 show that larger benefits can be achieved by
                 constructing the approximation techniques to fit the
                 underlying hardware. Our target platform is the GPU
                 because of its high performance capabilities and
                 difficult programming challenges that can be alleviated
                 with proper automation. Our approach --- SAGE ---
                 combines a static compiler that automatically generates
                 a set of CUDA kernels with varying levels of
                 approximation with a runtime system that iteratively
                 selects among the available kernels to achieve speedup
                 while adhering to a target output quality set by the
                 user. The SAGE compiler employs three optimization
                 techniques to generate approximate kernels that exploit
                 the GPU microarchitecture: selective discarding of
                 atomic operations, data packing, and thread fusion.
                 Across a set of machine learning and image processing
                 kernels, SAGE's approximation yields an average of 2.5$
                 \times $ speedup with less than 10\% quality loss
                 compared to the accurate execution on an NVIDIA GTX 560
                 GPU.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Wu:2014:EAH,
  author =       "Lisa Wu and Orestis Polychroniou and Raymond J. Barker
                 and Martha A. Kim and Kenneth A. Ross",
  title =        "Energy Analysis of Hardware and Software Range
                 Partitioning",
  journal =      j-TOCS,
  volume =       "32",
  number =       "3",
  pages =        "8:1--8:??",
  month =        sep,
  year =         "2014",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2638550",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 21 07:18:28 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Data partitioning is a critical operation for
                 manipulating large datasets because it subdivides tasks
                 into pieces that are more amenable to efficient
                 processing. It is often the limiting factor in database
                 performance and represents a significant fraction of
                 the overall runtime of large data queries. This article
                 measures the performance and energy of state-of-the-art
                 software partitioners, and describes and evaluates a
                 hardware range partitioner that further improves
                 efficiency. The software implementation is broken into
                 two phases, allowing separate analysis of the partition
                 function computation and data shuffling costs. Although
                 range partitioning is commonly thought to be more
                 expensive than simpler strategies such as hash
                 partitioning, our measurements indicate that careful
                 data movement and optimization of the partition
                 function can allow it to approach the throughput and
                 energy consumption of hash or radix partitioning. For
                 further acceleration, we describe a hardware range
                 partitioner, or HARP, a streaming framework that offers
                 a seamless execution environment for this and other
                 streaming accelerators, and a detailed analysis of a
                 32nm physical design that matches the throughput of
                 four to eight software threads while consuming just
                 6.9\% of the area and 4.3\% of the power of a Xeon core
                 in the same technology generation.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Sampson:2014:ASS,
  author =       "Adrian Sampson and Jacob Nelson and Karin Strauss and
                 Luis Ceze",
  title =        "Approximate Storage in Solid-State Memories",
  journal =      j-TOCS,
  volume =       "32",
  number =       "3",
  pages =        "9:1--9:??",
  month =        sep,
  year =         "2014",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2644808",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 21 07:18:28 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Memories today expose an all-or-nothing correctness
                 model that incurs significant costs in performance,
                 energy, area, and design complexity. But not all
                 applications need high-precision storage for all of
                 their data structures all of the time. This article
                 proposes mechanisms that enable applications to store
                 data approximately and shows that doing so can improve
                 the performance, lifetime, or density of solid-state
                 memories. We propose two mechanisms. The first allows
                 errors in multilevel cells by reducing the number of
                 programming pulses used to write them. The second
                 mechanism mitigates wear-out failures and extends
                 memory endurance by mapping approximate data onto
                 blocks that have exhausted their hardware error
                 correction resources. Simulations show that
                 reduced-precision writes in multilevel phase-change
                 memory cells can be 1.7 $ \times $ faster on average
                 and using failed blocks can improve array lifetime by
                 23\% on average with quality loss under 10\%.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Clements:2015:SCR,
  author =       "Austin T. Clements and M. Frans Kaashoek and Nickolai
                 Zeldovich and Robert T. Morris and Eddie Kohler",
  title =        "The Scalable Commutativity Rule: Designing Scalable
                 Software for Multicore Processors",
  journal =      j-TOCS,
  volume =       "32",
  number =       "4",
  pages =        "10:1--10:??",
  month =        jan,
  year =         "2015",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2699681",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 21 07:18:30 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "What opportunities for multicore scalability are
                 latent in software interfaces, such as system call
                 APIs? Can scalability challenges and opportunities be
                 identified even before any implementation exists,
                 simply by considering interface specifications? To
                 answer these questions, we introduce the scalable
                 commutativity rule: whenever interface operations
                 commute, they can be implemented in a way that scales.
                 This rule is useful throughout the development process
                 for scalable multicore software, from the interface
                 design through implementation, testing, and evaluation.
                 This article formalizes the scalable commutativity
                 rule. This requires defining a novel form of
                 commutativity, SIM commutativity, that lets the rule
                 apply even to complex and highly stateful software
                 interfaces. We also introduce a suite of software
                 development tools based on the rule. Our Commuter tool
                 accepts high-level interface models, generates tests of
                 interface operations that commute and hence could
                 scale, and uses these tests to systematically evaluate
                 the scalability of implementations. We apply Commuter
                 to a model of 18 POSIX file and virtual memory system
                 operations. Using the resulting 26,238 scalability
                 tests, Commuter highlights Linux kernel problems
                 previously observed to limit application scalability
                 and identifies previously unknown bottlenecks that may
                 be triggered by future workloads or hardware. Finally,
                 we apply the scalable commutativity rule and Commuter
                 to the design and implementation of sv6, a new POSIX-like
                 operating system. sv6's novel file and virtual memory
                 system designs enable it to scale for 99\% of the tests
                 generated by Commuter. These results translate to
                 linear scalability on an 80-core x86 machine for
                 applications built on sv6's commutative operations.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Nair:2015:MMA,
  author =       "Arun Arvind Nair and Stijn Eyerman and Jian Chen and
                 Lizy Kurian John and Lieven Eeckhout",
  title =        "Mechanistic Modeling of Architectural Vulnerability
                 Factor",
  journal =      j-TOCS,
  volume =       "32",
  number =       "4",
  pages =        "11:1--11:??",
  month =        jan,
  year =         "2015",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2669364",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 21 07:18:30 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Reliability to soft errors is a significant design
                 challenge in modern microprocessors owing to an
                 exponential increase in the number of transistors on
                 chip and the reduction in operating voltages with each
                 process generation. Architectural Vulnerability Factor
                 (AVF) modeling using microarchitectural simulators
                 enables architects to make informed performance, power,
                 and reliability tradeoffs. However, such simulators are
                 time-consuming and do not reveal the microarchitectural
                 mechanisms that influence AVF. In this article, we
                 present an accurate first-order mechanistic analytical
                 model to compute AVF, developed using the first
                 principles of an out-of-order superscalar execution.
                 This model provides insight into the fundamental
                 interactions between the workload and microarchitecture
                 that together influence AVF. We use the model to
                 perform design space exploration, parametric sweeps,
                 and workload characterization for AVF.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Aublin:2015:NBP,
  author =       "Pierre-Louis Aublin and Rachid Guerraoui and Nikola
                 Knezevi{\'c} and Vivien Qu{\'e}ma and Marko
                 Vukoli{\'c}",
  title =        "The Next 700 {BFT} Protocols",
  journal =      j-TOCS,
  volume =       "32",
  number =       "4",
  pages =        "12:1--12:??",
  month =        jan,
  year =         "2015",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2658994",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 21 07:18:30 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "We present Abstract (ABortable STate mAChine
                 replicaTion), a new abstraction for designing and
                 reconfiguring generalized replicated state machines
                 that are, unlike traditional state machines, allowed to
                 abort executing a client's request if ``something goes
                 wrong.'' Abstract can be used to considerably simplify
                 the incremental development of efficient Byzantine
                 fault-tolerant state machine replication (BFT)
                 protocols that are notorious for being difficult to
                 develop. In short, we treat a BFT protocol as a
                 composition of Abstract instances. Each instance is
                 developed and analyzed independently and optimized for
                 specific system conditions. We illustrate the power of
                 Abstract through several interesting examples. We first
                 show how Abstract can yield benefits of a
                 state-of-the-art BFT protocol in a less painful and
                 error-prone manner. Namely, we develop AZyzzyva, a new
                 protocol that mimics the celebrated best-case behavior
                 of Zyzzyva using less than 35\% of the Zyzzyva code. To
                 cover worst-case situations, our abstraction enables
                 one to use in AZyzzyva any existing BFT protocol. We
                 then present Aliph, a new BFT protocol that outperforms
                 previous BFT protocols in terms of both latency (by up
                 to 360\%) and throughput (by up to 30\%). Finally, we
                 present R-Aliph, an implementation of Aliph that is
                 robust, that is, whose performance degrades gracefully
                 in the presence of Byzantine replicas and Byzantine
                 clients.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Wang:2015:DAU,
  author =       "Xi Wang and Nickolai Zeldovich and M. Frans Kaashoek
                 and Armando Solar-Lezama",
  title =        "A Differential Approach to Undefined Behavior
                 Detection",
  journal =      j-TOCS,
  volume =       "33",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2015",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2699678",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Mar 13 07:03:25 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "This article studies undefined behavior arising in
                 systems programming languages such as C/C++. Undefined
                 behavior bugs lead to unpredictable and subtle systems
                 behavior, and their effects can be further amplified by
                 compiler optimizations. Undefined behavior bugs are
                 present in many systems, including the Linux kernel and
                 the Postgres database. The consequences range from
                 incorrect functionality to missing security checks.
                 This article proposes a formal and practical approach
                 that finds undefined behavior bugs by finding
                 ``unstable code'' in terms of optimizations that
                 leverage undefined behavior. Using this approach, we
                 introduce a new static checker called Stack that
                 precisely identifies undefined behavior bugs. Applying
                 Stack to widely used systems has uncovered 161 new bugs
                 that have been confirmed and fixed by developers.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Bila:2015:EOP,
  author =       "Nilton Bila and Eric J. Wright and Eyal {De Lara} and
                 Kaustubh Joshi and H. Andr{\'e}s Lagar-Cavilla and
                 Eunbyung Park and Ashvin Goel and Matti Hiltunen and
                 Mahadev Satyanarayanan",
  title =        "Energy-Oriented Partial Desktop Virtual Machine
                 Migration",
  journal =      j-TOCS,
  volume =       "33",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2015",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2699683",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Mar 13 07:03:25 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Modern offices are crowded with personal computers.
                 While studies have shown these to be idle most of the
                 time, they remain powered, consuming up to 60\% of
                 their peak power. Hardware-based solutions engendered
                 by PC vendors (e.g., low-power states, Wake-on-LAN)
                 have proved unsuccessful because, in spite of user
                 inactivity, these machines often need to remain network
                 active in support of background applications that
                 maintain network presence. Recent proposals have
                 advocated the use of consolidation of idle desktop
                 Virtual Machines (VMs). However, desktop VMs are often
                 large, requiring gigabytes of memory. Consolidating
                 such VMs creates large network transfers lasting in the
                 order of minutes and utilizes server memory
                 inefficiently. When multiple VMs migrate concurrently,
                 networks become congested, and the resulting migration
                 latencies are prohibitive. We present partial VM
                 migration, an approach that transparently migrates only
                 the working set of an idle VM. It creates a partial
                 replica of the desktop VM on the consolidation server
                 by copying only VM metadata, and it transfers pages to
                 the server on-demand, as the VM accesses them. This
                 approach places desktop PCs in low-power mode when
                 inactive and switches them to running mode when pages
                 are needed by the VM running on the consolidation
                 server. To ensure that desktops save energy, we have
                 developed sleep scheduling and prefetching algorithms,
                 as well as the context-aware selective resume
                 framework, a novel approach to reduce the latency of
                 power mode transition operations in commodity PCs.
                 Jettison, our software prototype of partial VM
                 migration for off-the-shelf PCs, can deliver 44--91\%
                 energy savings during idle periods of at least 10
                 minutes, while providing low migration latencies of
                 about 4 seconds and migrating minimal state that is
                 under an order of magnitude of the VM's memory
                 footprint.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Blem:2015:IWU,
  author =       "Emily Blem and Jaikrishnan Menon and Thiruvengadam
                 Vijayaraghavan and Karthikeyan Sankaralingam",
  title =        "{ISA} Wars: Understanding the Relevance of {ISA} being
                 {RISC} or {CISC} to Performance, Power, and Energy on
                 Modern Architectures",
  journal =      j-TOCS,
  volume =       "33",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2015",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2699682",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Mar 13 07:03:25 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "RISC versus CISC wars raged in the 1980s when chip
                 area and processor design complexity were the primary
                 constraints and desktops and servers exclusively
                 dominated the computing landscape. Today, energy and
                 power are the primary design constraints and the
                 computing landscape is significantly different: Growth
                 in tablets and smartphones running ARM (a RISC ISA) is
                 surpassing that of desktops and laptops running x86 (a
                 CISC ISA). Furthermore, the traditionally low-power ARM
                 ISA is entering the high-performance server market,
                 while the traditionally high-performance x86 ISA is
                 entering the mobile low-power device market. Thus, the
                 question of whether ISA plays an intrinsic role in
                 performance or energy efficiency is becoming important
                 again, and we seek to answer this question through a
                 detailed measurement-based study on real hardware
                 running real applications. We analyze measurements on
                 seven platforms spanning three ISAs (MIPS, ARM, and
                 x86) over workloads spanning mobile, desktop, and
                 server computing. Our methodical investigation
                 demonstrates the role of ISA in modern microprocessors'
                 performance and energy efficiency. We find that ARM,
                 MIPS, and x86 processors are simply engineering design
                 points optimized for different levels of performance,
                 and there is nothing fundamentally more energy
                 efficient in one ISA class or the other. The ISA being
                 RISC or CISC seems irrelevant.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Lin:2015:KMO,
  author =       "Felix Xiaozhu Lin and Zhen Wang and Lin Zhong",
  title =        "{K2}: a Mobile Operating System for Heterogeneous
                 Coherence Domains",
  journal =      j-TOCS,
  volume =       "33",
  number =       "2",
  pages =        "4:1--4:??",
  month =        jun,
  year =         "2015",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2699676",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jun 10 11:00:03 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Mobile System-on-Chips (SoC) that incorporate
                 heterogeneous coherence domains promise high energy
                 efficiency to a wide range of mobile applications, yet
                 are difficult to program. To exploit the architecture,
                 a desirable, yet missing capability is to replicate
                 operating system (OS) services over multiple coherence
                 domains with minimum inter-domain communication. In
                 designing such an OS, we set three goals: to ease
                 application development, to simplify OS engineering,
                 and to preserve the current OS performance. To this
                 end, we identify a shared-most OS model for multiple
                 coherence domains: creating per-domain instances of
                 core OS services with no shared state, while enabling
                 other extended OS services to share state across
                 domains. To test the model, we build K2, a prototype OS
                 on the TI OMAP4 SoC, by reusing most of the Linux 3.4
                 source. K2 presents a single system image to
                 applications with its two kernels running on top of the
                 two coherence domains of OMAP4. The two kernels have
                 independent instances of core OS services, such as page
                 allocation and interrupt management, as coordinated by
                 K2; the two kernels share most extended OS services,
                 such as device drivers, whose state is kept coherent
                 transparently by K2. Despite platform constraints and
                 unoptimized code, K2 improves energy efficiency for
                  light OS workloads by $ 8 \times $--$ 10 \times $, while incurring less
                 than 9\% performance overhead for two device drivers
                 shared between kernels. Our experiences with K2 show
                 that the shared-most model is promising.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Johansen:2015:FSS,
  author =       "H{\aa}vard D. Johansen and Robbert {Van Renesse} and
                 Ymir Vigfusson and Dag Johansen",
  title =        "{Fireflies}: a Secure and Scalable Membership and
                 Gossip Service",
  journal =      j-TOCS,
  volume =       "33",
  number =       "2",
  pages =        "5:1--5:??",
  month =        jun,
  year =         "2015",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2701418",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jun 10 11:00:03 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "An attacker who controls a computer in an overlay
                 network can effectively control the entire overlay
                 network if the mechanism managing membership
                 information can successfully be targeted. This article
                 describes Fireflies, an overlay network protocol that
                 fights such attacks by organizing members in a
                 verifiable pseudorandom structure so that an intruder
                 cannot incorrectly modify the membership views of
                 correct members. Fireflies provides each member with a
                 view of the entire membership, and supports networks
                 with moderate total churn. We evaluate Fireflies using
                 both simulations and PlanetLab to show that Fireflies
                 is a practical approach for secure membership
                 maintenance in such networks.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Chen:2015:SFA,
  author =       "Tianshi Chen and Shijin Zhang and Shaoli Liu and
                 Zidong Du and Tao Luo and Yuan Gao and Junjie Liu and
                 Dongsheng Wang and Chengyong Wu and Ninghui Sun and
                 Yunji Chen and Olivier Temam",
  title =        "A Small-Footprint Accelerator for Large-Scale Neural
                 Networks",
  journal =      j-TOCS,
  volume =       "33",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jun,
  year =         "2015",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2701417",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jun 10 11:00:03 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Machine-learning tasks are becoming pervasive in a
                 broad range of domains, and in a broad range of systems
                 (from embedded systems to data centers). At the same
                 time, a small set of machine-learning algorithms
                 (especially Convolutional and Deep Neural Networks,
                 i.e., CNNs and DNNs) are proving to be state-of-the-art
                 across many applications. As architectures evolve
                 toward heterogeneous multicores composed of a mix of
                 cores and accelerators, a machine-learning accelerator
                 can achieve the rare combination of efficiency (due to
                 the small number of target algorithms) and broad
                 application scope. Until now, most machine-learning
                 accelerator designs have been focusing on efficiently
                 implementing the computational part of the algorithms.
                 However, recent state-of-the-art CNNs and DNNs are
                 characterized by their large size. In this study, we
                 design an accelerator for large-scale CNNs and DNNs,
                 with a special emphasis on the impact of memory on
                 accelerator design, performance, and energy. We show
                 that it is possible to design an accelerator with a
                 high throughput, capable of performing 452 GOP/s (key
                 NN operations such as synaptic weight multiplications
                  and neuron output additions) in a small footprint of
                 3.02mm$^2$ and 485mW; compared to a 128-bit 2GHz SIMD
                 processor, the accelerator is $ 117.87 \times $ faster,
                 and it can reduce the total energy by $ 21.08 \times $.
                 The accelerator characteristics are obtained after
                 layout at 65nm. Such a high throughput in a small
                 footprint can open up the usage of state-of-the-art
                 machine-learning algorithms in a broad set of systems
                 and for a broad set of applications.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Ousterhout:2015:RSS,
  author =       "John Ousterhout and Arjun Gopalan and Ashish Gupta and
                 Ankita Kejriwal and Collin Lee and Behnam Montazeri and
                 Diego Ongaro and Seo Jin Park and Henry Qin and Mendel
                 Rosenblum and Stephen Rumble and Ryan Stutsman and
                 Stephen Yang",
  title =        "The {RAMCloud} Storage System",
  journal =      j-TOCS,
  volume =       "33",
  number =       "3",
  pages =        "7:1--7:??",
  month =        sep,
  year =         "2015",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2806887",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Sep 14 10:11:30 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "RAMCloud is a storage system that provides low-latency
                 access to large-scale datasets. To achieve low latency,
                 RAMCloud stores all data in DRAM at all times. To
                 support large capacities (1PB or more), it aggregates
                 the memories of thousands of servers into a single
                 coherent key-value store. RAMCloud ensures the
                 durability of DRAM-based data by keeping backup copies
                 on secondary storage. It uses a uniform log-structured
                 mechanism to manage both DRAM and secondary storage,
                 which results in high performance and efficient memory
                 usage. RAMCloud uses a polling-based approach to
                 communication, bypassing the kernel to communicate
                 directly with NICs; with this approach, client
                 applications can read small objects from any RAMCloud
                  storage server in less than 5 $ \mu $ s; durable writes
                 of small objects take about 13.5 $ \mu $ s. RAMCloud
                 does not keep multiple copies of data online; instead,
                 it provides high availability by recovering from
                 crashes very quickly (1 to 2 seconds). RAMCloud's crash
                 recovery mechanism harnesses the resources of the
                 entire cluster working concurrently so that recovery
                 performance scales with cluster size.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Baumann:2015:SAU,
  author =       "Andrew Baumann and Marcus Peinado and Galen Hunt",
  title =        "Shielding Applications from an Untrusted Cloud with
                 {Haven}",
  journal =      j-TOCS,
  volume =       "33",
  number =       "3",
  pages =        "8:1--8:??",
  month =        sep,
  year =         "2015",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2799647",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Sep 14 10:11:30 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Today's cloud computing infrastructure requires
                 substantial trust. Cloud users rely on both the
                 provider's staff and its globally distributed
                 software/hardware platform not to expose any of their
                 private data. We introduce the notion of shielded
                 execution, which protects the confidentiality and
                 integrity of a program and its data from the platform
                 on which it runs (i.e., the cloud operator's OS, VM,
                 and firmware). Our prototype, Haven, is the first
                 system to achieve shielded execution of unmodified
                 legacy applications, including SQL Server and Apache,
                 on a commodity OS (Windows) and commodity hardware.
                 Haven leverages the hardware protection of Intel SGX to
                 defend against privileged code and physical attacks
                 such as memory probes, and also addresses the dual
                 challenges of executing unmodified legacy binaries and
                 protecting them from a malicious host. This work
                 motivated recent changes in the SGX specification.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Lee:2015:SSK,
  author =       "Janghaeng Lee and Mehrzad Samadi and Yongjun Park and
                 Scott Mahlke",
  title =        "{SKMD}: Single Kernel on Multiple Devices for
                 Transparent {CPU--GPU} Collaboration",
  journal =      j-TOCS,
  volume =       "33",
  number =       "3",
  pages =        "9:1--9:??",
  month =        sep,
  year =         "2015",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2798725",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Sep 14 10:11:30 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Heterogeneous computing on CPUs and GPUs has
                 traditionally used fixed roles for each device: the GPU
                 handles data parallel work by taking advantage of its
                  massive number of cores while the CPU handles
                  non-data-parallel work, such as the sequential code or data
                 transfer management. This work distribution can be a
                 poor solution as it underutilizes the CPU, has
                 difficulty generalizing beyond the single CPU-GPU
                 combination, and may waste a large fraction of time
                 transferring data. Further, CPUs are performance
                 competitive with GPUs on many workloads, thus simply
                 partitioning work based on the fixed roles may be a
                 poor choice. In this article, we present the
                 single-kernel multiple devices (SKMD) system, a
                 framework that transparently orchestrates collaborative
                 execution of a single data-parallel kernel across
                 multiple asymmetric CPUs and GPUs. The programmer is
                 responsible for developing a single data-parallel
                 kernel in OpenCL, while the system automatically
                 partitions the workload across an arbitrary set of
                 devices, generates kernels to execute the partial
                 workloads, and efficiently merges the partial outputs
                 together. The goal is performance improvement by
                 maximally utilizing all available resources to execute
                 the kernel. SKMD handles the difficult challenges of
                 exposed data transfer costs and the performance
                 variations GPUs have with respect to input size. On
                 real hardware, SKMD achieves an average speedup of 28\%
                 on a system with one multicore CPU and two asymmetric
                 GPUs compared to a fastest device execution strategy
                 for a set of popular OpenCL kernels.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Pellauer:2015:ECC,
  author =       "Michael Pellauer and Angshuman Parashar and Michael
                 Adler and Bushra Ahsan and Randy Allmon and Neal Crago
                 and Kermin Fleming and Mohit Gambhir and Aamer Jaleel
                 and Tushar Krishna and Daniel Lustig and Stephen Maresh
                 and Vladimir Pavlov and Rachid Rayess and Antonia Zhai
                 and Joel Emer",
  title =        "Efficient Control and Communication Paradigms for
                 Coarse-Grained Spatial Architectures",
  journal =      j-TOCS,
  volume =       "33",
  number =       "3",
  pages =        "10:1--10:??",
  month =        sep,
  year =         "2015",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2754930",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Sep 14 10:11:30 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "There has been recent interest in exploring the
                 acceleration of nonvectorizable workloads with
                 spatially programmed architectures that are designed to
                 efficiently exploit pipeline parallelism. Such an
                 architecture faces two main problems: how to
                 efficiently control each processing element (PE) in the
                 system, and how to facilitate inter-PE communication
                 without the overheads of traditional shared-memory
                 coherent memory. In this article, we explore solving
                 these problems using triggered instructions and
                 latency-insensitive channels. Triggered instructions
                 completely eliminate the program counter (PC) and allow
                 programs to transition concisely between states without
                 explicit branch instructions. Latency-insensitive
                 channels allow efficient communication of inter-PE
                 control information while simultaneously enabling
                 flexible code placement and improving tolerance for
                 variable events such as cache accesses. Together, these
                 approaches provide a unified mechanism to avoid
                 overserialized execution, essentially achieving the
                 effect of techniques such as dynamic instruction
                 reordering and multithreading. Our analysis shows that
                 a spatial accelerator using triggered instructions and
                 latency-insensitive channels can achieve 8 $ \times $
                 greater area-normalized performance than a traditional
                 general-purpose processor. Further analysis shows that
                 triggered control reduces the number of static and
                 dynamic instructions in the critical paths by 62\% and
                 64\%, respectively, over a PC-style baseline,
                 increasing the performance of the spatial programming
                 approach by 2.0 $ \times $.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Peter:2016:AOS,
  author =       "Simon Peter and Jialin Li and Irene Zhang and Dan R.
                 K. Ports and Doug Woos and Arvind Krishnamurthy and
                 Thomas Anderson and Timothy Roscoe",
  title =        "{Arrakis}: The Operating System Is the Control Plane",
  journal =      j-TOCS,
  volume =       "33",
  number =       "4",
  pages =        "11:1--11:??",
  month =        jan,
  year =         "2016",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2812806",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 6 06:45:30 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Recent device hardware trends enable a new approach to
                 the design of network server operating systems. In a
                 traditional operating system, the kernel mediates
                 access to device hardware by server applications to
                 enforce process isolation as well as network and disk
                 security. We have designed and implemented a new
                 operating system, Arrakis, that splits the traditional
                 role of the kernel in two. Applications have direct
                 access to virtualized I/O devices, allowing most I/O
                 operations to skip the kernel entirely, while the
                 kernel is re-engineered to provide network and disk
                 protection without kernel mediation of every operation.
                 We describe the hardware and software changes needed to
                 take advantage of this new abstraction, and we
                 illustrate its power by showing improvements of 2 to 5
                  $ \times $ in latency and 9 $ \times $ in throughput for a
                 popular persistent NoSQL store relative to a well-tuned
                 Linux implementation.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Kumar:2016:ASC,
  author =       "Rakesh Kumar and Alejandro Mart{\'\i}nez and Antonio
                 Gonz{\'a}lez",
  title =        "Assisting Static Compiler Vectorization with a
                 Speculative Dynamic Vectorizer in an {HW\slash SW}
                 Codesigned Environment",
  journal =      j-TOCS,
  volume =       "33",
  number =       "4",
  pages =        "12:1--12:??",
  month =        jan,
  year =         "2016",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2807694",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 6 06:45:30 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Compiler-based static vectorization is used widely to
                 extract data-level parallelism from
                 computation-intensive applications. Static
                 vectorization is very effective in vectorizing
                 traditional array-based applications. However,
                 compilers' inability to do accurate interprocedural
                 pointer disambiguation and interprocedural array
                 dependence analysis severely limits vectorization
                 opportunities. HW/SW codesigned processors provide an
                 excellent opportunity to optimize the applications at
                 runtime. The availability of dynamic application
                 behavior at runtime helps in capturing vectorization
                 opportunities generally missed by the compilers. This
                 article proposes to complement the static vectorization
                 with a speculative dynamic vectorizer in an HW/SW
                 codesigned processor. We present a speculative dynamic
                 vectorization algorithm that speculatively reorders
                 ambiguous memory references to uncover vectorization
                 opportunities. The speculative reordering of memory
                 instructions avoids the need for accurate
                 interprocedural pointer disambiguation and
                 interprocedural array dependence analysis. The hardware
                 checks for any memory dependence violation due to
                 speculative vectorization and takes corrective action
                 in case of violation. Our experiments show that the
                 combined (static + dynamic) vectorization approach
                 provides a $ 2 \times $ performance benefit compared to
                 the static GCC vectorization alone, for SPECFP2006.
                 Furthermore, the speculative dynamic vectorizer is able
                 to vectorize 48\% of the loops that ICC failed to
                 vectorize due to conservative dependence analysis in
                 the TSVC benchmark suite. Moreover, the dynamic
                 vectorization scheme is as effective in vectorization
                 of pointer-based applications as for the array-based
                 ones, whereas compilers lose significant vectorization
                 opportunities in pointer-based applications.
                 Furthermore, we show that speculation is not only a
                 luxury but also a necessity for runtime
                 vectorization.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Lozi:2016:FPL,
  author =       "Jean-Pierre Lozi and Florian David and Ga{\"e}l Thomas
                 and Julia Lawall and Gilles Muller",
  title =        "Fast and Portable Locking for Multicore
                 Architectures",
  journal =      j-TOCS,
  volume =       "33",
  number =       "4",
  pages =        "13:1--13:??",
  month =        jan,
  year =         "2016",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2845079",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 6 06:45:30 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "The scalability of multithreaded applications on
                 current multicore systems is hampered by the
                 performance of lock algorithms, due to the costs of
                 access contention and cache misses. The main
                 contribution presented in this article is a new locking
                 technique, Remote Core Locking (RCL), that aims to
                 accelerate the execution of critical sections in legacy
                 applications on multicore architectures. The idea of
                 RCL is to replace lock acquisitions by optimized remote
                 procedure calls to a dedicated server hardware thread.
                 RCL limits the performance collapse observed with other
                 lock algorithms when many threads try to acquire a lock
                 concurrently and removes the need to transfer
                 lock-protected shared data to the hardware thread
                 acquiring the lock, because such data can typically
                 remain in the server's cache. Other contributions
                 presented in this article include a profiler that
                 identifies the locks that are the bottlenecks in
                 multithreaded applications and that can thus benefit
                 from RCL, and a reengineering tool that transforms
                 POSIX lock acquisitions into RCL locks. Eighteen
                 applications were used to evaluate RCL: the nine
                 applications of the SPLASH-2 benchmark suite, the seven
                 applications of the Phoenix 2 benchmark suite,
                 Memcached, and Berkeley DB with a TPC-C client. Eight
                 of these applications are unable to scale because of
                 locks and benefit from RCL on an x86 machine with four
                 AMD Opteron processors and 48 hardware threads. By
                 using RCL instead of Linux POSIX locks, performance is
                 improved by up to 2.5 times on Memcached, and up to
                 11.6 times on Berkeley DB with the TPC-C client. On a
                 SPARC machine with two Sun UltraSPARC T2+ processors
                 and 128 hardware threads, three applications benefit
                 from RCL. In particular, performance is improved by up
                 to 1.3 times with respect to Solaris POSIX locks on
                 Memcached, and up to 7.9 times on Berkeley DB with the
                 TPC-C client.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Heiser:2016:LML,
  author =       "Gernot Heiser and Kevin Elphinstone",
  title =        "{L4} Microkernels: The Lessons from 20 Years of
                 Research and Deployment",
  journal =      j-TOCS,
  volume =       "34",
  number =       "1",
  pages =        "1:1--1:29",
  month =        apr,
  year =         "2016",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2893177",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat May 21 08:09:53 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "The L4 microkernel has undergone 20 years of use and
                 evolution. It has an active user and developer
                 community, and there are commercial versions that are
                 deployed on a large scale and in safety-critical
                 systems. In this article we examine the lessons learnt
                 in those 20 years about microkernel design and
                 implementation. We revisit the L4 design articles and
                 examine the evolution of design and implementation from
                 the original L4 to the latest generation of L4 kernels.
                 We specifically look at seL4, which has pushed the L4
                 model furthest and was the first OS kernel to undergo a
                 complete formal verification of its implementation as
                 well as a sound analysis of worst-case execution times.
                 We demonstrate that while much has changed, the
                 fundamental principles of minimality, generality, and
                 high inter-process communication (IPC) performance
                 remain the main drivers of design and implementation
                 decisions.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Hauswald:2016:DFW,
  author =       "Johann Hauswald and Michael A. Laurenzano and Yunqi
                 Zhang and Hailong Yang and Yiping Kang and Cheng Li and
                 Austin Rovinski and Arjun Khurana and Ronald G.
                 Dreslinski and Trevor Mudge and Vinicius Petrucci and
                 Lingjia Tang and Jason Mars",
  title =        "Designing Future Warehouse-Scale Computers for
                 {Sirius}, an End-to-End Voice and Vision Personal
                 Assistant",
  journal =      j-TOCS,
  volume =       "34",
  number =       "1",
  pages =        "2:1--2:??",
  month =        apr,
  year =         "2016",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2870631",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat May 21 08:09:53 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "As user demand scales for intelligent personal
                 assistants (IPAs) such as Apple's Siri, Google's Google
                 Now, and Microsoft's Cortana, we are approaching the
                 computational limits of current datacenter (DC)
                 architectures. It is an open question how future server
                 architectures should evolve to enable this emerging
                 class of applications, and the lack of an open-source
                 IPA workload is an obstacle in addressing this
                 question. In this article, we present the design of
                 Sirius, an open end-to-end IPA Web-service application
                 that accepts queries in the form of voice and images,
                 and responds with natural language. We then use this
                 workload to investigate the implications of four points
                 in the design space of future accelerator-based server
                 architectures spanning traditional CPUs, GPUs, manycore
                 throughput co-processors, and FPGAs. To investigate
                 future server designs for Sirius, we decompose Sirius
                 into a suite of eight benchmarks (Sirius Suite)
                 comprising the computationally intensive bottlenecks of
                 Sirius. We port Sirius Suite to a spectrum of
                 accelerator platforms and use the performance and power
                 trade-offs across these platforms to perform a total
                 cost of ownership (TCO) analysis of various server
                 design points. In our study, we find that accelerators
                 are critical for the future scalability of IPA
                 services. Our results show that GPU- and
                 FPGA-accelerated servers improve the query latency on
                 average by 8.5$ \times $ and 15$ \times $,
                 respectively. For a given throughput, GPU- and
                 FPGA-accelerated servers can reduce the TCO of DCs by
                 2.3$ \times $ and 1.3$ \times $, respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Badamo:2016:IPE,
  author =       "Michael Badamo and Jeff Casarona and Minshu Zhao and
                 Donald Yeung",
  title =        "Identifying Power-Efficient Multicore Cache
                 Hierarchies via Reuse Distance Analysis",
  journal =      j-TOCS,
  volume =       "34",
  number =       "1",
  pages =        "3:1--3:??",
  month =        apr,
  year =         "2016",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2851503",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat May 21 08:09:53 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "To enable performance improvements in a
                 power-efficient manner, computer architects have been
                 building CPUs that exploit greater amounts of
                 thread-level parallelism. A key consideration in such
                 CPUs is properly designing the on-chip cache hierarchy.
                 Unfortunately, this can be hard to do, especially for
                 CPUs with high core counts and large amounts of cache.
                 The enormous design space formed by the combinatorial
                 number of ways in which to organize the cache hierarchy
                 makes it difficult to identify power-efficient
                 configurations. Moreover, the problem is exacerbated by
                 the slow speed of architectural simulation, which is
                 the primary means for conducting such design space
                 studies. A powerful tool that can help architects
                 optimize CPU cache hierarchies is reuse distance (RD)
                 analysis. Recent work has extended uniprocessor RD
                  techniques (i.e., by introducing concurrent RD and
                  private-stack RD profiling) to enable analysis of
                 different types of caches in multicore CPUs. Once
                 acquired, parallel locality profiles can predict the
                 performance of numerous cache configurations,
                 permitting highly efficient design space exploration.
                 To date, existing work on multicore RD analysis has
                 focused on developing the profiling techniques and
                 assessing their accuracy. Unfortunately, there has been
                 no work on using RD analysis to optimize CPU
                 performance or power consumption. This article
                 investigates applying multicore RD analysis to identify
                 the most power efficient cache configurations for a
                 multicore CPU. First, we develop analytical models that
                 use the cache-miss counts from parallel locality
                 profiles to estimate CPU performance and power
                 consumption. Although future scalable CPUs will likely
                 employ multithreaded (and even out-of-order) cores, our
                 current study assumes single-threaded in-order cores to
                 simplify the models, allowing us to focus on the cache
                 hierarchy and our RD-based techniques. Second, to
                 demonstrate the utility of our techniques, we apply our
                 models to optimize a large-scale tiled CPU architecture
                 with a two-level cache hierarchy. We show that the most
                 power efficient configuration varies considerably
                 across different benchmarks, and that our locality
                 profiles provide deep insights into why certain
                 configurations are power efficient. We also show that
                 picking the best configuration can provide significant
                  gains, as there is a $ 2.01 \times $ power efficiency spread
                 across our tiled CPU design space. Finally, we validate
                 the accuracy of our techniques using detailed
                 simulation. Among several simulated configurations, our
                 techniques can usually pick the most power efficient
                 configuration, or one that is very close to the best.
                 In addition, across all simulated configurations, we
                 can predict power efficiency with 15.2\% error.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Perais:2016:ECS,
  author =       "Arthur Perais and Andr{\'e} Seznec",
  title =        "{EOLE}: Combining Static and Dynamic Scheduling
                 Through Value Prediction to Reduce Complexity and
                 Increase Performance",
  journal =      j-TOCS,
  volume =       "34",
  number =       "2",
  pages =        "4:1--4:??",
  month =        may,
  year =         "2016",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2870632",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat May 21 08:09:53 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Recent work in the field of value prediction (VP) has
                 shown that given an efficient confidence estimation
                 mechanism, prediction validation could be removed from
                 the out-of-order engine and delayed until commit time.
                 As a result, a simple recovery mechanism-pipeline
                 squashing-can be used, whereas the out-of-order engine
                 remains mostly unmodified. Yet, VP and validation at
                 commit time require additional ports on the physical
                 register file, potentially rendering the overall number
                 of ports unbearable. Fortunately, VP also implies that
                 many single-cycle ALU instructions have their operands
                 predicted in the front-end and can be executed
                 in-place, in-order. Similarly, the execution of
                 single-cycle instructions whose result has been
                 predicted can be delayed until commit time since
                 predictions are validated at commit time. Consequently,
                  a significant number of instructions (10\% to 70\% in
                  our experiments) can bypass the out-of-order engine,
                 allowing for a reduction of the issue width. This
                 reduction paves the way for a truly practical
                 implementation of VP. Furthermore, since VP in itself
                 usually increases performance, our resulting
                 {Early-Out-of-Order-Late} Execution architecture, EOLE,
                 is often more efficient than a baseline VP-augmented
                 6-issue superscalar while having a significantly
                 narrower 4-issue out-of-order engine.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Li:2016:FSA,
  author =       "Sheng Li and Hyeontaek Lim and Victor W. Lee and Jung
                 Ho Ahn and Anuj Kalia and Michael Kaminsky and David G.
                 Andersen and Seongil O. and Sukhan Lee and Pradeep
                 Dubey",
  title =        "Full-Stack Architecting to Achieve a
                 Billion-Requests-Per-Second Throughput on a Single
                 Key--Value Store Server Platform",
  journal =      j-TOCS,
  volume =       "34",
  number =       "2",
  pages =        "5:1--5:??",
  month =        may,
  year =         "2016",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2897393",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat May 21 08:09:53 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Distributed in-memory key-value stores (KVSs), such as
                 memcached, have become a critical data serving layer in
                 modern Internet-oriented data center infrastructure.
                 Their performance and efficiency directly affect the
                 QoS of web services and the efficiency of data centers.
                 Traditionally, these systems have had significant
                 overheads from inefficient network processing, OS
                 kernel involvement, and concurrency control. Two recent
                 research thrusts have focused on improving key-value
                 performance. Hardware-centric research has started to
                 explore specialized platforms including FPGAs for KVSs;
                 results demonstrated an order of magnitude increase in
                 throughput and energy efficiency over stock memcached.
                 Software-centric research revisited the KVS application
                 to address fundamental software bottlenecks and to
                 exploit the full potential of modern commodity
                 hardware; these efforts also showed orders of magnitude
                 improvement over stock memcached. We aim at
                 architecting high-performance and efficient KVS
                 platforms, and start with a rigorous architectural
                 characterization across system stacks over a collection
                 of representative KVS implementations. Our detailed
                 full-system characterization not only identifies the
                 critical hardware/software ingredients for
                 high-performance KVS systems but also leads to guided
                 optimizations atop a recent design to achieve a
                 record-setting throughput of 120 million requests per
                  second (MRPS) (167 MRPS with client-side batching) on a
                 single commodity server. Our system delivers the best
                 performance and energy efficiency (RPS/watt)
                 demonstrated to date over existing KVSs including the
                 best-published FPGA-based and GPU-based claims. We
                 craft a set of design principles for future platform
                 architectures, and via detailed simulations demonstrate
                 the capability of achieving a billion RPS with a single
                 server constructed following our principles.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Lo:2016:IRE,
  author =       "David Lo and Liqun Cheng and Rama Govindaraju and
                 Parthasarathy Ranganathan and Christos Kozyrakis",
  title =        "Improving Resource Efficiency at Scale with
                 {Heracles}",
  journal =      j-TOCS,
  volume =       "34",
  number =       "2",
  pages =        "6:1--6:??",
  month =        may,
  year =         "2016",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2882783",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat May 21 08:09:53 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "User-facing, latency-sensitive services, such as
                 websearch, underutilize their computing resources
                 during daily periods of low traffic. Reusing those
                 resources for other tasks is rarely done in production
                 services since the contention for shared resources can
                 cause latency spikes that violate the service-level
                 objectives of latency-sensitive tasks. The resulting
                 under-utilization hurts both the affordability and
                 energy efficiency of large-scale datacenters. With the
                 slowdown in technology scaling caused by the sunsetting
                 of Moore's law, it becomes important to address this
                 opportunity. We present Heracles, a feedback-based
                 controller that enables the safe colocation of
                 best-effort tasks alongside a latency-critical service.
                 Heracles dynamically manages multiple hardware and
                 software isolation mechanisms, such as CPU, memory, and
                 network isolation, to ensure that the latency-sensitive
                 job meets latency targets while maximizing the
                 resources given to best-effort tasks. We evaluate
                 Heracles using production latency-critical and batch
                 workloads from Google and demonstrate average server
                 utilizations of 90\% without latency violations across
                 all the load and colocation scenarios that we
                 evaluated.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Jun:2016:BDF,
  author =       "Sang-Woo Jun and Ming Liu and Sungjin Lee and Jamey
                 Hicks and John Ankcorn and Myron King and Shuotao Xu
                 and Arvind",
  title =        "{BlueDBM}: Distributed Flash Storage for Big Data
                 Analytics",
  journal =      j-TOCS,
  volume =       "34",
  number =       "3",
  pages =        "7:1--7:??",
  month =        sep,
  year =         "2016",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2898996",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Sep 17 16:09:15 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Complex data queries, because of their need for random
                 accesses, have proven to be slow unless all the data
                 can be accommodated in DRAM. There are many domains,
                 such as genomics, geological data, and daily Twitter
                 feeds, where the datasets of interest are 5TB to 20TB.
                 For such a dataset, one would need a cluster with 100
                 servers, each with 128GB to 256GB of DRAM, to
                 accommodate all the data in DRAM. On the other hand,
                 such datasets could be stored easily in the flash
                 memory of a rack-sized cluster. Flash storage has much
                 better random access performance than hard disks, which
                 makes it desirable for analytics workloads. However,
                 currently available off-the-shelf flash storage
                 packaged as SSDs does not make effective use of flash
                 storage because it incurs a great amount of additional
                 overhead during flash device management and network
                 access. In this article, we present BlueDBM, a new
                 system architecture that has flash-based storage with
                 in-store processing capability and a low-latency
                 high-throughput intercontroller network between storage
                 devices. We show that BlueDBM outperforms a flash-based
                 system without these features by a factor of 10 for
                 some important applications. While the performance of a
                 DRAM-centric system falls sharply even if only 5\% to
                 10\% of the references are to secondary storage, this
                 sharp performance degradation is not an issue in
                 BlueDBM. BlueDBM presents an attractive point in the
                 cost/performance tradeoff for Big Data analytics.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{West:2016:VSK,
  author =       "Richard West and Ye Li and Eric Missimer and Matthew
                 Danish",
  title =        "A Virtualized Separation Kernel for Mixed-Criticality
                 Systems",
  journal =      j-TOCS,
  volume =       "34",
  number =       "3",
  pages =        "8:1--8:??",
  month =        sep,
  year =         "2016",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2935748",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Sep 17 16:09:15 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Multi- and many-core processors are becoming
                 increasingly popular in embedded systems. Many of these
                 processors now feature hardware virtualization
                 capabilities, as found on the ARM Cortex A15 and x86
                 architectures with Intel VT-x or AMD-V support.
                 Hardware virtualization provides a way to partition
                 physical resources, including processor cores, memory,
                 and I/O devices, among guest virtual machines (VMs).
                 Each VM is then able to host tasks of a specific
                 criticality level, as part of a mixed-criticality
                 system with different timing and safety requirements.
                 However, traditional virtual machine systems are
                 inappropriate for mixed-criticality computing. They use
                 hypervisors to schedule separate VMs on physical
                 processor cores. The costs of trapping into hypervisors
                 to multiplex and manage machine physical resources on
                 behalf of separate guests are too expensive for many
                 time-critical tasks. Additionally, traditional
                 hypervisors have memory footprints that are often too
                 large for many embedded computing systems. In this
                 article, we discuss the design of the Quest-V
                 separation kernel, which partitions services of
                 different criticality levels across separate VMs, or
                 sandboxes. Each sandbox encapsulates a subset of
                 machine physical resources that it manages without
                 requiring intervention from a hypervisor. In Quest-V, a
                 hypervisor is only needed to bootstrap the system,
                 recover from certain faults, and establish
                 communication channels between sandboxes. This not only
                 reduces the memory footprint of the most privileged
                 protection domain but also removes it from the control
                 path during normal system operation, thereby
                 heightening security.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Silberstein:2016:GNA,
  author =       "Mark Silberstein and Sangman Kim and Seonggu Huh and
                 Xinya Zhang and Yige Hu and Amir Wated and Emmett
                 Witchel",
  title =        "{GPUnet}: Networking Abstractions for {GPU} Programs",
  journal =      j-TOCS,
  volume =       "34",
  number =       "3",
  pages =        "9:1--9:??",
  month =        sep,
  year =         "2016",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2963098",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Sep 17 16:09:15 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Despite the popularity of GPUs in high-performance and
                 scientific computing, and despite increasingly
                 general-purpose hardware capabilities, the use of GPUs
                 in network servers or distributed systems poses
                 significant challenges. GPUnet is a native GPU
                 networking layer that provides a socket abstraction and
                 high-level networking APIs for GPU programs. We use
                 GPUnet to streamline the development of
                 high-performance, distributed applications like
                 in-GPU-memory MapReduce and a new class of low-latency,
                 high-throughput GPU-native network services such as a
                 face verification server.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Zheng:2017:RAS,
  author =       "Mai Zheng and Joseph Tucek and Feng Qin and Mark
                 Lillibridge and Bill W. Zhao and Elizabeth S. Yang",
  title =        "Reliability Analysis of {SSDs} Under Power Fault",
  journal =      j-TOCS,
  volume =       "34",
  number =       "4",
  pages =        "10:1--10:??",
  month =        jan,
  year =         "2017",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2992782",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jul 24 09:40:46 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Modern storage technology (solid-state disks (SSDs),
                 NoSQL databases, commoditized RAID hardware, etc.)
                 brings new reliability challenges to the
                 already-complicated storage stack. Among other things,
                 the behavior of these new components during power
                 faults, which happen relatively frequently in data
                 centers, is an important yet mostly ignored issue in
                 this dependability-critical area. Understanding how new
                 storage components behave under power fault is the
                 first step towards designing new robust storage
                 systems. In this article, we propose a new methodology
                 to expose reliability issues in block devices under
                 power faults. Our framework includes specially designed
                 hardware to inject power faults directly to devices,
                 workloads to stress storage components, and techniques
                 to detect various types of failures. Applying our
                 testing framework, we test 17 commodity SSDs from six
                 different vendors using more than three thousand fault
                 injection cycles in total. Our experimental results
                 reveal that 14 of the 17 tested SSD devices exhibit
                 surprising failure behaviors under power faults,
                 including bit corruption, shorn writes, unserializable
                 writes, metadata corruption, and total device
                 failure.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Belay:2017:IOS,
  author =       "Adam Belay and George Prekas and Mia Primorac and Ana
                 Klimovic and Samuel Grossman and Christos Kozyrakis and
                 Edouard Bugnion",
  title =        "The {IX} Operating System: Combining Low Latency, High
                 Throughput, and Efficiency in a Protected Dataplane",
  journal =      j-TOCS,
  volume =       "34",
  number =       "4",
  pages =        "11:1--11:??",
  month =        jan,
  year =         "2017",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/2997641",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jul 24 09:40:46 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  note =         "See correction \cite{Belay:2017:CIO}.",
  abstract =     "The conventional wisdom is that aggressive networking
                 requirements, such as high packet rates for small
                 messages and $ \mu $ s-scale tail latency, are best
                 addressed outside the kernel, in a user-level
                 networking stack. We present ix, a dataplane operating
                 system that provides high I/O performance and high
                 resource efficiency while maintaining the protection
                 and isolation benefits of existing kernels. ix uses
                 hardware virtualization to separate management and
                 scheduling functions of the kernel (control plane) from
                 network processing (dataplane). The dataplane
                 architecture builds upon a native, zero-copy API and
                 optimizes for both bandwidth and latency by dedicating
                 hardware threads and networking queues to dataplane
                 instances, processing bounded batches of packets to
                 completion, and eliminating coherence traffic and
                 multicore synchronization. The control plane
                 dynamically adjusts core allocations and
                 voltage/frequency settings to meet service-level
                 objectives. We demonstrate that ix outperforms Linux
                 and a user-space network stack significantly in both
                 throughput and end-to-end latency. Moreover, ix
                 improves the throughput of a widely deployed key-value
                 store by up to $ 6.4 \times $ and reduces tail latency
                 by more than $ 2 \times $. With three varying load
                 patterns, the control plane saves 46\%--54\% of
                 processor energy, and it allows background jobs to run
                 at 35\%--47\% of their standalone throughput.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Zahedi:2017:CSA,
  author =       "Seyed Majid Zahedi and Songchun Fan and Matthew Faw
                 and Elijah Cole and Benjamin C. Lee",
  title =        "Computational Sprinting: Architecture, Dynamics, and
                 Strategies",
  journal =      j-TOCS,
  volume =       "34",
  number =       "4",
  pages =        "12:1--12:??",
  month =        jan,
  year =         "2017",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3014428",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jul 24 09:40:46 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Computational sprinting is a class of mechanisms that
                 boost performance but dissipate additional power. We
                 describe a sprinting architecture in which many,
                 independent chip multiprocessors share a power supply
                 and sprints are constrained by the chips' thermal
                 limits and the rack's power limits. Moreover, we
                 present the computational sprinting game, a multi-agent
                 perspective on managing sprints. Strategic agents
                 decide whether to sprint based on application phases
                 and system conditions. The game produces an equilibrium
                 that improves task throughput for data analytics
                 workloads by 4--6$ \times $ over prior greedy
                 heuristics and performs within 90\% of an upper bound
                 on throughput from a globally optimized policy.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Zhu:2017:OGP,
  author =       "Yuhao Zhu and Vijay Janapa Reddi",
  title =        "Optimizing General-Purpose {CPUs} for Energy-Efficient
                 Mobile {Web} Computing",
  journal =      j-TOCS,
  volume =       "35",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jul,
  year =         "2017",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3041024",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jul 24 09:40:47 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Mobile applications are increasingly being built using
                 web technologies as a common substrate to achieve
                 portability and to improve developer productivity.
                 Unfortunately, web applications often incur large
                 performance overhead, directly affecting the user
                 quality-of-service (QoS) experience. Traditional
                 techniques in improving mobile processor performance
                 have mostly been adopting desktop-like design
                 techniques such as increasing single-core
                 microarchitecture complexity and aggressively
                 integrating more cores. However, such a
                 desktop-oriented strategy is likely coming to an end
                 due to the stringent energy and thermal constraints
                 that mobile devices impose. Therefore, we must pivot
                 away from traditional mobile processor design
                 techniques in order to provide sustainable performance
                 improvement while maintaining energy efficiency. In
                 this article, we propose to combine hardware
                 customization and specialization techniques to improve
                 the performance and energy efficiency of mobile web
                 applications. We first perform design-space exploration
                 (DSE) and identify opportunities in customizing
                 existing general-purpose mobile processors, that is,
                 tuning microarchitecture parameters. The thorough DSE
                 also lets us discover sources of energy inefficiency in
                 customized general-purpose architectures. To mitigate
                 these inefficiencies, we propose, synthesize, and
                 evaluate two new domain-specific specializations,
                 called the Style Resolution Unit and the Browser Engine
                 Cache. Our optimizations boost performance and energy
                 efficiency at the same time while maintaining
                 general-purpose programmability. As emerging mobile
                 workloads increasingly rely on web technologies,
                 the type of optimizations we propose will become
                 important in the future and are likely to have a
                 long-lasting and widespread impact.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Hsu:2017:RLT,
  author =       "Chang-Hong Hsu and Yunqi Zhang and Michael A.
                 Laurenzano and David Meisner and Thomas Wenisch and
                 Ronald G. Dreslinski and Jason Mars and Lingjia Tang",
  title =        "Reining in Long Tails in Warehouse-Scale Computers
                 with Quick Voltage Boosting Using Adrenaline",
  journal =      j-TOCS,
  volume =       "35",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jul,
  year =         "2017",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3054742",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jul 24 09:40:47 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Reducing the long tail of the query latency
                 distribution in modern warehouse scale computers is
                 critical for improving performance and quality of
                 service (QoS) of workloads such as Web Search and
                 Memcached. Traditional turbo boost increases a
                 processor's voltage and frequency during a
                 coarse-grained sliding window, boosting all queries
                 that are processed during that window. However, the
                 inability of such a technique to pinpoint tail queries
                 for boosting limits its tail reduction benefit. In this
                 work, we propose Adrenaline, an approach to leverage
                 finer-granularity (tens of nanoseconds) voltage
                 boosting to effectively rein in the tail latency with
                 query-level precision. Two key insights underlie this
                 work. First, emerging finer granularity
                 voltage/frequency boosting is an enabling mechanism for
                 intelligent allocation of the power budget to precisely
                 boost only the queries that contribute to the tail
                 latency; second, per-query characteristics can be used
                 to design indicators for proactively pinpointing these
                 queries, triggering boosting accordingly. Based on
                 these insights, Adrenaline effectively pinpoints and
                 boosts queries that are likely to increase the tail
                 distribution and can reap more benefit from the
                 voltage/frequency boost. By evaluating under various
                 workload configurations, we demonstrate the
                 effectiveness of our methodology. We achieve up to a
                 2.50 $ \times $ tail latency improvement for Memcached
                 and up to a 3.03 $ \times $ for Web Search over
                 coarse-grained dynamic voltage and frequency scaling
                 (DVFS) given a fixed boosting power budget. When
                 optimizing for energy reduction, Adrenaline achieves up
                 to a 1.81 $ \times $ improvement for Memcached and up
                 to a 1.99 $ \times $ for Web Search over coarse-grained
                 DVFS. By using the carefully chosen boost thresholds,
                 Adrenaline further improves the tail latency reduction
                 to 4.82 $ \times $ over coarse-grained DVFS.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Chen:2017:FMT,
  author =       "Haibo Chen and Rong Chen and Xingda Wei and Jiaxin Shi
                 and Yanzhe Chen and Zhaoguo Wang and Binyu Zang and
                 Haibing Guan",
  title =        "Fast In-Memory Transaction Processing Using {RDMA} and
                 {HTM}",
  journal =      j-TOCS,
  volume =       "35",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jul,
  year =         "2017",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3092701",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Jul 24 09:40:47 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "DrTM is a fast in-memory transaction processing system
                 that exploits advanced hardware features such as remote
                 direct memory access (RDMA) and hardware transactional
                 memory (HTM). To achieve high efficiency, it mostly
                 offloads concurrency control such as tracking
                 read/write accesses and conflict detection into HTM in
                 a local machine and leverages the strong consistency
                 between RDMA and HTM to ensure serializability among
                 concurrent transactions across machines. To mitigate
                 the high probability of HTM aborts for large
                 transactions, we design and implement an optimized
                 transaction chopping algorithm to decompose a set of
                 large transactions into smaller pieces such that HTM is
                 only required to protect each piece. We further build
                 an efficient hash table for DrTM by leveraging HTM and
                 RDMA to simplify the design and notably improve the
                 performance. We describe how DrTM supports common
                 database features like read-only transactions and
                 logging for durability. Evaluation using typical OLTP
                 workloads including TPC-C and SmallBank shows that DrTM
                 has better single-node efficiency and scales well on a
                 six-node cluster; it achieves greater than 1.51 and 34
                 million transactions per second for TPC-C, and 5.24 and
                 138 million for SmallBank, on a single node and the
                 cluster, respectively. Such numbers outperform a
                 state-of-the-art single-node system (i.e., Silo) and a
                 distributed transaction system (i.e., Calvin) by at
                 least $ 1.9 \times $ and $ 29.6 \times $ for TPC-C.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Zhao:2017:UMR,
  author =       "Minshu Zhao and Donald Yeung",
  title =        "Using Multicore Reuse Distance to Study Coherence
                 Directories",
  journal =      j-TOCS,
  volume =       "35",
  number =       "2",
  pages =        "4:1--4:??",
  month =        oct,
  year =         "2017",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3092702",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Oct 10 17:48:24 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Researchers have proposed numerous techniques to
                 improve the scalability of coherence directories. The
                 effectiveness of these techniques not only depends on
                 application behavior, but also on the CPU's
                 configuration, for example, its core count and cache
                 size. As CPUs continue to scale, it is essential to
                 explore the directory's application and architecture
                 dependencies. However, this is challenging given the
                 slow speed of simulators. While it is common practice
                 to simulate different applications, previous research
                 on directory designs has explored only a few (and in
                 most cases, only one) CPU configurations, which can lead
                 to an incomplete and inaccurate view of the directory's
                 behavior. This article proposes to use multicore reuse
                 distance analysis to study coherence directories. We
                 develop a framework to extract the directory access
                 stream from parallel least recently used (LRU) stacks,
                 enabling rapid analysis of the directory's accesses and
                 contents across both core count and cache size scaling.
                 A key part of our framework is the notion of relative
                 reuse distance between sharers, which defines sharing
                 in a capacity-dependent fashion and facilitates our
                 analyses along the data cache size dimension. We
                 implement our framework in a profiler and then apply it
                 to gain insights into the impact of multicore CPU
                 scaling on directory behavior. Our profiling results
                 show that directory accesses reduce by 3.3$ \times $
                 when scaling the data cache size from 16KB to 1MB,
                 despite an increase in sharing-based directory
                 accesses. We also show that increased sharing caused by
                 data cache scaling allows the portion of on-chip memory
                 occupied by the directory to be reduced by 43.3\%,
                 compared to a reduction of only 2.6\% when scaling the
                 number of cores. And, we show certain directory entries
                 exhibit high temporal reuse. In addition to gaining
                 insights, we also validate our profile-based results,
                 and find they are within 2--10\% of cache simulations
                 on average, across different validation experiments.
                 Finally, we conduct four case studies that illustrate
                 our insights on existing directory techniques. In
                 particular, we demonstrate our directory occupancy
                 insights on a Cuckoo directory; we apply our sharing
                 insights to provide bounds on the size of Scalable
                 Coherence Directories (SCD) and Dual-Grain Directories
                 (DGD); and, we demonstrate our directory entry reuse
                 insights on a multilevel directory design.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Chun:2017:ARR,
  author =       "Byung-Gon Chun and Tyson Condie and Yingda Chen and
                 Brian Cho and Andrew Chung and Carlo Curino and Chris
                 Douglas and Matteo Interlandi and Beomyeol Jeon and Joo
                 Seong Jeong and Gyewon Lee and Yunseong Lee and Tony
                 Majestro and Dahlia Malkhi and Sergiy Matusevych and
                 Brandon Myers and Mariia Mykhailova and Shravan
                 Narayanamurthy and Joseph Noor and Raghu Ramakrishnan
                 and Sriram Rao and Russell Sears and Beysim Sezgin and
                 Taegeon Um and Julia Wang and Markus Weimer and
                 Youngseok Yang",
  title =        "{Apache REEF}: Retainable Evaluator Execution
                 Framework",
  journal =      j-TOCS,
  volume =       "35",
  number =       "2",
  pages =        "5:1--5:??",
  month =        oct,
  year =         "2017",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3132037",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Oct 10 17:48:24 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Resource Managers like YARN and Mesos have emerged as
                 a critical layer in the cloud computing system stack,
                 but the developer abstractions for leasing cluster
                 resources and instantiating application logic are very
                 low level. This flexibility comes at a high cost in
                 terms of developer effort, as each application must
                 repeatedly tackle the same challenges (e.g., fault
                 tolerance, task scheduling and coordination) and
                 reimplement common mechanisms (e.g., caching, bulk-data
                 transfers). This article presents REEF, a development
                 framework that provides a control plane for scheduling
                 and coordinating task-level (data-plane) work on
                 cluster resources obtained from a Resource Manager.
                 REEF provides mechanisms that facilitate resource reuse
                 for data caching and state management abstractions that
                 greatly ease the development of elastic data processing
                 pipelines on cloud platforms that support a Resource
                 Manager service. We illustrate the power of REEF by
                 showing applications built atop it: a distributed shell
                 application, a machine-learning framework, a
                 distributed in-memory caching system, and a port of the
                 CORFU system. REEF is currently an Apache top-level
                 project that has attracted contributors from several
                 institutions and it is being used to develop several
                 commercial offerings such as the Azure Stream Analytics
                 service.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Shen:2017:SLC,
  author =       "Zhiming Shen and Qin Jia and Gur-Eyal Sela and Weijia
                 Song and Hakim Weatherspoon and Robbert {Van Renesse}",
  title =        "{Supercloud}: a Library Cloud for Exploiting Cloud
                 Diversity",
  journal =      j-TOCS,
  volume =       "35",
  number =       "2",
  pages =        "6:1--6:??",
  month =        oct,
  year =         "2017",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3132038",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Oct 10 17:48:24 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Infrastructure-as-a-Service (IaaS) cloud providers
                 hide available interfaces for virtual machine (VM)
                 placement and migration, CPU capping, memory
                 ballooning, page sharing, and I/O throttling, limiting
                 the ways in which applications can optimally configure
                 resources or respond to dynamically shifting workloads.
                 Were these interfaces exposed, applications could migrate VMs
                 in response to diurnal workloads or changing prices,
                 adjust resources in response to load changes, and so
                 on. This article proposes a new abstraction that we
                 call a Library Cloud and that allows users to customize
                 the diverse available cloud resources to best serve
                 their applications. We built a prototype of a Library
                 Cloud that we call the Supercloud. The Supercloud
                 encapsulates applications in a virtual cloud under
                 users' full control and can incorporate one or more
                 availability zones within a cloud provider or across
                 different providers. The Supercloud provides virtual
                 machine, storage, and networking complete with a full
                 set of management operations, allowing applications to
                 optimize performance. In this article, we demonstrate
                 various innovations enabled by the Library Cloud.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Diegues:2017:SPS,
  author =       "Nuno Diegues and Paolo Romano and Stoyan Garbatov",
  title =        "{Seer}: Probabilistic Scheduling for Hardware
                 Transactional Memory",
  journal =      j-TOCS,
  volume =       "35",
  number =       "3",
  pages =        "7:1--7:41",
  month =        dec,
  year =         "2017",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3132036",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Dec 27 09:34:24 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "The ubiquity of multicore processors has led
                 programmers to write parallel and concurrent
                 applications to take advantage of the underlying
                 hardware and speed up their executions. In this
                 context, Transactional Memory (TM) has emerged as a
                 simple and effective synchronization paradigm, via the
                 familiar abstraction of atomic transactions. After many
                 years of intense research, major processor
                 manufacturers (including Intel) have recently released
                 mainstream processors with hardware support for TM
                 (HTM). In this work, we study a relevant issue with
                 great impact on the performance of HTM. Due to the
                 optimistic and inherently limited nature of HTM,
                 transactions may have to be aborted and restarted
                 numerous times, without any progress guarantee. As a
                 result, it is up to the software library that regulates
                 the HTM usage to ensure progress and optimize
                 performance. Transaction scheduling is probably one of
                 the most well-studied and effective techniques to
                 achieve these goals. However, these recent mainstream
                 HTMs have some technical limitations that prevent the
                 adoption of known scheduling techniques: unlike
                 software implementations of TM used in the past,
                 existing HTMs provide limited or no information on
                 which memory regions or contending transactions caused
                 the abort. To address this crucial issue for HTMs, we
                 propose Seer, a software scheduler that addresses
                 precisely this restriction of HTM by leveraging an
                 online probabilistic inference technique that
                 identifies the most likely conflict relations and
                 establishes a dynamic locking scheme to serialize
                 transactions in a fine-grained manner. The key idea of
                 our solution is to constrain the portions of
                 parallelism that are negatively affecting the whole
                 system. As a result, this not only prevents performance
                 reduction but also in fact unveils further scalability
                 and performance for HTM. Via an extensive evaluation
                 study, we show that Seer improves the performance of
                 Intel's HTM by up to 3.6$ \times $, and by 65\% on
                 average across all concurrency degrees and benchmarks
                 on a large processor with 28 cores.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Nishtala:2017:HAI,
  author =       "Rajiv Nishtala and Paul Carpenter and Vinicius
                 Petrucci and Xavier Martorell",
  title =        "The {Hipster} Approach for Improving Cloud System
                 Efficiency",
  journal =      j-TOCS,
  volume =       "35",
  number =       "3",
  pages =        "8:1--8:28",
  month =        dec,
  year =         "2017",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3144168",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Dec 27 09:34:24 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "In 2013, U.S. data centers accounted for 2.2\% of the
                 country's total electricity consumption, a figure that
                 is projected to increase rapidly over the next decade.
                 Many important data center workloads in cloud computing
                 are interactive, and they demand strict levels of
                 quality-of-service (QoS) to meet user expectations,
                 making it challenging to optimize power consumption
                 along with increasing performance demands. This article
                 introduces Hipster, a technique that combines
                 heuristics and reinforcement learning to improve
                 resource efficiency in cloud systems. Hipster explores
                 heterogeneous multi-cores and dynamic voltage and
                 frequency scaling for reducing energy consumption while
                 managing the QoS of the latency-critical workloads. To
                 improve data center utilization and make the best use of
                 the available resources, Hipster can dynamically assign
                 remaining cores to batch workloads without violating
                 the QoS constraints for the latency-critical workloads.
                 We perform experiments using a 64-bit ARM big.LITTLE
                 platform and show that, compared to prior work, Hipster
                 improves the QoS guarantee for Web-Search from 80\% to
                 96\%, and for Memcached from 92\% to 99\%, while
                 reducing the energy consumption by up to 18\%. Hipster
                 is also effective in learning and adapting
                 automatically to specific requirements of new incoming
                 workloads just enough to meet the QoS and optimize
                 resource consumption.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Cherupalli:2017:DAS,
  author =       "Hari Cherupalli and Henry Duwe and Weidong Ye and
                 Rakesh Kumar and John Sartori",
  title =        "Determining Application-Specific Peak Power and Energy
                 Requirements for Ultra-Low-Power Processors",
  journal =      j-TOCS,
  volume =       "35",
  number =       "3",
  pages =        "9:1--9:33",
  month =        dec,
  year =         "2017",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3148052",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Dec 27 09:34:24 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  abstract =     "Many emerging applications such as the Internet of
                 Things, wearables, implantables, and sensor networks
                 are constrained by power and energy. These applications
                 rely on ultra-low-power processors that have rapidly
                 become the most abundant type of processor manufactured
                 today. In the ultra-low-power embedded systems used by
                 these applications, peak power and energy requirements
                 are the primary factors that determine critical system
                 characteristics, such as size, weight, cost, and
                 lifetime. While the power and energy requirements of
                 these systems tend to be application specific,
                 conventional techniques for rating peak power and
                 energy cannot accurately bound the power and energy
                 requirements of an application running on a processor,
                 leading to overprovisioning that increases system size
                 and weight. In this article, we present an automated
                 technique that performs hardware-software coanalysis of
                 the application and ultra-low-power processor in an
                 embedded system to determine application-specific peak
                 power and energy requirements. Our technique provides
                 more accurate, tighter bounds than conventional
                 techniques for determining peak power and energy
                 requirements. Also, unlike conventional approaches, our
                 technique reports guaranteed bounds on peak power and
                 energy independent of an application's input set.
                 Tighter bounds on peak power and energy can be
                 exploited to reduce system size, weight, and cost.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Belay:2017:CIO,
  author =       "Adam Belay and George Prekas and Mia Primorac and Ana
                 Klimovic and Samuel Grossman and Christos Kozyrakis and
                 Edouard Bugnion",
  title =        "Corrigendum to {``The IX Operating System: Combining
                 Low Latency, High Throughput and Efficiency in a
                 Protected Dataplane''}",
  journal =      j-TOCS,
  volume =       "35",
  number =       "3",
  pages =        "10:1--10:1",
  month =        dec,
  year =         "2017",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3154292",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Dec 29 17:57:41 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  note =         "See \cite{Belay:2017:IOS}.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Mace:2018:PTD,
  author =       "Jonathan Mace and Ryan Roelke and Rodrigo Fonseca",
  title =        "Pivot Tracing: Dynamic Causal Monitoring for
                 Distributed Systems",
  journal =      j-TOCS,
  volume =       "35",
  number =       "4",
  pages =        "11:1--11:??",
  month =        dec,
  year =         "2018",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3208104",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Sep 21 11:44:29 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3208104",
  abstract =     "Monitoring and troubleshooting distributed systems is
                 notoriously difficult; potential problems are complex,
                 varied, and unpredictable. The monitoring and diagnosis
                 tools commonly used today (logs, counters, and
                 metrics) have two important limitations: what gets
                 recorded is defined a priori, and the information is
                 recorded in a component- or machine-centric way, making
                 it extremely hard to correlate events that cross these
                 boundaries. This article presents Pivot Tracing, a
                 monitoring framework for distributed systems that
                 addresses both limitations by combining dynamic
                 instrumentation with a novel relational operator: the
                 happened-before join. Pivot Tracing gives users, at
                 runtime, the ability to define arbitrary metrics at one
                 point of the system, while being able to select,
                 filter, and group by events meaningful at other parts
                 of the system, even when crossing component or machine
                 boundaries. We have implemented a prototype of Pivot
                 Tracing for Java-based systems and evaluate it on a
                 heterogeneous Hadoop cluster comprising HDFS, HBase,
                 MapReduce, and YARN. We show that Pivot Tracing can
                 effectively identify a diverse range of root causes
                 such as software bugs, misconfiguration, and limping
                 hardware. We show that Pivot Tracing is dynamic,
                 extensible, and enables cross-tier analysis between
                 inter-operating applications, with low execution
                 overhead.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Zhang:2018:BCT,
  author =       "Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres
                 and Arvind Krishnamurthy and Dan R. K. Ports",
  title =        "Building Consistent Transactions with Inconsistent
                 Replication",
  journal =      j-TOCS,
  volume =       "35",
  number =       "4",
  pages =        "12:1--12:??",
  month =        dec,
  year =         "2018",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3269981",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Sep 21 11:44:29 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3269981",
  abstract =     "Application programmers increasingly prefer
                 distributed storage systems with strong consistency and
                 distributed transactions (e.g., Google's Spanner) for
                 their strong guarantees and ease of use. Unfortunately,
                 existing transactional storage systems are expensive to
                 use, in part because they require costly replication
                 protocols, like Paxos, for fault tolerance. In this
                 article, we present a new approach that makes
                 transactional storage systems more affordable: We
                 eliminate consistency from the replication protocol,
                 while still providing distributed transactions with
                 strong consistency to applications. We present the
                 Transactional Application Protocol for Inconsistent
                 Replication (TAPIR), the first transaction protocol to
                 use a novel replication protocol, called inconsistent
                 replication, that provides fault tolerance without
                 consistency. By enforcing strong consistency only in
                 the transaction protocol, TAPIR can commit transactions
                 in a single round-trip and order distributed
                 transactions without centralized coordination. We
                 demonstrate the use of TAPIR in a transactional
                 key-value store, TAPIR-KV. Compared to conventional
                 systems, TAPIR-KV provides better latency and better
                 throughput.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Hunt:2018:RDS,
  author =       "Tyler Hunt and Zhiting Zhu and Yuanzhong Xu and Simon
                 Peter and Emmett Witchel",
  title =        "{Ryoan}: a Distributed Sandbox for Untrusted
                 Computation on Secret Data",
  journal =      j-TOCS,
  volume =       "35",
  number =       "4",
  pages =        "13:1--13:??",
  month =        dec,
  year =         "2018",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3231594",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Sep 21 11:44:29 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3231594",
  abstract =     "Users of modern data-processing services such as tax
                 preparation or genomic screening are forced to trust
                 them with data that the users wish to keep secret.
                 Ryoan protects secret data while it is processed by
                 services that the data owner does not trust.
                 Accomplishing this goal in a distributed setting is
                 difficult, because the user has no control over the
                 service providers or the computational platform.
                 Confining code to prevent it from leaking secrets is
                 notoriously difficult, but Ryoan benefits from new
                 hardware and a request-oriented data model. Ryoan
                 provides a distributed sandbox, leveraging hardware
                 enclaves (e.g., Intel's software guard extensions
                 (SGX)) to protect sandbox instances from potentially
                 malicious computing platforms. The protected sandbox
                 instances confine untrusted data-processing modules to
                 prevent leakage of the user's input data. Ryoan is
                 designed for a request-oriented data model, where
                 confined modules only process input once and do not
                 persist state about the input. We present the design
                 and prototype implementation of Ryoan and evaluate it
                 on a series of challenging problems including email
                 filtering, health analysis, image processing and
                 machine translation.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Guerraoui:2019:LUA,
  author =       "Rachid Guerraoui and Hugo Guiroux and Renaud Lachaize
                 and Vivien Qu{\'e}ma and Vasileios Trigonakis",
  title =        "Lock--Unlock: Is That All? {A} Pragmatic Analysis of
                 Locking in Software Systems",
  journal =      j-TOCS,
  volume =       "36",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2019",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3301501",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Sep 21 11:44:29 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3301501",
  abstract =     "A plethora of optimized mutex lock algorithms have
                 been designed over the past 25 years to mitigate
                 performance bottlenecks related to critical sections
                 and locks. Unfortunately, there is currently no broad
                 study of the behavior of these optimized lock
                 algorithms on realistic applications that consider
                 different performance metrics, such as energy
                 efficiency and tail latency. In this article, we
                 perform a thorough and practical analysis of
                 synchronization, with the goal of providing software
                 developers with enough information to design fast,
                 scalable, and energy-efficient synchronization in their
                 systems. First, we perform a performance study of 28
                 state-of-the-art mutex lock algorithms, on 40
                 applications, on four different multicore machines. We
                 consider not only throughput (traditionally the main
                 performance metric) but also energy efficiency and tail
                 latency, which are becoming increasingly important.
                 Second, we present an in-depth analysis in which we
                 summarize our findings for all the studied
                 applications. In particular, we describe nine different
                 lock-related performance bottlenecks, and we propose
                 six guidelines helping software developers with their
                 choice of a lock algorithm according to the different
                 lock properties and the application characteristics.
                 From our detailed analysis, we make several
                 observations regarding locking algorithms and
                 application behaviors, several of which have not been
                 previously discovered: (i) applications stress not only
                 the lock-unlock interface but also the full locking API
                 (e.g., trylocks, condition variables); (ii) the memory
                 footprint of a lock can directly affect the application
                 performance; (iii) for many applications, the
                 interaction between locks and scheduling is an
                 important application performance factor; (iv) lock
                 tail latencies may or may not affect application tail
                 latency; (v) no single lock is systematically the best;
                 (vi) choosing the best lock is difficult; and (vii)
                 energy efficiency and throughput go hand in hand in the
                 context of lock algorithms. These findings highlight
                 that locking involves more considerations than the
                 simple lock/unlock interface and call for further
                 research on designing low-memory footprint adaptive
                 locks that fully and efficiently support the full lock
                 interface, and consider all performance metrics.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Zhao:2019:VER,
  author =       "Boyan Zhao and Rui Hou and Jianbo Dong and Michael
                 Huang and Sally A. Mckee and Qianlong Zhang and Yueji
                 Liu and Ye Li and Lixin Zhang and Dan Meng",
  title =        "{Venice}: an Effective Resource Sharing Architecture
                 for Data Center Servers",
  journal =      j-TOCS,
  volume =       "36",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2019",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3310360",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Sep 21 11:44:29 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3310360",
  abstract =     "Consolidated server racks are quickly becoming the
                 standard infrastructure for engineering, business,
                 medicine, and science. Such servers are still designed
                 much as they were when organized as individual,
                 distributed systems. Given that many fields rely
                 substantially on big-data analytics, the
                 cost-effectiveness and performance of these servers
                 should be improved, which can be achieved by flexibly
                 allowing resources to
                 be shared across nodes. Here we describe Venice, a
                 family of data-center server architectures that
                 includes a strong communication substrate as a
                 first-class resource. Venice supports a diverse set of
                 resource-joining mechanisms that enables applications
                 to leverage non-local resources efficiently. We have
                 constructed a hardware prototype to better understand
                 the implications of design decisions about system
                 support for resource sharing. We use it to measure the
                 performance of at-scale applications and to explore
                 performance, power, and resource-sharing transparency
                 tradeoffs (i.e., how many programming changes are
                 needed). We analyze these tradeoffs for sharing memory,
                 accelerators, and NICs. We find that reducing/hiding
                 latency is particularly important, that the chosen
                 communication channels should match the sharing access
                 patterns of the applications, and that performance can
                 be improved by exploiting inter-channel collaboration.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Shi:2019:DGC,
  author =       "Xuanhua Shi and Zhixiang Ke and Yongluan Zhou and Hai
                 Jin and Lu Lu and Xiong Zhang and Ligang He and Zhenyu
                 Hu and Fei Wang",
  title =        "{Deca}: a Garbage Collection Optimizer for In-Memory
                 Data Processing",
  journal =      j-TOCS,
  volume =       "36",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2019",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3310361",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Sep 21 11:44:29 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3310361",
  abstract =     "In-memory caching of intermediate data and active
                 combining of data in shuffle buffers have been shown to
                 be very effective in minimizing the recomputation and
                 I/O cost in big data processing systems such as Spark
                 and Flink. However, it has also been widely reported
                 that these techniques would create a large amount of
                 long-living data objects in the heap. These generated
                 objects may quickly saturate the garbage collector,
                 especially when handling a large dataset, and hence,
                 limit the scalability of the system. To eliminate this
                 problem, we propose a lifetime-based memory management
                 framework, which, by automatically analyzing the
                 user-defined functions and data types, obtains the
                 expected lifetime of the data objects and then
                 allocates and releases memory space accordingly to
                 minimize the garbage collection overhead. In
                  particular, we present Deca, a concrete
                 implementation of our proposal on top of Spark, which
                 transparently decomposes and groups objects with
                 similar lifetimes into byte arrays and releases their
                 space altogether when their lifetimes come to an end.
                 When systems are processing very large data, Deca also
                 provides field-oriented memory pages to ensure high
                 compression efficiency. Extensive experimental studies
                  using both synthetic and real datasets show that,
                  compared to Spark, Deca is able to (1) reduce the
                 garbage collection time by up to 99.9\%, (2) reduce the
                 memory consumption by up to 46.6\% and the storage
                 space by 23.4\%, (3) achieve 1.2$ \times $ to 22.7$
                 \times $ speedup in terms of execution time in cases
                 without data spilling and 16$ \times $ to 41.6$ \times
                 $ speedup in cases with data spilling, and (4) provide
                 similar performance compared to domain-specific
                 systems.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}
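
%%% The lifetime-based scheme described in the Deca abstract above can
%%% be pictured as a region (arena) allocator: objects with the same
%%% expected lifetime are packed into one byte array and their space is
%%% released in a single step rather than object by object.  The C
%%% sketch below only illustrates that idea under assumed names
%%% (region_t, region_alloc, region_release); it is not code from the
%%% paper.
%%%
%%%     #include <stdlib.h>
%%%
%%%     typedef struct { char *buf; size_t used, cap; } region_t;
%%%
%%%     /* Reserve one byte array for all objects of a lifetime group. */
%%%     static int region_init(region_t *r, size_t cap) {
%%%         r->buf = malloc(cap);
%%%         r->used = 0;
%%%         r->cap = r->buf ? cap : 0;
%%%         return r->buf ? 0 : -1;
%%%     }
%%%
%%%     /* Bump-allocate inside the region; no per-object free. */
%%%     static void *region_alloc(region_t *r, size_t n) {
%%%         if (r->used + n > r->cap) return NULL;
%%%         void *p = r->buf + r->used;
%%%         r->used += n;
%%%         return p;
%%%     }
%%%
%%%     /* Release every object in the lifetime group at once. */
%%%     static void region_release(region_t *r) {
%%%         free(r->buf);
%%%         r->buf = NULL;
%%%         r->used = r->cap = 0;
%%%     }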

@Article{Jha:2019:DFS,
  author =       "Sagar Jha and Jonathan Behrens and Theo Gkountouvas
                 and Matthew Milano and Weijia Song and Edward Tremel
                 and Robbert {Van Renesse} and Sydney Zink and Kenneth
                 P. Birman",
  title =        "{Derecho}: Fast State Machine Replication for Cloud
                 Services",
  journal =      j-TOCS,
  volume =       "36",
  number =       "2",
  pages =        "4:1--4:??",
  month =        apr,
  year =         "2019",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3302258",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Sep 21 11:44:30 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  note =         "See corrigendum \cite{Jha:2020:CDF}.",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3302258",
  abstract =     "Cloud computing services often replicate data and may
                 require ways to coordinate distributed actions. Here we
                 present Derecho, a library for such tasks. The API
                 provides interfaces for structuring applications into
                 patterns of subgroups and shards, supports state
                 machine replication within them, and includes
                 mechanisms that assist in restart after failures.
                 Running over 100Gbps RDMA, Derecho can send millions of
                 events per second in each subgroup or shard and
                 throughput peaks at 16GB/s, substantially outperforming
                 prior solutions. Configured to run purely on TCP,
                 Derecho is still substantially faster than comparable
                 widely used, highly-tuned, standard tools. The key
                 insight is that on modern hardware (including non-RDMA
                 networks), data-intensive protocols should be built
                 from non-blocking data-flow components.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Bergman:2019:SSO,
  author =       "Shai Bergman and Tanya Brokhman and Tzachi Cohen and
                 Mark Silberstein",
  title =        "{SPIN}: Seamless Operating System Integration of
                 Peer-to-Peer {DMA} Between {SSDs} and {GPUs}",
  journal =      j-TOCS,
  volume =       "36",
  number =       "2",
  pages =        "5:1--5:??",
  month =        apr,
  year =         "2019",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3309987",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Sep 21 11:44:30 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3309987",
  abstract =     "Recent GPUs enable Peer-to-Peer Direct Memory Access (
                 p2p) from fast peripheral devices like NVMe SSDs to
                 exclude the CPU from the data path between them for
                 efficiency. Unfortunately, using p2p to access files is
                 challenging because of the subtleties of low-level
                 non-standard interfaces, which bypass the OS file I/O
                 layers and may hurt system performance. Developers must
                 possess intimate knowledge of low-level interfaces to
                 manually handle the subtleties of data consistency and
                 misaligned accesses. We present SPIN, which integrates
                 p2p into the standard OS file I/O stack, dynamically
                 activating p2p where appropriate, transparently to the
                  user. It combines p2p with page cache accesses and
                  re-enables read-ahead for sequential reads, all while
                 maintaining standard POSIX FS consistency, portability
                 across GPUs and SSDs, and compatibility with virtual
                 block devices such as software RAID. We evaluate SPIN
                 on NVIDIA and AMD GPUs using standard file I/O
                 benchmarks, application traces, and end-to-end
                 experiments. SPIN achieves significant performance
                 speedups across a wide range of workloads, exceeding
                 p2p throughput by up to an order of magnitude. It also
                 boosts the performance of an aerial imagery rendering
                 application by 2.6$ \times $ by dynamically adapting to
                 its input-dependent file access pattern, enables 3.3$
                 \times $ higher throughput for a GPU-accelerated log
                 server, and enables 29\% faster execution for the
                 highly optimized GPU-accelerated image collage with
                 only 30 changed lines of code.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Novakovic:2019:MLI,
  author =       "Stanko Novakovic and Alexandros Daglis and Dmitrii
                 Ustiugov and Edouard Bugnion and Babak Falsafi and
                 Boris Grot",
  title =        "Mitigating Load Imbalance in Distributed Data Serving
                 with Rack-Scale Memory Pooling",
  journal =      j-TOCS,
  volume =       "36",
  number =       "2",
  pages =        "6:1--6:??",
  month =        apr,
  year =         "2019",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3309986",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Sep 21 11:44:30 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3309986",
  abstract =     "To provide low-latency and high-throughput guarantees,
                 most large key-value stores keep the data in the memory
                 of many servers. Despite the natural parallelism across
                 lookups, the load imbalance, introduced by heavy skew
                 in the popularity distribution of keys, limits
                 performance. To avoid violating tail latency
                 service-level objectives, systems tend to keep server
                 utilization low and organize the data in micro-shards,
                 which provides units of migration and replication for
                 the purpose of load balancing. These techniques reduce
                 the skew but incur additional monitoring, data
                 replication, and consistency maintenance overheads. In
                 this work, we introduce RackOut, a memory pooling
                 technique that leverages the one-sided remote read
                 primitive of emerging rack-scale systems to mitigate
                 load imbalance while respecting service-level
                 objectives. In RackOut, the data are aggregated at
                 rack-scale granularity, with all of the participating
                 servers in the rack jointly servicing all of the rack's
                 micro-shards. We develop a queuing model to evaluate
                 the impact of RackOut at the datacenter scale. In
                 addition, we implement a RackOut proof-of-concept
                 key-value store, evaluate it on two experimental
                 platforms based on RDMA and Scale-Out NUMA, and use
                 these results to validate the model. We devise two
                 distinct approaches to load balancing within a RackOut
                  unit, one based on random selection of nodes ---
                  RackOut\_static --- and another based on an adaptive
                  load balancing mechanism --- RackOut\_adaptive.
                 Our results show that RackOut\_static increases
                 throughput by up to 6$ \times $ for RDMA and 8.6$
                 \times $ for Scale-Out NUMA compared to a scale-out
                 deployment, while respecting tight tail latency
                  service-level objectives. RackOut\_adaptive improves
                  throughput by 30\% over RackOut\_static for workloads
                  with 20\% writes.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Iturbe:2019:ATC,
  author =       "Xabier Iturbe and Balaji Venu and Emre Ozer and
                 Jean-Luc Poupat and Gregoire Gimenez and Hans-Ulrich
                 Zurek",
  title =        "The {Arm Triple Core Lock-Step (TCLS)} Processor",
  journal =      j-TOCS,
  volume =       "36",
  number =       "3",
  pages =        "7:1--7:??",
  month =        aug,
  year =         "2019",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3323917",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Sep 21 11:44:30 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3323917",
  abstract =     "The Arm Triple Core Lock-Step (TCLS) architecture is
                 the natural evolution of Arm Cortex-R Dual Core
                 Lock-Step (DCLS) processors to increase dependability,
                 predictability, and availability in safety-critical and
                 ultra-reliable applications. TCLS is simple, scalable,
                 and easy to deploy in applications where Arm DCLS
                 processors are widely used (e.g., automotive), as well
                 as in new sectors where the presence of Arm technology
                 is incipient (e.g., enterprise) or almost non-existent
                 (e.g., space). Specifically in space, COTS Arm
                 processors provide optimal power-to-performance,
                 extensibility, evolvability, software availability, and
                  ease of use, especially in comparison with the
                  decades-old rad-hard computing solutions that are
                  still in use.
                 This article discusses the fundamentals of an Arm
                 Cortex-R5 based TCLS processor, providing key
                 functioning and implementation details. The article
                 shows that the TCLS architecture keeps the use of
                 rad-hard technology to a minimum, namely, using
                 rad-hard by design standard cell libraries only to
                 protect the critical parts that account for less than
                 4\% of the entire TCLS solution. Moreover, when
                 exposure to radiation is relatively low, such as in
                 terrestrial applications or even satellites operating
                 in Low Earth Orbits (LEO), the system could be
                 implemented entirely using commercial cell libraries,
                 relying on the radiation mitigation methods implemented
                 on the TCLS to cope with sporadic soft errors in its
                  critical parts. The TCLS solution thus makes it
                  possible to significantly reduce chip manufacturing
                  costs and keep pace with advances in low power
                  consumption and high
                 density integration by leveraging commercial
                 semiconductor processes, while matching the reliability
                 levels and improving availability that can be achieved
                 using extremely expensive rad-hard semiconductor
                 processes. Finally, the article describes a TRL4
                 proof-of-concept TCLS-based System-on-Chip (SoC) that
                 has been prototyped and tested to power the computer
                 on-board an Airbus Defence and Space telecom satellite.
                 When compared to the currently used processor solution
                 by Airbus, the TCLS-based SoC results in a more than 5$
                 \times $ performance increase and cuts power
                 consumption by more than half.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Ainsworth:2019:SPI,
  author =       "Sam Ainsworth and Timothy M. Jones",
  title =        "Software Prefetching for Indirect Memory Accesses: a
                 Microarchitectural Perspective",
  journal =      j-TOCS,
  volume =       "36",
  number =       "3",
  pages =        "8:1--8:??",
  month =        aug,
  year =         "2019",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3319393",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Sep 21 11:44:30 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3319393",
  abstract =     "Many modern data processing and HPC workloads are
                 heavily memory-latency bound. A tempting proposition to
                 solve this is software prefetching, where special
                 non-blocking loads are used to bring data into the
                 cache hierarchy just before being required. However,
                 these are difficult to insert to effectively improve
                 performance, and techniques for automatic insertion are
                 currently limited. This article develops a novel
                 compiler pass to automatically generate software
                 prefetches for indirect memory accesses, a special
                 class of irregular memory accesses often seen in
                 high-performance workloads. We evaluate this across a
                 wide set of systems, all of which gain benefit from the
                 technique. We then evaluate the extent to which good
                 prefetch instructions are architecture dependent and
                 the class of programs that are particularly amenable.
                 Across a set of memory-bound benchmarks, our automated
                 pass achieves average speedups of 1.3$ \times $ for an
                 Intel Haswell processor, 1.1$ \times $ for both an ARM
                 Cortex-A57 and Qualcomm Kryo, 1.2$ \times $ for a
                  Cortex-A72 and an Intel Kaby Lake, and 1.35$ \times $
                 for an Intel Xeon Phi Knight's Landing, each of which
                 is an out-of-order core, and performance improvements
                 of 2.1$ \times $ and 2.7$ \times $ for the in-order ARM
                 Cortex-A53 and first generation Intel Xeon Phi.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}
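
%%% The compiler pass described in the Ainsworth--Jones abstract above
%%% targets indirect accesses of the form a[b[i]].  The C sketch below
%%% shows the kind of prefetch such a pass would insert, using the
%%% GCC/Clang __builtin_prefetch intrinsic; the function name, array
%%% names, and the prefetch distance of 64 are illustrative assumptions,
%%% not values taken from the paper.
%%%
%%%     #include <stddef.h>
%%%
%%%     #define PREFETCH_DISTANCE 64  /* tuned per microarchitecture */
%%%
%%%     double sum_indirect(const double *a, const int *b, size_t n) {
%%%         double sum = 0.0;
%%%         for (size_t i = 0; i < n; i++) {
%%%             /* Fetch the indirectly addressed element a fixed
%%%                number of iterations ahead of its use. */
%%%             if (i + PREFETCH_DISTANCE < n)
%%%                 __builtin_prefetch(&a[b[i + PREFETCH_DISTANCE]]);
%%%             sum += a[b[i]];
%%%         }
%%%         return sum;
%%%     }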

@Article{Chen:2019:ISA,
  author =       "Yunji Chen and Huiying Lan and Zidong Du and Shaoli
                 Liu and Jinhua Tao and Dong Han and Tao Luo and Qi Guo
                 and Ling Li and Yuan Xie and Tianshi Chen",
  title =        "An Instruction Set Architecture for Machine Learning",
  journal =      j-TOCS,
  volume =       "36",
  number =       "3",
  pages =        "9:1--9:??",
  month =        aug,
  year =         "2019",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3331469",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Sep 21 11:44:30 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3331469",
  abstract =     "Machine Learning (ML) are a family of models for
                 learning from the data to improve performance on a
                 certain task. ML techniques, especially recent renewed
                 neural networks (deep neural networks), have proven to
                 be efficient for a broad range of applications. ML
                 techniques are conventionally executed on
                 general-purpose processors (such as CPU and GPGPU),
                 which usually are not energy efficient, since they
                 invest excessive hardware resources to flexibly support
                 various workloads. Consequently, application-specific
                 hardware accelerators have been proposed recently to
                 improve energy efficiency. However, such accelerators
                 were designed for a small set of ML techniques sharing
                 similar computational patterns, and they adopt complex
                 and informative instructions (control signals) directly
                 corresponding to high-level functional blocks of an ML
                 technique (such as layers in neural networks) or even
                  an ML technique as a whole. Although straightforward
                  and easy to
                 implement for a limited set of similar ML techniques,
                 the lack of agility in the instruction set prevents
                 such accelerator designs from supporting a variety of
                 different ML techniques with sufficient flexibility and
                 efficiency. In this article, we first propose a novel
                 domain-specific Instruction Set Architecture (ISA) for
                 NN accelerators, called Cambricon, which is a
                 load-store architecture that integrates scalar, vector,
                 matrix, logical, data transfer, and control
                 instructions, based on a comprehensive analysis of
                 existing NN techniques. We then extend the application
                 scope of Cambricon from NN to ML techniques. We also
                 propose an assembly language, an assembler, and runtime
                 to support programming with Cambricon, especially
                 targeting large-scale ML problems. Our evaluation over
                 a total of 16 representative yet distinct ML techniques
                  has demonstrated that Cambricon exhibits strong
                 descriptive capacity over a broad range of ML
                 techniques and provides higher code density than
                 general-purpose ISAs such as x86, MIPS, and GPGPU.
                 Compared to the latest state-of-the-art NN accelerator
                 design DaDianNao [7] (which can only accommodate three
                 types of NN techniques), our Cambricon-based
                 accelerator prototype implemented in TSMC 65nm
                 technology incurs only negligible latency/power/area
                 overheads, with a versatile coverage of 10 different NN
                 benchmarks and 7 other ML benchmarks. Compared to the
                 recent prevalent ML accelerator PuDianNao, our
                 Cambricon-based accelerator is able to support all the
                 ML techniques as well as the 10 NNs but with only
                  approximately 5.1\% performance loss.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J774",
}

@Article{Bai:2020:EDS,
  author =       "Jia-Ju Bai and Julia Lawall and Shi-Min Hu",
  title =        "Effective Detection of Sleep-in-atomic-context Bugs in
                 the {Linux} Kernel",
  journal =      j-TOCS,
  volume =       "36",
  number =       "4",
  pages =        "10:1--10:30",
  month =        jun,
  year =         "2020",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3381990",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Jun 12 07:20:51 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/linux.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib;
                 https://www.math.utah.edu/pub/tex/bib/unix.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3381990",
  abstract =     "Atomic context is an execution state of the Linux
                 kernel in which kernel code monopolizes a CPU core. In
                 this state, the Linux kernel may only perform
                 operations that cannot sleep, as otherwise a system
                 hang or crash may occur. We refer to this kind of
                 concurrency bug as a sleep-in-atomic-context (SAC) bug.
                 In practice, SAC bugs are hard to find, as they do not
                 cause problems in all executions.\par

                 In this article, we propose a practical static approach
                 named DSAC to effectively detect SAC bugs in the Linux
                 kernel. DSAC uses three key techniques: (1) a
                 summary-based analysis to identify the code that may be
                 executed in atomic context, (2) a connection-based
                 alias analysis to identify the set of functions
                 referenced by a function pointer, and (3) a path-check
                 method to filter out repeated reports and false bugs.
                 We evaluate DSAC on Linux 4.17 and find 1,159 SAC bugs.
                 We manually check all the bugs and find that 1,068 bugs
                 are real. We have randomly selected 300 of the real
                 bugs and sent them to kernel developers. 220 of these
                 bugs have been confirmed, and 51 of our patches fixing
                 115 bugs have been applied.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}
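
%%% The sleep-in-atomic-context (SAC) bug class that DSAC detects (see
%%% the abstract above) can be illustrated with a short kernel-code
%%% fragment: a function that may sleep, such as kmalloc() with
%%% GFP_KERNEL, is called while a spinlock is held.  The surrounding
%%% function names are hypothetical; the locking and allocation calls
%%% are standard Linux kernel APIs.
%%%
%%%     #include <linux/spinlock.h>
%%%     #include <linux/slab.h>
%%%
%%%     static DEFINE_SPINLOCK(dev_lock);
%%%
%%%     static void *buggy_alloc(size_t len)
%%%     {
%%%         void *p;
%%%
%%%         spin_lock(&dev_lock);              /* enter atomic context */
%%%         p = kmalloc(len, GFP_KERNEL);      /* BUG: may sleep here  */
%%%         spin_unlock(&dev_lock);
%%%         return p;
%%%     }
%%%
%%%     static void *fixed_alloc(size_t len)
%%%     {
%%%         void *p;
%%%
%%%         spin_lock(&dev_lock);
%%%         p = kmalloc(len, GFP_ATOMIC);      /* non-sleeping variant */
%%%         spin_unlock(&dev_lock);
%%%         return p;
%%%     }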

@Article{Malkhi:2020:ISI,
  author =       "Dahlia Malkhi and Dan Tsafrir",
  title =        "Introduction to the Special Issue on the Award Papers
                 of {USENIX ATC 2019}",
  journal =      j-TOCS,
  volume =       "36",
  number =       "4",
  pages =        "11:1--11:2",
  month =        jun,
  year =         "2020",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3395034",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Jun 12 07:20:51 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3395034",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Balmau:2020:SPL,
  author =       "Oana Balmau and Florin Dinu and Willy Zwaenepoel and
                 Karan Gupta and Ravishankar Chandhiramoorthi and Diego
                 Didona",
  title =        "{SILK+} Preventing Latency Spikes in Log-Structured
                 Merge Key--Value Stores Running Heterogeneous
                 Workloads",
  journal =      j-TOCS,
  volume =       "36",
  number =       "4",
  pages =        "12:1--12:27",
  month =        jun,
  year =         "2020",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3380905",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Jun 12 07:20:51 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3380905",
  abstract =     "Log-Structured Merge Key-Value stores (LSM KVs) are
                 designed to offer good write performance, by capturing
                 client writes in memory, and only later flushing them
                 to storage. Writes are later compacted into a tree-like
                 data structure on disk to improve \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Leesatapornwongsa:2020:TWT,
  author =       "Tanakorn Leesatapornwongsa and Aritra Sengupta and
                 Masoud Saeida Ardekani and Gustavo Petri and Cesar
                 A. Stuardo",
  title =        "Transactuations: Where Transactions Meet the Physical
                 World",
  journal =      j-TOCS,
  volume =       "36",
  number =       "4",
  pages =        "13:1--13:31",
  month =        jun,
  year =         "2020",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3380907",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Jun 12 07:20:51 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3380907",
  abstract =     "A large class of IoT applications read sensors,
                 execute application logic, and actuate actuators.
                 However, the lack of high-level programming
                 abstractions compromises correctness, especially in the
                 presence of failures and unwanted interleaving between
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Spink:2020:RSL,
  author =       "Tom Spink and Harry Wagstaff and Bj{\"o}rn Franke",
  title =        "A Retargetable System-level {DBT} Hypervisor",
  journal =      j-TOCS,
  volume =       "36",
  number =       "4",
  pages =        "14:1--14:24",
  month =        jun,
  year =         "2020",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3386161",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Jun 12 07:20:51 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3386161",
  abstract =     "System-level Dynamic Binary Translation (DBT) provides
                 the capability to boot an Operating System (OS) and
                 execute programs compiled for an Instruction Set
                 Architecture (ISA) different from that of the host
                 machine. Due to their performance-critical \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Jha:2020:CDF,
  author =       "Sagar Jha",
  title =        "Corrigendum to {``Derecho: Fast State Machine
                 Replication for Cloud Services,'' by Jha et al., ACM
                 Transactions on Computer Systems (TOCS) Volume {\bf
                 36}, Issue 2, Article No. 4}",
  journal =      j-TOCS,
  volume =       "36",
  number =       "4",
  pages =        "15:1--15:1",
  month =        jun,
  year =         "2020",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3395604",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Jun 12 07:20:51 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  note =         "See \cite{Jha:2019:DFS}.",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3395604",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Sadrosadati:2021:HCL,
  author =       "Mohammad Sadrosadati and Amirhossein Mirhosseini and
                 Ali Hajiabadi and Seyed Borna Ehsani and Hajar Falahati
                 and Hamid Sarbazi-Azad and Mario Drumond and Babak
                 Falsafi and Rachata Ausavarungnirun and Onur Mutlu",
  title =        "Highly Concurrent Latency-tolerant Register Files for
                 {GPUs}",
  journal =      j-TOCS,
  volume =       "37",
  number =       "1--4",
  pages =        "1:1--1:36",
  month =        mar,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3419973",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue May 25 09:04:45 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3419973",
  abstract =     "Graphics Processing Units (GPUs) employ large register
                 files to accommodate all active threads and accelerate
                 context switching. Unfortunately, register files are a
                 scalability bottleneck for future GPUs due to long
                 access latency, high power consumption, and large
                 silicon area provisioning. Prior work proposes
                  a hierarchical register file to reduce register file
                 power consumption by caching registers in a smaller
                 register file cache. Unfortunately, this approach does
                 not improve register access latency due to the low hit
                 rate in the register file cache.\par

                 In this article, we propose the Latency-Tolerant
                 Register File (LTRF) architecture to achieve low
                 latency in a two-level hierarchical structure while
                 keeping power consumption low. We observe that
                 compile-time interval analysis enables us to divide GPU
                 program execution into intervals with an accurate
                 estimate of a warp's aggregate register working-set
                 within each interval. The key idea of LTRF is to
                 prefetch the estimated register working-set from the
                 main register file to the register file cache under
                 software control, at the beginning of each interval,
                 and overlap the prefetch latency with the execution of
                 other warps. We observe that register bank conflicts
                 while prefetching the registers could greatly reduce
                 the effectiveness of LTRF. Therefore, we devise a
                 compile-time register renumbering technique to reduce
                 the likelihood of register bank conflicts. Our
                 experimental results show that LTRF enables
                 high-capacity yet long-latency main GPU register files,
                 paving the way for various optimizations. As an example
                 optimization, we implement the main register file with
                 emerging high-density high-latency memory technologies,
                 enabling $ 8 \times $ larger capacity and improving
                 overall GPU performance by 34\%.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Zhang:2021:KSV,
  author =       "Yiming Zhang and Chengfei Zhang and Yaozheng Wang and
                 Kai Yu and Guangtao Xue and Jon Crowcroft",
  title =        "{KylinX}: Simplified Virtualization Architecture for
                 Specialized Virtual Appliances with Strong Isolation",
  journal =      j-TOCS,
  volume =       "37",
  number =       "1--4",
  pages =        "2:1--2:27",
  month =        mar,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3436512",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue May 25 09:04:45 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3436512",
  abstract =     "Unikernel specializes a minimalistic LibOS and a
                 target application into a standalone single-purpose
                 virtual machine (VM) running on a hypervisor, which is
                  referred to as a (virtual) appliance. Compared to
                 traditional VMs, Unikernel appliances have smaller
                 memory footprint and lower overhead while guaranteeing
                 the same level of isolation. On the downside, Unikernel
                 strips off the process abstraction from its monolithic
                 appliance and thus sacrifices flexibility, efficiency,
                 and applicability.\par

                 In this article, we examine whether there is a balance
                 embracing the best of both Unikernel appliances (strong
                 isolation) and processes (high flexibility/efficiency).
                 We present KylinX, a dynamic library operating system
                 for simplified and efficient cloud virtualization by
                 providing the pVM (process-like VM) abstraction. A pVM
                 takes the hypervisor as an OS and the Unikernel
                  appliance as a process, allowing both page-level and
                  library-level dynamic mapping. At the page level,
                  KylinX supports pVM fork plus a set of APIs for
                 inter-pVM communication (IpC, which is compatible with
                 conventional UNIX IPC). At the library level, KylinX
                 supports shared libraries to be linked to a Unikernel
                 appliance at runtime. KylinX enforces mapping
                 restrictions against potential threats. We implement a
                 prototype of KylinX by modifying MiniOS and Xen tools.
                 Extensive experimental results show that KylinX
                 achieves similar performance both in micro benchmarks
                 (fork, IpC, library update, etc.) and in applications
                 (Redis, web server, and DNS server) compared to
                 conventional processes, while retaining the strong
                 isolation benefit of VMs/Unikernels.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Richins:2021:ATH,
  author =       "Daniel Richins and Dharmisha Doshi and Matthew
                 Blackmore and Aswathy Thulaseedharan Nair and Neha
                 Pathapati and Ankit Patel and Brainard Daguman and
                 Daniel Dobrijalowski and Ramesh Illikkal and Kevin Long
                 and David Zimmerman and Vijay Janapa Reddi",
  title =        "{AI} Tax: The Hidden Cost of {AI} Data Center
                 Applications",
  journal =      j-TOCS,
  volume =       "37",
  number =       "1--4",
  pages =        "3:1--3:32",
  month =        mar,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3440689",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue May 25 09:04:45 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3440689",
  abstract =     "Artificial intelligence and machine learning are
                 experiencing widespread adoption in industry and
                 academia. This has been driven by rapid advances in the
                 applications and accuracy of AI through increasingly
                 complex algorithms and models; this, in turn, has
                 spurred research into specialized hardware AI
                 accelerators. Given the rapid pace of advances, it is
                 easy to forget that they are often developed and
                 evaluated in a vacuum without considering the full
                 application environment. This article emphasizes the
                 need for a holistic, end-to-end analysis of artificial
                 intelligence (AI) workloads and reveals the ``AI tax.''
                 We deploy and characterize Face Recognition in an edge
                 data center. The application is an AI-centric edge
                 video analytics application built using popular open
                 source infrastructure and machine learning (ML) tools.
                 Despite using state-of-the-art AI and ML algorithms,
                 the application relies heavily on pre- and
                 post-processing code. As AI-centric applications
                 benefit from the acceleration promised by accelerators,
                 we find they impose stresses on the hardware and
                 software infrastructure: storage and network bandwidth
                 become major bottlenecks with increasing AI
                 acceleration. By specializing for AI applications, we
                 show that a purpose-built edge data center can be
                 designed for the stresses of accelerated AI at 15\%
                 lower TCO than one derived from homogeneous servers and
                 infrastructure.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Baskin:2021:UUN,
  author =       "Chaim Baskin and Natan Liss and Eli Schwartz and
                 Evgenii Zheltonozhskii and Raja Giryes and Alex M.
                 Bronstein and Avi Mendelson",
  title =        "{UNIQ}: Uniform Noise Injection for Non-Uniform
                 Quantization of Neural Networks",
  journal =      j-TOCS,
  volume =       "37",
  number =       "1--4",
  pages =        "4:1--4:15",
  month =        mar,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3444943",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue May 25 09:04:45 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3444943",
  abstract =     "We present a novel method for neural network
                 quantization. Our method, named UNIQ, emulates a
                 non-uniform $k$-quantile quantizer and adapts the model
                 to perform well with quantized weights by injecting
                  noise into the weights at training time. As a
                  by-product of injecting noise into the weights, we
                  find that activations
                 can also be quantized to as low as 8-bit with only a
                 minor accuracy degradation. Our non-uniform
                 quantization approach provides a novel alternative to
                 the existing uniform quantization techniques for neural
                 networks. We further propose a novel complexity metric
                 of number of bit operations performed (BOPs), and we
                 show that this metric has a linear relation with logic
                 utilization and power. We suggest evaluating the
                 trade-off of accuracy vs. complexity (BOPs). The
                 proposed method, when evaluated on ResNet18/34/50 and
                 MobileNet on ImageNet, outperforms the prior state of
                 the art both in the low-complexity regime and the high
                 accuracy regime. We demonstrate the practical
                 applicability of this approach, by implementing our
                 non-uniformly quantized CNN on FPGA.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Zhuo:2021:DGP,
  author =       "Youwei Zhuo and Jingji Chen and Gengyu Rao and Qinyi
                 Luo and Yanzhi Wang and Hailong Yang and Depei Qian and
                 Xuehai Qian",
  title =        "Distributed Graph Processing System and
                 Processing-in-memory Architecture with Precise
                 Loop-carried Dependency Guarantee",
  journal =      j-TOCS,
  volume =       "37",
  number =       "1--4",
  pages =        "5:1--5:37",
  month =        jun,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3453681",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Jul 2 08:25:18 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3453681",
  abstract =     "To hide the complexity of the underlying system, graph
                 processing frameworks ask programmers to specify graph
                 computations in user-defined functions (UDFs) of
                  a graph-oriented programming model. Due to the nature
                  of distributed execution, current \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Agate:2021:SSE,
  author =       "Vincenzo Agate and Alessandra {De Paola} and Giuseppe
                 {Lo Re} and Marco Morana",
  title =        "A Simulation Software for the Evaluation of
                 Vulnerabilities in Reputation Management Systems",
  journal =      j-TOCS,
  volume =       "37",
  number =       "1--4",
  pages =        "6:1--6:30",
  month =        jun,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3458510",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Fri Jul 2 08:25:18 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3458510",
  abstract =     "Multi-agent distributed systems are characterized by
                 autonomous entities that interact with each other to
                 provide, and/or request, different kinds of services.
                 In several contexts, especially when a reward is
                 offered according to the quality of service, \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Ruaro:2021:MDM,
  author =       "Marcelo Ruaro and Anderson Sant'ana and Axel Jantsch
                 and Fernando Gehm Moraes",
  title =        "Modular and Distributed Management of Many-Core
                 {SoCs}",
  journal =      j-TOCS,
  volume =       "38",
  number =       "1--2",
  pages =        "1:1--1:16",
  month =        jul,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3458511",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Aug 10 13:25:43 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3458511",
  abstract =     "Many-Core Systems-on-Chip increasingly require Dynamic
                 Multi-objective Management (DMOM) of resources. DMOM
                 uses different management components for objectives and
                 resources to implement comprehensive and self-adaptive
                 system resource management. DMOMs \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Markussen:2021:SZO,
  author =       "Jonas Markussen and Lars Bj{\o}rlykke Kristiansen and
                 P{\aa}l Halvorsen and Halvor Kielland-Gyrud and
                 H{\aa}kon Kvale Stensland and Carsten Griwodz",
  title =        "{SmartIO}: Zero-overhead Device Sharing through {PCIe}
                 Networking",
  journal =      j-TOCS,
  volume =       "38",
  number =       "1--2",
  pages =        "2:1--2:78",
  month =        jul,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3462545",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Aug 10 13:25:43 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3462545",
  abstract =     "The large variety of compute-heavy and data-driven
                 applications accelerate the need for a distributed I/O
                 solution that enables cost-effective scaling of
                 resources between networked hosts. For example, in a
                 cluster system, different machines may have \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Katsikas:2021:MHP,
  author =       "Georgios P. Katsikas and Tom Barbette and Dejan
                 Kosti{\'c} and Gerald Q. {Maguire, Jr.} and Rebecca
                 Steinert",
  title =        "{Metron}: High-performance {NFV} Service Chaining Even
                 in the Presence of Blackboxes",
  journal =      j-TOCS,
  volume =       "38",
  number =       "1--2",
  pages =        "3:1--3:45",
  month =        jul,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3465628",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Aug 10 13:25:43 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3465628",
  abstract =     "Deployment of 100Gigabit Ethernet (GbE) links
                 challenges the packet processing limits of commodity
                 hardware used for Network Functions Virtualization
                 (NFV). Moreover, realizing chained network functions
                 (i.e., service chains) necessitates the use of
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Zuo:2021:SIS,
  author =       "Zhiqiang Zuo and Kai Wang and Aftab Hussain and
                 Ardalan Amiri Sani and Yiyu Zhang and Shenming Lu and
                 Wensheng Dou and Linzhang Wang and Xuandong Li and
                 Chenxi Wang and Guoqing Harry Xu",
  title =        "Systemizing Interprocedural Static Analysis of
                 Large-scale Systems Code with {Graspan}",
  journal =      j-TOCS,
  volume =       "38",
  number =       "1--2",
  pages =        "4:1--4:39",
  month =        jul,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3466820",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Tue Aug 10 13:25:43 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3466820",
  abstract =     "There is more than a decade-long history of using
                 static analysis to find bugs in systems such as Linux.
                 Most of the existing static analyses developed for
                 these systems are simple checkers that find bugs based
                 on pattern matching. Despite the presence \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Song:2021:ANF,
  author =       "Won Wook Song and Youngseok Yang and Jeongyoon Eo and
                 Jangho Seo and Joo Yeon Kim and Sanha Lee and Gyewon
                 Lee and Taegeon Um and Haeyoon Cho and Byung-Gon Chun",
  title =        "{Apache Nemo}: a Framework for Optimizing Distributed
                 Data Processing",
  journal =      j-TOCS,
  volume =       "38",
  number =       "3--4",
  pages =        "5:1--5:31",
  month =        nov,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3468144",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Apr 18 11:45:45 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3468144",
  abstract =     "Optimizing scheduling and communication of distributed
                 data processing for resource and data characteristics
                 is crucial for achieving high performance. Existing
                 approaches to such optimizations largely fall into two
                 categories. First, distributed runtimes \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Canakci:2021:SMB,
  author =       "Burcu Canakci and Robbert {Van Renesse}",
  title =        "Scaling Membership of {Byzantine} Consensus",
  journal =      j-TOCS,
  volume =       "38",
  number =       "3--4",
  pages =        "6:1--6:31",
  month =        nov,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3473138",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Apr 18 11:45:45 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3473138",
  abstract =     "Scaling Byzantine Fault Tolerant (BFT) systems in
                 terms of membership is important for secure
                 applications with large participation such as
                 blockchains. While traditional protocols have low
                 latency, they cannot handle many processors.
                 Conversely, \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Kumar:2021:SSF,
  author =       "Rakesh Kumar and Boris Grot",
  title =        "Shooting Down the Server Front-End Bottleneck",
  journal =      j-TOCS,
  volume =       "38",
  number =       "3--4",
  pages =        "7:1--7:30",
  month =        nov,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3484492",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Mon Apr 18 11:45:45 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3484492",
  abstract =     "The front-end bottleneck is a well-established problem
                 in server workloads owing to their deep software stacks
                 and large instruction footprints. Despite years of
                 research into effective L1-I and BTB prefetching,
                  state-of-the-art techniques force a trade-off
                  \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Lyerly:2021:ORT,
  author =       "Robert Lyerly and Carlos Bilbao and Changwoo Min and
                 Christopher J. Rossbach and Binoy Ravindran",
  title =        "An {OpenMP} Runtime for Transparent Work Sharing
                 across Cache-Incoherent Heterogeneous Nodes",
  journal =      j-TOCS,
  volume =       "39",
  number =       "1--4",
  pages =        "1:1--1:??",
  month =        nov,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3505224",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Dec 8 06:35:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3505224",
  abstract =     "In this work, we present libHetMP, an OpenMP runtime
                 for automatically and transparently distributing
                 parallel computation across heterogeneous \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Chen:2021:UHM,
  author =       "Lei Chen and Jiacheng Zhao and Chenxi Wang and Ting
                 Cao and John Zigman and Haris Volos and Onur Mutlu and
                 Fang Lv and Xiaobing Feng and Guoqing Harry Xu and
                 Huimin Cui",
  title =        "Unified Holistic Memory Management Supporting Multiple
                 Big Data Processing Frameworks over Hybrid Memories",
  journal =      j-TOCS,
  volume =       "39",
  number =       "1--4",
  pages =        "2:1--2:??",
  month =        nov,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3511211",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Dec 8 06:35:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3511211",
  abstract =     "To process real-world datasets, modern data-parallel
                 systems often require extremely large amounts of
                 memory, which are both costly and energy \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Boroujerdian:2021:RCA,
  author =       "Behzad Boroujerdian and Hasan Genc and Srivatsan
                 Krishnan and Bardienus Pieter Duisterhof and Brian
                 Plancher and Kayvan Mansoorshahi and Marcelino Almeida
                 and Wenzhi Cui and Aleksandra Faust and Vijay Janapa
                 Reddi",
  title =        "The Role of Compute in Autonomous Micro Aerial
                 Vehicles: Optimizing for Mission Time and Energy
                 Efficiency",
  journal =      j-TOCS,
  volume =       "39",
  number =       "1--4",
  pages =        "3:1--3:??",
  month =        nov,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3511210",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Dec 8 06:35:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3511210",
  abstract =     "Autonomous and mobile cyber-physical machines are
                 becoming an inevitable part of our future. In
                 particular, Micro Aerial Vehicles (MAVs) have seen a
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Blocher:2021:RAO,
  author =       "Marcel Bl{\"o}cher and Emilio Coppa and Pascal Kleber
                 and Patrick Eugster and William Culhane and Masoud
                 Saeida Ardekani",
  title =        "{ROME}: All Overlays Lead to Aggregation, but Some Are
                 Faster than Others",
  journal =      j-TOCS,
  volume =       "39",
  number =       "1--4",
  pages =        "4:1--4:??",
  month =        nov,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3516430",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Dec 8 06:35:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3516430",
  abstract =     "Aggregation is common in data analytics and crucial to
                 distilling information from large datasets, but current
                 data analytics frameworks do not fully exploit
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Xing:2021:HCE,
  author =       "Tong Xing and Antonio Barbalace and Pierre Olivier and
                 Mohamed L. Karaoui and Wei Wang and Binoy Ravindran",
  title =        "{H-Container}: Enabling Heterogeneous-{ISA} Container
                 Migration in Edge Computing",
  journal =      j-TOCS,
  volume =       "39",
  number =       "1--4",
  pages =        "5:1--5:??",
  month =        nov,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3524452",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Dec 8 06:35:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3524452",
  abstract =     "Edge computing is a recent computing paradigm that
                 brings cloud services closer to the client. Among other
                 features, edge computing offers extremely low
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Xia:2021:BIP,
  author =       "Yubin Xia and Dong Du and Zhichao Hua and Binyu Zang
                 and Haibo Chen and Haibing Guan",
  title =        "Boosting Inter-process Communication with
                 Architectural Support",
  journal =      j-TOCS,
  volume =       "39",
  number =       "1--4",
  pages =        "6:1--6:??",
  month =        nov,
  year =         "2021",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3532861",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Dec 8 06:35:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3532861",
  abstract =     "IPC (inter-process communication) is a critical
                 mechanism for modern OSes, including not only
                 microkernels such as seL4, QNX, and Fuchsia where
                 system \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Diavastos:2022:EIS,
  author =       "Andreas Diavastos and Trevor E. Carlson",
  title =        "Efficient Instruction Scheduling Using Real-time Load
                 Delay Tracking",
  journal =      j-TOCS,
  volume =       "40",
  number =       "1--4",
  pages =        "1:1--1:??",
  month =        nov,
  year =         "2022",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3548681",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Dec 8 06:35:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3548681",
  abstract =     "Issue time prediction processors use dataflow
                 dependencies and predefined instruction latencies to
                 predict issue times of repeated instructions. In this
                 work, \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Biswas:2022:UPR,
  author =       "Arnab Kumar Biswas",
  title =        "Using Pattern of On-Off Routers and Links and Router
                 Delays to Protect Network-on-Chip Intellectual
                 Property",
  journal =      j-TOCS,
  volume =       "40",
  number =       "1--4",
  pages =        "2:1--2:??",
  month =        nov,
  year =         "2022",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3548680",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Dec 8 06:35:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3548680",
  abstract =     "Intellectual Property (IP) reuse is a well known
                 practice in chip design processes. Nowadays,
                 network-on-chips (NoCs) are increasingly used as IP and
                 sold by \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Alkhatib:2023:PNP,
  author =       "Basil Alkhatib and Sreeharsha Udayashankar and Sara
                 Qunaibi and Ahmed Alquraan and Mohammed Alfatafta and
                 Wael Al-Manasrah and Alex Depoutovitch and Samer
                 Al-Kiswany",
  title =        "Partial Network Partitioning",
  journal =      j-TOCS,
  volume =       "41",
  number =       "1--4",
  pages =        "1:1--1:??",
  month =        nov,
  year =         "2023",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3576192",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Feb 3 11:39:05 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3576192",
  abstract =     "We present an extensive study focused on partial
                 network partitioning. Partial network partitions
                 disrupt the communication between some but not all
                 nodes in \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Comput. Syst.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Sheff:2023:CRB,
  author =       "Isaac Sheff and Xinwen Wang and Kushal Babel and
                 Haobin Ni and Robbert van Renesse and Andrew C. Myers",
  title =        "{Charlotte}: Reformulating Blockchains into a {Web} of
                 Composable Attested Data Structures for Cross-Domain
                 Applications",
  journal =      j-TOCS,
  volume =       "41",
  number =       "1--4",
  pages =        "2:1--2:??",
  month =        nov,
  year =         "2023",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3607534",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Feb 3 11:39:05 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3607534",
  abstract =     "Cross- domain applications are rapidly adopting
                 blockchain techniques for immutability, availability,
                 integrity, and interoperability. However, for most
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Comput. Syst.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Park:2023:FFM,
  author =       "Jonggyu Park and Young Ik Eom",
  title =        "Filesystem Fragmentation on Modern Storage Systems",
  journal =      j-TOCS,
  volume =       "41",
  number =       "1--4",
  pages =        "3:1--3:??",
  month =        nov,
  year =         "2023",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3611386",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Feb 3 11:39:05 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3611386",
  abstract =     "Filesystem fragmentation has been one of the primary
                 reasons for computer systems to get slower over time.
                 However, there have been rapid changes in modern
                 storage systems over the past decades, and modern
                 storage devices such as solid state drives have
                 different mechanisms to access data, compared with
                 traditional rotational ones. In this article, we
                 revisit filesystem fragmentation on modern computer
                 systems from both performance and fairness
                 perspectives. According to our extensive experiments,
                 filesystem fragmentation not only degrades I/O
                 performance of modern storage devices, but also incurs
                 various problems related to I/O fairness, such as
                 performance interference. Unfortunately, conventional
                 defragmentation tools are designed primarily for hard
                 disk drives and thus generate an unnecessarily large
                 amount of I/Os for data migration. To mitigate such
                  problems, this article presents FragPicker, a new
                 defragmentation tool for modern storage devices.
                 FragPicker analyzes the I/O behaviors of each target
                 application and defragments only necessary pieces of
                 data whose migration can contribute to performance
                 improvement, thereby effectively minimizing the I/O
                 amount for defragmentation. Our evaluation with YCSB
                 workload-C shows FragPicker reduces the total amount of
                 I/O for defragmentation by around 66\% and the elapsed
                 time by around 84\%, while showing a similar level of
                 defragmentation effect.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Comput. Syst.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Pellauer:2023:SOS,
  author =       "Michael Pellauer and Jason Clemons and Vignesh Balaji
                 and Neal Crago and Aamer Jaleel and Donghyuk Lee and
                  Mike O'Connor and Angshuman Parashar and Sean Treichler
                 and Po-An Tsai and Stephen W. Keckler and Joel S.
                 Emer",
  title =        "{Symphony}: Orchestrating Sparse and Dense Tensors
                 with Hierarchical Heterogeneous Processing",
  journal =      j-TOCS,
  volume =       "41",
  number =       "1--4",
  pages =        "4:1--4:??",
  month =        nov,
  year =         "2023",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3630007",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Feb 3 11:39:05 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3630007",
  abstract =     "Sparse tensor algorithms are becoming widespread,
                 particularly in the domains of deep learning, graph and
                 data analytics, and scientific computing. Current
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Comput. Syst.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Zhao:2023:MIB,
  author =       "Jie Zhao and Jinchen Xu and Peng Di and Wang Nie and
                 Jiahui Hu and Yanzhi Yi and Sijia Yang and Zhen Geng
                 and Renwei Zhang and Bojie Li and Zhiliang Gan and
                 Xuefeng Jin",
  title =        "Modeling the Interplay between Loop Tiling and Fusion
                 in Optimizing Compilers Using Affine Relations",
  journal =      j-TOCS,
  volume =       "41",
  number =       "1--4",
  pages =        "5:1--5:??",
  month =        nov,
  year =         "2023",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3635305",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Sat Feb 3 11:39:05 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3635305",
  abstract =     "Loop tiling and fusion are two essential
                 transformations in optimizing compilers to enhance the
                 data locality of programs. Existing heuristics either
                 perform loop tiling and fusion in a particular order,
                 missing some of their profitable compositions, or
                 execute ad-hoc implementations for domain-specific
                 applications, calling for a generalized and systematic
                 solution in optimizing compilers.\par

                 In this article, we present a so-called {\em basteln\/}
                 (an abbreviation for backward slicing of tiled loop
                 nests) strategy in polyhedral compilation to better
                 model the interplay between loop tiling and fusion. The
                 basteln strategy first groups loop nests by preserving
                 their parallelism\slash tilability and next performs
                 rectangular\slash parallelogram tiling to the output
                 groups that produce data consumed outside the
                 considered program fragment. The memory footprints
                 required by each tile are then computed, from which the
                 upward exposed data are extracted to determine the tile
                 shapes of the remaining fusion groups. Such a tiling
                 mechanism can construct complex tile shapes imposed by
                 the dependences between these groups, which are further
                 merged by a post-tiling fusion algorithm for enhancing
                 data locality without losing the parallelism\slash
                 tilability of the output groups. The basteln strategy
                 also takes into account the amount of redundant
                 computations and the fusion of independent groups,
                 exhibiting a general applicability.\par

                 We integrate the basteln strategy into two optimizing
                 compilers, with one a general-purpose optimizer and the
                 other a domain-specific compiler for deploying deep
                 learning models. The experiments are conducted on CPU,
                 GPU, and a deep learning accelerator to demonstrate the
                 effectiveness of the approach for a wide class of
                 application domains, including deep learning, image
                 processing, sparse matrix computation, and linear
                 algebra. In particular, the basteln strategy achieves a
                 mean speedup of $ 1.8 \times $ over cuBLAS\slash cuDNN
                 and $ 1.1 \times $ over TVM on GPU when used to
                 optimize deep learning models; it also outperforms PPCG
                 and TVM by 11\% and 20\%, respectively, when generating
                 code for the deep learning accelerator.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Comput. Syst.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Luo:2024:ORM,
  author =       "Shutian Luo and Chenyu Lin and Kejiang Ye and Guoyao
                 Xu and Liping Zhang and Guodong Yang and Huanle Xu and
                 Chengzhong Xu",
  title =        "Optimizing Resource Management for Shared
                 Microservices: a Scalable System Design",
  journal =      j-TOCS,
  volume =       "42",
  number =       "1--2",
  pages =        "1:1--1:??",
  month =        may,
  year =         "2024",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3631607",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu May 16 10:49:47 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3631607",
  abstract =     "A common approach to improving resource utilization in
                 data centers is to adaptively provision resources based
                 on the actual workload. One fundamental challenge of
                 doing this in microservice management frameworks,
                 however, is that different components of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Comput. Syst.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Zhao:2024:CDC,
  author =       "Laiping Zhao and Yushuai Cui and Yanan Yang and Xiaobo
                 Zhou and Tie Qiu and Keqiu Li and Yungang Bao",
  title =        "Component-distinguishable Co-location and Resource
                 Reclamation for High-throughput Computing",
  journal =      j-TOCS,
  volume =       "42",
  number =       "1--2",
  pages =        "2:1--2:??",
  month =        may,
  year =         "2024",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3630006",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu May 16 10:49:47 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3630006",
  abstract =     "Cloud service providers improve resource utilization
                 by co-locating latency-critical (LC) workloads with
                 best-effort batch (BE) jobs in datacenters. However,
                 they usually treat multi-component LCs as monolithic
                 applications and treat BEs as ``second-class
                 \ldots{}''",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Comput. Syst.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Kappes:2024:DFU,
  author =       "Giorgos Kappes and Stergios V. Anastasiadis",
  title =        "{Diciclo}: Flexible User-level Services for Efficient
                 Multitenant Isolation",
  journal =      j-TOCS,
  volume =       "42",
  number =       "1--2",
  pages =        "3:1--3:??",
  month =        may,
  year =         "2024",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3639404",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu May 16 10:49:47 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3639404",
  abstract =     "Containers are a mainstream virtualization technique
                 for running stateful workloads over persistent storage.
                 In highly utilized multitenant hosts, resource
                 contention at the system kernel leads to inefficient
                 container input/output (I/O) handling. Although there
                 are interesting techniques to address this issue, they
                 incur high implementation complexity and execution
                 overhead. As a cost-effective alternative, we introduce
                 the Diciclo architecture with our assumptions, goals,
                 and principles. For each tenant, Diciclo isolates the
                 control and data I/O path at user level and runs
                 dedicated storage systems. Diciclo includes the
                 libservice unified user-level abstraction of system
                 services and the node structure design pattern for the
                 application and server side. We prototyped a toolkit of
                 user-level components that comprise the library to
                 invoke the standard I/O calls, the I/O communication
                 mechanism, and the I/O services. Based on Diciclo, we
                 built Danaus, a filesystem client that integrates a
                 union filesystem with a Ceph distributed filesystem
                 client and configurable shared cache. Across different
                 host configurations, workloads, and systems, Danaus
                 achieves improved performance stability, because it
                 handles I/O with reserved per-tenant resources and
                 avoids intensive kernel locking. Based on having built
                 and evaluated Danaus, we share valuable lessons about
                 resource contention, file management, service
                 separation, and performance stability in multitenant
                 systems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Comput. Syst.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

@Article{Sha:2024:HSC,
  author =       "Sai Sha and Chuandong Li and Xiaolin Wang and Zhenlin
                 Wang and Yingwei Luo",
  title =        "Hardware--Software Collaborative Tiered-Memory
                 Management Framework for Virtualization",
  journal =      j-TOCS,
  volume =       "42",
  number =       "1--2",
  pages =        "4:1--4:??",
  month =        may,
  year =         "2024",
  CODEN =        "ACSYEC",
  DOI =          "https://doi.org/10.1145/3639564",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu May 16 10:49:47 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tocs.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3639564",
  abstract =     "The tiered-memory system can effectively expand the
                 memory capacity for virtual machines (VMs). However,
                 virtualization introduces new challenges specifically
                 in enforcing performance isolation, minimizing context
                 switching, and providing resource overcommit. None of
                 the state-of-the-art designs consider virtualization
                 and address these challenges; we observe that a VM with
                 tiered memory incurs up to a $ 2 \times $ slowdown
                 compared to a DRAM-only VM.\par

                 We propose vTMM, a hardware-software collaborative
                 tiered-memory management framework for virtualization.
                 A key insight in vTMM is to leverage the unique system
                 features in virtualization to meet the above
                 challenges. vTMM automatically determines page hotness
                 and migrates pages between fast and slow memory to
                  achieve better performance. Specifically, vTMM optimizes
                 page tracking and migration based on page-modification
                 logging (PML), a hardware-assisted virtualization
                 mechanism, and adaptively distinguishes hot/cold pages
                 through the page ``temperature'' sorting. vTMM also
                 dynamically adjusts fast memory among multi-VMs on
                 demand by using a memory pool. Further, vTMM tracks
                 huge pages at regular-page granularity in hardware and
                 splits/merges pages in software, realizing
                 hybrid-grained page management and optimization. We
                 implement and evaluate vTMM with single-grained page
                 management on an Intel processor, and the
                 hybrid-grained page management on a Sunway processor
                 with hardware mode supporting hardware/software
                 co-designs. Experiments show that vTMM outperforms
                 existing tiered-memory management designs in
                 virtualization.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Comput. Syst.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Computer Systems",
  journal-URL =  "https://dl.acm.org/loi/tocs",
}

%%% ====================================================================
%%% Proceedings entries must come last:
@Proceedings{ACM:1988:ASS,
  editor =       "ACM",
  booktitle =    "{1988 ACM\slash SIGOPS Symposium on Operating Systems
                 Principles}",
  title =        "{1988 ACM\slash SIGOPS Symposium on Operating Systems
                 Principles}",
  volume =       "6(1)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "??--??",
  month =        feb,
  year =         "1988",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071 (print), 1557-7333 (electronic)",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Jan 14 06:47:30 MST 1999",
  bibsource =    "Compendex database;
                 https://www.math.utah.edu/pub/tex/bib/tocs.bib",
  series =       j-TOCS,
  abstract =     "This issue contains 6 conference papers. The topics
                 covered are: stored-voice management in the Etherphone
                 system; 801 storage; scale and performance of a
                 distributed file system; recovery performance in
                 QuickSilver; fine-grained mobility in the Emerald
                 system; caching in the Sprite network file system.",
  acknowledgement = ack-nhfb,
  classification = "723",
  conference =   "1988 ACM\slash SIGOPS Symposium on Operating Systems
                 Principles.",
  keywords =     "801 storage; computer architecture; computer networks;
                 computer operating systems; computer systems, digital
                 --- Distributed; Emerald system; Etherphone system;
                 QuickSilver; Sprite network file system",
  pagecount =    "154",
  sponsor =      "ACM, Special Interest Group on Operating Systems, New
                 York, NY, USA",
}