@Preamble{ "\ifx \undefined \booktitle \def \booktitle#1{{{\em #1}}} \fi" }
@String{ack-nhfb = "Nelson H. F. Beebe,
University of Utah,
Department of Mathematics, 110 LCB,
155 S 1400 E RM 233,
Salt Lake City, UT 84112-0090, USA,
Tel: +1 801 581 5254,
FAX: +1 801 581 4148,
e-mail: \path|beebe@math.utah.edu|,
\path|beebe@acm.org|,
\path|beebe@computer.org| (Internet),
URL: \path|https://www.math.utah.edu/~beebe/|"}
@String{j-IEEE-COMPUT-ARCHIT-LETT = "IEEE Computer Architecture Letters"}
@Article{Alvarez:2002:IRF,
author = "C. Alvarez and J. Corbal and E. Salami and M. Valero",
title = "Initial Results on Fuzzy Floating Point Computation
for Multimedia Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "1--1",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "During the recent years the market of mid low end
portable systems such as PDAs or mobile digital phones
have experimented a revolution in both selling volume
and features as handheld devices incorporate Multimedia
applications. This fact brings to an increase in the
computational demands of the devices while still having
the limitation of power and energy consumption.
Instruction memoization is a promising technique to
help alleviate the problem of power consumption of
expensive functional units such as the floating point
one. Unfortunately this technique could be energy
inefficient for low end systems due to the additional
power consumption of the relatively big tables
required. In this paper we present a novel way of
understanding multimedia floating point operations
based on the fuzzy computation paradigm losses in the
computation precision may exchange performance for
negligible errors in the output. Exploiting the
implicit characteristics of media FP computation we
propose a new technique called fuzzy memoization. Fuzzy
memoization expands the capabilities of classic
memoization by attaching entries with similar inputs to
the same output. We present a case of study for a SH
like processor and report good performance and power
delay improvements with feasible hardware
requirements",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Delay; Energy consumption; Fuzzy systems; Handheld
computers; Joining processes; Mobile computing;
Multimedia systems; Performance loss; Personal digital
assistants; Portable computers",
}
@Article{Gordon-Ross:2002:EFP,
author = "A. Gordon-Ross and S. Cotterell and F. Vahid",
title = "Exploiting Fixed Programs in Embedded Systems: a Loop
Cache Example",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "2--2",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Embedded systems commonly execute one program for
their lifetime. Designing embedded system architectures
with configurable components, such that those
components can be tuned to that one program based on a
program pre-analysis, can yield significant power and
performance benefits. We illustrate such benefits by
designing a loop cache specifically with tuning in
mind. Our results show a 70\% reduction in instruction
memory accesses for MIPS and 8051 processors,
representing twice the reduction of a regular loop
cache and translating to good power savings.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architecture tuning; Computer architecture; Computer
science; Costs; Digital cameras; Embedded computing;
Embedded system; embedded systems.; fixed program; Loop
cache; low power; Microcomputers; Microprocessor chips;
Portable computers; Power engineering computing",
}
@Article{Choi:2002:LPT,
author = "Jin-Hyuck Choi and Jung-Hoon Lee and Seh-Woong Jeong
and Shin-Dug Kim and C. Weems",
title = "A Low Power {TLB} Structure for Embedded Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "3--3",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We present a new two-level TLB (translation look-aside
buffer) architecture that integrates a 2-way banked
filter TLB with a 2-way banked main TLB. The objective
is to reduce power consumption in embedded processors
by distributing the accesses to TLB entries across the
banks in a balanced manner. First, an advanced
filtering technique is devised to reduce access power
by adopting a sub-bank structure. Second, a
bank-associative structure is applied to each level of
the TLB hierarchy. Simulation results show that the
Energy*Delay product can be reduced by about 40.9\%
compared to a fully associative TLB, 24.9\% compared to
a micro-TLB with 4+32 entries, and 12.18\% compared to
a micro-TLB with 16+32 entries.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bank associative structure; CADCAM; Circuits; Computer
aided manufacturing; Degradation; Embedded system;
Energy consumption; Filter bank; filter mechanism;
Filtering; low power design; Power filters; translation
look-aside buffer; Virtual private networks",
}
@Article{Towles:2002:WCT,
author = "B. Towles and W. J. Dally",
title = "Worst-case Traffic for Oblivious Routing Functions",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "4--4",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.12",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper presents an algorithm to find a worst-case
traffic pattern for any oblivious routing algorithm on
an arbitrary interconnection network topology. The
linearity of channel loading offered by oblivious
routing algorithms enables the problem to be mapped to
a bipartite maximum-weight matching, which can be
solved in polynomial time for routing functions with a
polynomial number of paths. Finding exact worst case
performance was previously intractable, and we
demonstrate an example case where traditional
characterization techniques overestimate the throughput
of a particular routing algorithm by 47\%.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bipartite graph; Linearity; Multiprocessor
interconnection networks; Network topology; oblivious
routing; Pattern matching; Polynomials; Routing;
Telecommunication traffic; Throughput; worst-case
throughput",
}
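
A minimal sketch of the matching step described in the abstract above,
assuming a precomputed matrix gamma[s][d] that gives the load a unit of
source-to-destination traffic places on one chosen channel under the
routing function of interest; the worst-case permutation for that channel
then falls out of a maximum-weight bipartite matching. The matrix values
below are toy placeholders, not data from the paper.

# Sketch: worst-case permutation traffic for one channel of an oblivious
# routing function, found as a maximum-weight bipartite matching.
import numpy as np
from scipy.optimize import linear_sum_assignment

# gamma[s][d]: assumed precomputed load on the chosen channel when source s
# sends unit traffic to destination d (toy values).
gamma = np.array([[0.0, 1.0, 0.5],
                  [0.5, 0.0, 1.0],
                  [1.0, 0.5, 0.0]])

rows, cols = linear_sum_assignment(gamma, maximize=True)
worst_permutation = dict(zip(rows.tolist(), cols.tolist()))
channel_load = gamma[rows, cols].sum()
print("worst-case permutation:", worst_permutation)
print("load on the chosen channel:", channel_load)

Repeating this per channel and taking the most heavily loaded one bounds
the worst-case throughput; the letter's full algorithm is more refined than
this sketch.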
@Article{Unsal:2002:CFC,
author = "O. S. Unsal and C. M. Krishna and C. A. Mositz",
title = "{Cool-Fetch}: Compiler-Enabled Power-Aware Fetch
Throttling",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "5--5",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In this paper, we present an architecture compiler
based approach to reduce energy consumption in the
processor. While we mainly target the fetch unit, an
important side-effect of our approach is that we obtain
energy savings in many other parts in the processor.
The explanation is that the fetch unit often runs
substantially ahead of execution, bringing in
instructions to different stages in the processor that
may never be executed. We have found, that although the
degree of Instruction Level Parallelism (ILP)of a
program tends to vary over time, it can be statically
predicted by the compiler with considerable accuracy.
Our Instructions Per Clock (IPC) prediction scheme is
using a dependence-testing-based analysis and simple
heuristics, to guide a front-end fetch-throttling
mechanism. We develop the necessary architecture
support and include its power overhead. We perform
experiments over a wide number of architectural
configurations, using SPEC2000 applications. Our
results are very encouraging: we obtain up to 15\%total
energy savings in the processor with generally little
performance degradation. In fact, in some cases our
intelligent throttling scheme even increases
performance.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Clocks; compiler architecture interaction;
Degradation; Energy consumption; fetch-throttling;
instruction level parallelism; Low power design;
Program processors",
}
@Article{Shang:2002:PEI,
author = "Li Shang and L. Peh and N. K. Jha",
title = "Power-efficient Interconnection Networks: Dynamic
Voltage Scaling with Links",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "6--6",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.10",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Power consumption is a key issue in high performance
interconnection network design. Communication links,
already a significant consumer of power now, will take
up an ever larger portion of the power budget as demand
for network bandwidth increases. In this paper, we
motivate the use of dynamic voltage scaling (DVS) for
links, where the frequency and voltage of links are
dynamically adjusted to minimize power consumption. We
propose a history-based DVS algorithm that judiciously
adjusts DVS policies based on past link utilization.
Despite very conservative assumptions about DVS link
characteristics, our approach realizes up to 4.5X power
savings (3.2X average), with just a 27.4\% average
latency increase and a 2.5\% throughput reduction. To the
best of our knowledge, this is the first study that
targets dynamic power optimization of interconnection
networks.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Clocks; Dynamic voltage scaling; Frequency
synthesizers; interconnection network; Multiprocessor
interconnection networks; power optimization.;
Regulators",
}
@Article{KleinOsowski:2002:MNS,
author = "A. J. KleinOsowski and D. J. Lilja",
title = "{MinneSPEC}: a New {SPEC} Benchmark Workload for
Simulation-Based Computer Architecture Research",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "7--7",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Computer architects must determine how to most
effectively use finite computational resources when
running simulations to evaluate new architectural
ideas. To facilitate efficient simulations with a range
of benchmark programs, we have developed the MinneSPEC
input set for the SPEC CPU 2000 benchmark suite. This
new workload allows computer architects to obtain
simulation results in a reasonable time using existing
simulators. While the MinneSPEC workload is derived
from the standard SPEC CPU 2000 workload, it is a
valid benchmark suite in and of itself for
simulation-based research. MinneSPEC also may be used
to run large numbers of simulations to find ``sweet
spots'' in the evaluation parameter space. This small
number of promising design points subsequently may be
investigated in more detail with the full SPEC
reference workload. In the process of developing the
MinneSPEC datasets, we quantify its differences in
terms of function-level execution patterns, instruction
mixes, and memory behaviors compared to the SPEC
programs when executed with the reference inputs. We
find that for some programs, the MinneSPEC profiles
match the SPEC reference dataset program behavior very
closely. For other programs, however, the MinneSPEC
inputs produce significantly different program
behavior. The MinneSPEC workload has been recognized by
SPEC and is distributed with Version 1.2 and higher of
the SPEC CPU 2000 benchmark suite.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computational modeling; Computer architecture;
Computer simulation",
}
@Article{Vandierendonck:2002:ATC,
author = "H. Vandierendonck and K. {De Bosschere}",
title = "An Address Transformation Combining Block- and
Word-Interleaving",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "8--8",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "As future superscalar processors employ higher issue
widths, an increasing number of load/store instructions
needs to be executed each cycle to sustain high
performance. Multi-bank data caches attempt to address
this issue in a cost-effective way. A multi-bank cache
consists of multiple cache banks that each support one
load/store instruction per clock cycle. The
interleaving of cache blocks over the banks is of
primary importance. Two common choices are
block-interleaving and word-interleaving. Although
word-interleaving leads to higher IPC, it is more
expensive to implement than block-interleaving since it
requires the tag array of the cache to be multi-ported.
By swapping the bits in the effective address that are
used by word-interleaving with those used by
block-interleaving, it is possible to implement a
word-interleaved cache with the same cost, cycle time,
and power consumption as a block-interleaved cache.
Because this makes the L1 data cache blocks sparse,
additional costs are incurred at different levels of
the memory hierarchy.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Block-Interleaving; Clocks; Costs; Data cache; Energy
consumption; Interleaved codes; Multi-Banking;
Word-Interleaving.",
}
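
A minimal sketch of the bit-swapping transformation described above,
assuming a two-bank cache with 4-byte words and 32-byte blocks, so that
word-interleaving would select the bank with address bit 2 and
block-interleaving with bit 5; these parameters are illustrative
assumptions, not taken from the letter.

# Sketch: exchange the word-interleave bank-select bit with the
# block-interleave bank-select bit of an effective address.
WORD_BANK_BIT = 2    # assumes 4-byte words, 2 banks
BLOCK_BANK_BIT = 5   # assumes 32-byte blocks, 2 banks

def swap_bits(addr: int, i: int, j: int) -> int:
    """Return addr with bits i and j exchanged."""
    if ((addr >> i) & 1) != ((addr >> j) & 1):
        addr ^= (1 << i) | (1 << j)
    return addr

def transform(addr: int) -> int:
    """Swap the word- and block-interleaving bits of the effective address."""
    return swap_bits(addr, WORD_BANK_BIT, BLOCK_BANK_BIT)

if __name__ == "__main__":
    for a in (0x00, 0x04, 0x08, 0x0C):   # four consecutive words
        t = transform(a)
        print(f"addr {a:#04x} -> {t:#04x}, bank = {(t >> BLOCK_BANK_BIT) & 1}")

A cache wired for block-interleaving (bank selected by bit 5) then sees the
original word-interleave bit in that position, so consecutive words land in
alternating banks at block-interleaved hardware cost, which is the
trade-off the letter discusses.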
@Article{Tambat:2002:PLB,
author = "S. Tambat and S. Vajapeyam",
title = "Page-Level Behavior of Cache Contention",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "9--9",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Cache misses in small, limited-associativity primary
caches very often replace live cache blocks, given the
dominance of capacity and conflict misses. Towards
motivating novel cache organizations, we study the
comparative characteristics of the virtual memory
address pairs involved in typical primary-cache
contention (block replacements) for the SPEC2000 integer
benchmarks. We focus on the cache tag bits, and results
show that (i) often just a few tag bits differ between
contending addresses, and (ii) accesses to certain
segments or page groups of the virtual address space
(i.e., certain tag-bit groups) contend frequently.
Cache conscious virtual address space allocation can
further reduce the number of conflicting tag bits. We
mention two directions for exploiting such page-level
contention patterns to improve cache cost and
performance.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Automation; Benchmark testing; Cache Contention; Cache
Tags; Computer science; Data Cache; Libraries; Memory
Access Characterization; Microprocessors; Optimizing
compilers; Traffic control; Workstations",
}
@Article{Juang:2002:IDT,
author = "Philo Juang and P. Diodato and S. Kaxiras and K.
Skadron and Zhigang Hu and M. Martonosi and D. W.
Clark",
title = "Implementing Decay Techniques using {4T} Quasi-Static
Memory Cells",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "10--10",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper proposes the use of four-transistor (4T)
cache and branch predictor array cell designs to
address increasing worries regarding leakage power
dissipation. While 4T designs lose state when
infrequently accessed, they have very low leakage,
smaller area, and no capacitive loads to switch. This
short paper gives an overview of 4T implementation
issues and a preliminary evaluation of leakage-energy
savings that shows improvements of 60--80\%.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Circuit simulation; Delay; Leakage current; Libraries;
Microarchitecture; Power dissipation; Power generation;
Random access memory; Switches; Transistors",
}
@Article{Sohn:2002:RRE,
author = "YoungChul Sohn and NaiHoon Jung and Seungryoul Maeng",
title = "Request Reordering to Enhance the Performance of
Strict Consistency Models",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "11--11",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.11",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Advances in ILP techniques enable strict consistency
models to relax memory order through speculative
execution of memory operations. However, ordering
constraints still hinder performance because
speculatively executed operations cannot be committed
out of program order due to the possibility of
mis-speculation. In this paper, we propose a new
technique which allows memory operations to be
non-speculatively committed out of order without
violating consistency constraints.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ILP; memory consistency model; multiprocessor",
}
@Article{Shaw:2002:MSC,
author = "K. A. Shaw and W. J. Dally",
title = "Migration in Single Chip Multiprocessors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "12--12",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Global communication costs in future single-chip
multiprocessors will increase linearly with distance.
In this paper, we revisit the issues of locality and
load balance in order to take advantage of these new
costs. We present a technique which simultaneously
migrates data and threads based on vectors specifying
locality and resource usage. This technique improves
performance on applications with distinguishable
locality and imbalanced resource usage. 64\% of the
ideal reduction in execution time was achieved on an
application with these traits while no improvement was
obtained on a balanced application with little
locality.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Cost function; Delay; Global communication;
Laboratories; Logic; Monitoring; Multiprocessing
systems; Wire",
}
@Article{Sihn:2003:SCS,
author = "K.-H. Sihn and Joonwon Lee and Jung-Wan Cho",
title = "A Speculative Coherence Scheme using Decoupling
Synchronization for Multiprocessor Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "2",
number = "1",
pages = "1--1",
month = jan,
year = "2003",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2003.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper proposes a new speculative coherence
scheme, SCDS, for hardware distributed shared memory
systems to reduce the overhead of coherence actions in
a directory-based cache-coherence protocol. SCDS has
two main features: predicting the accurate timing of
speculative coherence using synchronization
information, and detecting the write pattern (migratory
or non-migratory) for exclusive blocks' speculative
coherence actions. In our simulation, SCDS outperforms
existing schemes (DSI and LTP) for well-synchronized
applications.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Access protocols; Coherence; Costs; Delay; Hardware;
Multiprocessing systems; Personal communication
networks; Runtime; Timing; Watches",
}
@Article{Kumar:2003:PPR,
author = "R. Kumar and K. Farkas and N. P. Jouppi and P.
Ranganathan and D. M. Tullsen",
title = "Processor Power Reduction Via Single-{ISA}
Heterogeneous Multi-Core Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "2",
number = "1",
pages = "2--2",
month = jan,
year = "2003",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2003.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper proposes a single-ISA heterogeneous
multi-core architecture as a mechanism to reduce
processor power dissipation. It assumes a single chip
containing a diverse set of cores that target different
performance levels and consume different levels of
power. During an application's execution, system
software dynamically chooses the most appropriate core
to meet specific performance and power requirements. It
describes an example architecture with five cores of
varying performance and complexity. Initial results
demonstrate a five-fold reduction in energy at a cost
of only 25\% performance.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application software; chip multiprocessor; Computer
architecture; Computer science; Costs; Energy
consumption; Fans; low-power architecture; Packaging;
Power dissipation; Power engineering and energy; System
software",
}
@Article{Sendag:2003:ACE,
author = "R. Sendag and Peng-fei Chuang and D. J. Lilja",
title = "Address Correlation: Exceeding the Limits of
Locality",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "2",
number = "1",
pages = "3--3",
month = jan,
year = "2003",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2003.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We investigate a program phenomenon, Address
Correlation, which links addresses that reference the
same data. This work shows that different addresses
containing the same data can often be correlated at
run-time to eliminate a load miss or a partial hit. For
ten of the SPEC CPU2000 benchmarks, 57 to 99\% of all
L1 data cache load misses, and 4 to 85\% of all partial
hits, can be supplied from a correlated address already
found in the cache. Our source code-level analysis
shows that semantically equivalent information,
duplicated references, and frequent values are the
major causes of address correlations. We also show
that, on average, 68\% of the potential correlated
addresses that could supply data on a miss of an
address containing the same value can be correlated at
run time. These correlated addresses correspond to an
average of 62\% of all misses in the benchmark programs
tested.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Delay; Electronic mail; Hardware;
History; Microarchitecture; Object oriented modeling;
Out of order; Runtime; Tellurium",
}
@Article{Milenkovic:2003:SBT,
author = "A. Milenkovic and M. Milenkovic",
title = "Stream-Based Trace Compression",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "2",
number = "1",
pages = "4--4",
month = jan,
year = "2003",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2003.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Trace-driven simulation has long been used in both
processor and memory studies. The large size of traces
motivated different techniques for trace reduction.
These techniques often combine standard compression
algorithms with trace-specific solutions, taking into
account the tradeoff between reduction in the trace
size and simulation slowdown due to decompression. This
paper introduces SBC, a new algorithm for instruction
and data address trace compression based on instruction
streams. The proposed technique significantly reduces
trace size and simulation time, and it is orthogonal to
general compression algorithms. When combined with
gzip, SBC reduces the size of SPEC CPU2000 traces
94-71968 times.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Clocks; Compression algorithms; Computational
modeling; Computer architecture; Computer simulation;
Data mining; Information analysis; instruction and
address trace; Instruments; Predictive models;
Redundancy; simulation; trace compression",
}
@Article{Zhang:2003:WHC,
author = "Chuanjun Zhang and F. Vahid and Jun Yang and W.
Walid",
title = "A Way-Halting Cache for Low-Energy High-Performance
Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "2",
number = "1",
pages = "5--5",
month = jan,
year = "2003",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2003.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We have designed a low power four-way set associative
cache that stores the four lowest-order bits of all way
stags into a fully associative memory, which we call
the halt tag array. The comparison of the halt tag
array with the desired tag occurs concurrently with the
address decoding that determines which tag and data
ways to read from. The halt tag array predetermines
most tags that cannot match due to their low-order four
bits mismatching. Further accesses to ways with known
mismatching tags are then halted, thus saving power.
Our halt tag array has the additional feature of using
static logic only, rather than dynamic logic used in
highly-associative caches, making our cache consumes
even less power. Our result shows55\% savings of memory
access related energy over a conventional four-way
set-associative cache. We show nearly 2x energy savings
compared with highly associative caches, while imposing
no performance overhead and only 2\% cache area over
head.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Cams; Circuits; Computer science; Decoding; Design
engineering; Embedded computing; Logic arrays; Power
engineering and energy; Power engineering computing;
Switches",
}
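
A minimal sketch of the way-halting idea described above, assuming a
four-way set and 4 halt-tag bits per way; the tag values are illustrative,
not from the letter.

# Sketch: a "halt tag array" holds the 4 lowest-order tag bits of every way.
# Ways whose halt tag mismatches the request cannot hit, so their full tag
# and data reads are halted; only the remaining ways are probed.
HALT_BITS = 4
HALT_MASK = (1 << HALT_BITS) - 1

def lookup(full_tags, tag):
    """Return (hit_way or None, list of ways actually probed)."""
    halt_tags = [t & HALT_MASK for t in full_tags]   # small CAM in hardware
    probed = [w for w, h in enumerate(halt_tags) if h == (tag & HALT_MASK)]
    for w in probed:
        if full_tags[w] == tag:
            return w, probed
    return None, probed

if __name__ == "__main__":
    set_tags = [0x3A2, 0x1F1, 0x0B4, 0x7C2]   # hypothetical 4-way set
    way, probed = lookup(set_tags, 0x7C2)
    print("hit in way:", way, "ways probed:", probed)  # only 2 of 4 ways read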
@Article{Cohen:2003:EOP,
author = "A. Cohen and F. Finkelstein and A. Mendelson and R.
Ronen and D. Rudoy",
title = "On Estimating Optimal Performance of {CPU} Dynamic
Thermal Management",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "2",
number = "1",
pages = "6--6",
month = jan,
year = "2003",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2003.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In this paper we focus on dynamic thermal management
(DTM) strategies that use dynamic voltage scaling
(DVS) for power control. We perform a theoretical
analysis targeted at estimating the optimal strategy,
and show two facts: (1) when there is a gap between the
initial and the limit temperatures, it is best to start
with a high (though not necessarily maximal) frequency
and decrease it exponentially until the limit
temperature is reached; (2) when close to the limit
temperature, the best strategy is to stay there.
We use the patterns exhibited by the optimal strategy
in order to analyze some existing DTM techniques.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Costs; DTM; DVS; Energy management; Frequency
estimation; Microprocessors; optimal control; Pattern
analysis; Performance analysis; Temperature control;
Temperature sensors; Thermal management; Voltage
control",
}
@Article{Cristal:2003:CRC,
author = "A. Cristal and J. F. Martinez and J. Llosa and M.
Valero",
title = "A case for resource-conscious out-of-order
processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "2",
number = "1",
pages = "7--7",
month = jan,
year = "2003",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2003.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Modern out-of-order processors tolerate long-latency
memory operations by supporting a large number of
in-flight instructions. This is achieved in part
through proper sizing of critical resources, such as
register files or instruction queues. In light of the
increasing gap between processor speed and memory
latency, tolerating upcoming latencies in this way
would require impractical sizes of such critical
resources. To tackle this scalability problem, we make
a case for resource-conscious out-of-order processors.
We present quantitative evidence that critical
resources are increasingly underutilized in these
processors. We advocate that better use of such
resources should be a priority in future research in
processor architectures.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bars; checkpointing.; Computer aided instruction;
Delay; instruction-level parallelism; Laboratories;
memory latency; Optimal control; Out of order;
Out-of-order processor; Queueing analysis; Registers;
Resource management; resource utilization; Voltage
control",
}
@Article{Citron:2004:ELE,
author = "D. Citron",
title = "Exploiting Low Entropy to Reduce Wire Delay",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "3",
number = "1",
pages = "1--1",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Wires shrink less efficiently than transistors.
Smaller dimensions increase relative delay and the
probability of crosstalk. Solutions to this problem
include adding additional latency with pipelining,
using ``fat wires'' at higher metal levels, and
advances in process and material technology. We propose
a stopgap solution by applying a decade-old technique
called bus-expanding to this problem. By exploiting the
low spatial and temporal entropy of data, it is
possible to transfer m bits of data over an n-bit-wide
bus in a single cycle (m > n). High-entropy data will
be routed directly over the bus, while low-entropy data
will be compacted using small lookup tables. A table
index will be transferred in the case of a successful
lookup; otherwise the full value will be transferred
over several cycles. Reducing the number of wires per
bus enables the use of wider wires, which in turn
reduces the wire delay. Examination of projected
process technologies shows that shrinking the number of
bits in a bus (from 64 to 48) instead of shrinking the
individual wires maintains a constant wire delay. Tests
on SPEC CPU2000 have shown that, for the 64-bit buses
leading from the L1 caches to the processor core, it is
possible to transfer all data types (addresses,
integers, instructions, and floating-point values)
using 40 bits per bus on average.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Area measurement; Compaction; Crosstalk; Delay;
Entropy; Materials science and technology; Pipeline
processing; Power measurement; Transistors; Wire",
}
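
A minimal sketch of the table-based bus compaction idea described above,
assuming 64-bit values on a 48-bit bus, a 256-entry table of upper-32-bit
patterns shared by sender and receiver, and FIFO replacement; all of these
parameters are illustrative assumptions, not the letter's design.

# Sketch: compacting 64-bit values onto a narrower bus. A hit in the shared
# table of upper-32-bit patterns lets one cycle carry {index, low 32 bits};
# a miss sends the full value over two cycles and installs the pattern.
TABLE_SIZE = 256

class CompactingBus:
    def __init__(self):
        self.index_of = {}                # upper-bits pattern -> table slot
        self.slots = [None] * TABLE_SIZE  # table slot -> pattern
        self.next_slot = 0                # FIFO replacement pointer

    def send(self, value: int) -> int:
        """Return the number of bus cycles needed to transfer value."""
        hi = value >> 32
        if hi in self.index_of:
            return 1                      # index + low half fit in one cycle
        old = self.slots[self.next_slot]  # miss: install pattern, evict FIFO
        if old is not None:
            del self.index_of[old]
        self.slots[self.next_slot] = hi
        self.index_of[hi] = self.next_slot
        self.next_slot = (self.next_slot + 1) % TABLE_SIZE
        return 2                          # full value takes two cycles

if __name__ == "__main__":
    bus = CompactingBus()
    addrs = [0x00007FFF00001000 + 8 * i for i in range(8)]  # low-entropy stream
    print([bus.send(a) for a in addrs])   # first transfer misses, the rest hit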
@Article{Singh:2004:GAL,
author = "A. Singh and W. J. Dally and B. Towles and A. K.
Gupta",
title = "Globally Adaptive Load-Balanced Routing on Tori",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "3",
number = "1",
pages = "2--2",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We introduce a new method of adaptive routing on k-ary
n-cubes, Globally Adaptive Load-Balance (GAL). GAL
makes global routing decisions using global
information. In contrast, most previous adaptive
routing algorithms make local routing decisions using
local information (typically channel queue depth). GAL
senses global congestion using segmented injection
queues to decide the directions to route in each
dimension. It further load balances the network by
routing in the selected directions adaptively. Using
global information, GAL achieves the performance
(latency and throughput) of minimal adaptive routing on
benign traffic patterns and performs as well as the
best obliviously load-balanced routing algorithm (GOAL)
on adversarial traffic.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Chaos; Delay; Nearest neighbor searches; Routing;
Stability; Switches; Telecommunication traffic;
Throughput; Tornadoes; Traffic control",
}
@Article{Gomez:2004:EFT,
author = "M. E. Gomez and J. Duato and J. Flich and P. Lopez and
A. Robles and N. A. Nordbotten and O. Lysne and T.
Skeie",
title = "An Efficient Fault-Tolerant Routing Methodology for
Meshes and Tori",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "3",
number = "1",
pages = "3--3",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In this paper we present a methodology to design
fault-tolerant routing algorithms for regular direct
interconnection networks. It supports fully adaptive
routing, does not degrade performance in the absence of
faults, and supports a reasonably large number of
faults without significantly degrading performance. The
methodology is mainly based on the selection of an
intermediate node (if needed) for each
source-destination pair. Packets are adaptively routed
to the intermediate node and, at this node, without
being ejected, they are adaptively forwarded to their
destinations. In order to allow deadlock-free minimal
adaptive routing, the methodology requires only one
additional virtual channel (for a total of three), even
for tori. Evaluation results for a 4 x 4 x 4 torus
network show that the methodology is 5-fault tolerant.
Indeed, for up to 14 link failures, the percentage of
fault combinations supported is higher than 99.96\%.
Additionally, network throughput degrades by less than
10\% when injecting three random link faults without
disabling any node. In contrast, a mechanism similar to
the one proposed in the BlueGene/L, which disables some
network planes, would degrade network throughput by
79\%.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Algorithm design and analysis; Circuit faults;
Degradation; Design methodology; Electronic mail; Fault
tolerance; Multiprocessor interconnection networks;
Routing; Switches; Throughput",
}
@Article{Stine:2004:CAR,
author = "J. M. Stine and N. P. Carter and J. Flich",
title = "Comparing Adaptive Routing and Dynamic Voltage Scaling
for Link Power Reduction",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "3",
number = "1",
pages = "4--4",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We compare techniques that dynamically scale the
voltage of individual network links to reduce power
consumption with an approach in which all links in the
network are set to the same voltage and adaptive
routing is used to distribute load across the network.
Our results show that adaptive routing with static
network link voltages outperforms dimension-order
routing with dynamic link voltages in all cases,
because the adaptive routing scheme can respond more
quickly to changes in network demand. Adaptive routing
with static link voltages also outperforms adaptive
routing with dynamic link voltages in many cases,
although dynamic link voltage scaling gives better
behavior as the demand on the network grows.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Clocks; Dynamic voltage scaling; Energy
consumption; Frequency; Network-on-a-chip; Routing;
Telecommunication traffic; Traffic control; Voltage
control",
}
@Article{Robatmili:2004:TSI,
author = "B. Robatmili and N. Yazdani and S. Sardashti and M.
Nourani",
title = "Thread-Sensitive Instruction Issue for {SMT}
Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "3",
number = "1",
pages = "5--5",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Simultaneous Multi Threading (SMT) is a processor
design method in which concurrent hardware threads
share processor resources like functional units and
memory. The scheduling complexity and performance of an
SMT processor depend on the topology used in the fetch
and issue stages. In this paper, we propose a
thread-sensitive issue policy for a partitioned SMT
processor that is based on a thread metric. We propose
the number of ready-to-issue instructions of each
thread as the priority metric. To evaluate our method, we have
developed a reconfigurable SMT-simulator on top of the
SimpleScalar Toolset. We simulated our modeled
processor under several workloads composed of SPEC
benchmarks. Experimental results show around 30\%
improvement compared to the conventional OLDEST\_FIRST
mixed topology issue policy. Additionally, the hardware
implementation of our architecture with this metric in
issue stage is quite simple.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Clocks; Delay; Frequency; Intrusion detection;
Laboratories; Logic; Processor scheduling;
Surface-mount technology; Topology",
}
@Article{Luo:2004:EES,
author = "Yue Luo and L. K. John",
title = "Efficiently Evaluating Speedup Using Sampled Processor
Simulation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "3",
number = "1",
pages = "6--6",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Cycle accurate simulation of processors is extremely
time consuming. Sampling can greatly reduce simulation
time while retaining good accuracy. Previous research
on sampled simulation has focused on the accuracy
of CPI. However, most simulations are used to evaluate
the benefit of some microarchitectural enhancement, in
which the speedup is a more important metric than CPI.
We employ the ratio estimator from statistical sampling
theory to design efficient sampling to measure speedup
and to quantify its error. We show that to achieve a
given relative error limit for speedup, it is not
necessary to estimate CPI to the same accuracy. In our
experiment, estimating speedup requires about 9X fewer
instructions to be simulated in detail in comparison to
estimating CPI for the same relative error limit.
Therefore using the ratio estimator to evaluate speedup
is much more cost-effective and offers great potential
for reducing simulation time. We also discuss the
reason for this interesting and important result.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application software; Clocks; Computational modeling;
Computer errors; Computer simulation; Frequency;
Microarchitecture; Sampling methods; Size measurement;
Velocity measurement",
}
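
A minimal sketch of speedup estimation with a ratio estimator, as in the
abstract above; the exact formulation below (paired per-interval cycle
counts and a first-order variance approximation) is an editorial
illustration and not necessarily the paper's formulation.

# Sketch: ratio estimator of speedup from paired sampled intervals, with an
# approximate standard error from the classical ratio-estimator variance.
import math

def speedup_estimate(base_cycles, enh_cycles):
    """base_cycles[i] and enh_cycles[i] are the cycles spent in sampled
    interval i on the baseline and enhanced configurations."""
    n = len(base_cycles)
    assert n == len(enh_cycles) and n > 1
    r = sum(base_cycles) / sum(enh_cycles)       # ratio estimator of speedup
    x_bar = sum(enh_cycles) / n
    resid = [y - r * x for y, x in zip(base_cycles, enh_cycles)]
    s2 = sum(d * d for d in resid) / (n - 1)
    se = math.sqrt(s2 / n) / x_bar               # first-order approximation
    return r, se

if __name__ == "__main__":
    base = [120, 150, 90, 200, 130]              # toy cycle counts
    enh = [100, 120, 80, 160, 110]
    r, se = speedup_estimate(base, enh)
    print(f"speedup = {r:.3f} +/- {1.96 * se:.3f} (approx. 95% interval)")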
@Article{Ceze:2004:CHL,
author = "L. Ceze and K. Strauss and J. Tuck and J. Renau and J.
Torrellas",
title = "{CAVA}: Hiding {L2} Misses with Checkpoint-Assisted
Value Prediction",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "3",
number = "1",
pages = "7--7",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Load misses in on-chip L2 caches often end up stalling
modern superscalars. To address this problem, we
propose hiding L2 misses with Checkpoint-Assisted VAlue
prediction (CAVA). When a load misses in L2, a
predicted value is returned to the processor. If the
missing load reaches the head of the reorder buffer
before the requested data is received from memory, the
processor checkpoints, consumes the predicted value,
and speculatively continues execution. When the
requested data finally arrives, it is compared to the
predicted value. If the prediction was correct,
execution continues normally; otherwise, execution
rolls back to the checkpoint. Compared to a baseline
aggressive superscalar, CAVA speeds up execution by a
geometric mean of 1.14 for SPECint and 1.34 for SPECfp
applications. Additionally, CAVA is faster than an
implementation of Runahead execution, and Runahead with
value prediction.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application software; Checkpointing; Costs; Delay;
Hardware; Microarchitecture; Out of order; Pipelines;
Prefetching; Recycling",
}
@Article{Singh:2004:BDB,
author = "A. Singh and W. J. Dally",
title = "Buffer and Delay Bounds in High Radix Interconnection
Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "3",
number = "1",
pages = "8--8",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We apply recent results in queueing theory to propose
a methodology for bounding the buffer depth and packet
delay in high radix interconnection networks. While
most work in interconnection networks has been focused
on the throughput and average latency in such systems,
few studies have been done providing statistical
guarantees for buffer depth and packet delays. These
parameters are key in the design and performance of a
network. We present a methodology for calculating such
bounds for a practical high radix network and through
extensive simulations show its effectiveness for both
bursty and non-bursty injection traffic. Our results
suggest that modest speedups and buffer depths enable
reliable networks without flow control to be
constructed.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Convergence; Delay; Intelligent networks;
Multiprocessor interconnection networks; Queueing
analysis; Supercomputers; Switches; Telecommunication
traffic; Throughput; Traffic control",
}
@Article{Holloway:2004:CPS,
author = "A. L. Holloway and G. S. Sohi",
title = "Characterization of Problem Stores",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "3",
number = "1",
pages = "9--9",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper introduces the concept of problem stores:
static stores whose dependent loads often miss in the
cache. Accurately identifying problem stores allows the
early determination of addresses likely to cause later
misses, potentially allowing for the development of
novel, proactive prefetching and memory hierarchy
management schemes. We present a detailed empirical
characterization of problem stores using the SPEC2000
CPU benchmarks. The data suggests several key
observations about problem stores. First, we find that
the number of important problem stores is typically
quite small; the worst 100 problem stores write the
values that will lead to about 90\% of non-cold misses
for a variety of cache configurations. We also find
that problem stores only account for 1 in 8 dynamic
stores, though they result in 9 of 10 misses.
Additionally, the problem stores' dependent loads miss
in the L2 cache a larger fraction of the time than
loads not dependent on problem stores. We also observe
that the set of problem stores is stable across a variety of
cache configurations. Finally, we found that the
instruction distance from problem store to miss and
problem store to evict is often greater than one
million instructions, but the value is often needed
within 100,000 instructions of the eviction.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Delay; Hardware; Memory management; Prefetching;
Proposals; Timing",
}
@Article{Sazeides:2005:DIB,
author = "Y. Sazeides and R. Kumar and D. M. Tullsen and T.
Constantinou",
title = "The Danger of Interval-Based Power Efficiency Metrics:
When Worst Is Best",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "4",
number = "1",
pages = "1--1",
month = jan,
year = "2005",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2005.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper shows that if the execution of a program is
divided into distinct intervals, it is possible for one
processor or configuration to provide the best power
efficiency over every interval, and yet have worse
overall power efficiency over the entire execution than
other configurations. This unintuitive behavior is a
result of a seemingly intuitive use of power efficiency
metrics, and can result in suboptimal design and
execution decisions. This behavior may occur when using
the energy-delay product and energy-delay-squared
product metrics, but not with the energy metric.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Battery charge measurement; Clocks; Computer science;
Delay; Design optimization; Frequency; Out of order;
Power engineering and energy; Power measurement",
}
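
A tiny numeric illustration of the effect described above, with made-up
numbers rather than data from the paper: configuration A has the lower
energy-delay product in every interval, yet configuration B has the lower
EDP over the whole run, because EDP is not additive across intervals.

# Sketch: per-interval (energy, delay) pairs for two configurations.
intervals_A = [(1.0, 10.0), (10.0, 1.0)]
intervals_B = [(2.0, 6.0), (6.0, 2.0)]

def edp(energy, delay):
    return energy * delay

for i, (a, b) in enumerate(zip(intervals_A, intervals_B)):
    print(f"interval {i}: EDP A = {edp(*a):.0f}, EDP B = {edp(*b):.0f}")  # A wins both

total_A = (sum(e for e, _ in intervals_A), sum(d for _, d in intervals_A))
total_B = (sum(e for e, _ in intervals_B), sum(d for _, d in intervals_B))
print(f"whole run: EDP A = {edp(*total_A):.0f}, EDP B = {edp(*total_B):.0f}")  # B wins, 64 < 121

The per-interval winner and the whole-run winner disagree because the
product of sums is not the sum of products.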
@Article{Mutlu:2005:RRP,
author = "O. Mutlu and Hyesoon Kim and J. Stark and Y. N. Patt",
title = "On Reusing the Results of Pre-Executed Instructions in
a Runahead Execution Processor",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "4",
number = "1",
pages = "2--2",
month = jan,
year = "2005",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2005.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Previous research on runahead execution took it for
granted as a prefetch-only technique. Even though the
results of instructions independent of an L2 miss are
correctly computed during runahead mode, previous
approaches discarded those results instead of trying to
utilize them in normal mode execution. This paper
evaluates the effect of reusing the results of
preexecuted instructions on performance. We find that,
even with an ideal scheme, it is not worthwhile to
reuse the results of preexecuted instructions. Our
analysis provides insights into why result reuse does
not provide significant performance improvement in
runahead processors and concludes that runahead
execution should be employed as a prefetching mechanism
rather than a full-blown prefetching/result-reuse
mechanism.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Computational modeling; Computer aided
instruction; Delay; Energy consumption;
Microprocessors; Performance analysis; Prefetching;
Registers",
}
@Article{Zhang:2006:BIC,
author = "Chuanjun Zhang",
title = "Balanced instruction cache: reducing conflict misses
of direct-mapped caches through balanced subarray
accesses",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "2--5",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "It is observed that the limited memory space of
direct-mapped caches is not used in balance therefore
incurs extra conflict misses. We propose a novel cache
organization of a balanced cache, which balances
accesses to cache sets at the granularity of cache
subarrays. The key technique of the balanced cache is a
programmable subarray decoder through which the mapping
of memory reference addresses to cache subarrays can be
optimized hence conflict misses of direct-mapped caches
can be resolved. The experimental results show that the
miss rate of the balanced cache is lower than that of
same-sized two-way set-associative caches on average
and can be as low as that of same-sized four-way
set-associative caches for particular applications.
Compared with previous techniques, the balanced cache
requires only one cycle to access all cache hits and
has the same access time as direct-mapped caches.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "balanced instruction cache; balanced subarray
accesses; Bridges; Cache memory; cache organization;
cache storage; Clocks; conflict miss reduction;
Decoding; Delay; Frequency; High performance computing;
programmable subarray decoder; storage allocation",
}
@Article{Ottoni:2006:SPC,
author = "G. Ottoni and R. Rangan and A. Stoler and M. J.
Bridges and D. I. August",
title = "From sequential programs to concurrent threads",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "6--9",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Chip multiprocessors are of increasing importance due
to difficulties in achieving higher clock frequencies
in uniprocessors, but their success depends on finding
useful work for the processor cores. This paper
addresses this challenge by presenting a simple
compiler approach that extracts non-speculative
thread-level parallelism from sequential codes. We
present initial results from this technique targeting a
validated dual-core processor model, achieving speedups
ranging from 9-48\% with an average of 25\% for
important benchmark loops over their single-threaded
versions. We also identify important next steps found
during our pursuit of higher degrees of automatic
threading",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "automatic threading; Bridges; Clocks; Computer
science; concurrency control; concurrent threads;
Frequency; Hardware; Microprocessors; multi-threading;
nonspeculative thread-level parallelism; Parallel
processing; Pipeline processing; program compiler;
program compilers; Program processors; sequential
programs",
}
@Article{Gupta:2006:TOI,
author = "A. K. Gupta and W. J. Dally",
title = "Topology optimization of interconnection networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "10--13",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper describes an automatic optimization tool
that searches a family of network topologies to select
the topology that best achieves a specified set of
design goals while satisfying specified packaging
constraints. Our tool uses a model of signaling
technology that relates bandwidth, cost and distance of
links. This model captures the distance-dependent
bandwidth of modern high-speed electrical links and the
cost differential between electrical and optical links.
Using our optimization tool, we explore the design
space of hybrid Clos-torus (C-T) networks. For a
representative set of packaging constraints we
determine the optimal hybrid C-T topology to minimize
cost and the optimal C-T topology to minimize latency
for various packet lengths. We then use the tool to
measure the sensitivity of the optimal topology to
several important packaging constraints such as pin
count and critical distance",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Constraint optimization; Costs; Design
optimization; hybrid Clos-torus networks;
interconnection networks; Multiprocessor
interconnection networks; multistage interconnection
networks; Network topology; Optical fiber
communication; Packaging; signaling technology;
signalling; Space exploration; Space technology;
telecommunication network topology; topology
optimization tool",
}
@Article{Gaudiot:2006:F,
author = "J.-L. Gaudiot and Y. Patt and K. Skadron",
title = "Foreword",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "11--11",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.11",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Foreword for issue 1 of 2006",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; Computer Society; Concrete;
Delay; Footwear; Software libraries; Vehicles",
}
@Article{Morad:2006:PPE,
author = "T. Y. Morad and U. C. Weiser and A. Kolodny and M.
Valero and E. Ayguade",
title = "Performance, power efficiency and scalability of
asymmetric cluster chip multiprocessors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "14--17",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper evaluates asymmetric cluster chip
multiprocessor (ACCMP) architectures as a mechanism to
achieve the highest performance for a given power
budget. ACCMPs execute serial phases of multithreaded
programs on large high-performance cores whereas
parallel phases are executed on a mix of large and many
small simple cores. Theoretical analysis reveals a
performance upper bound for symmetric multiprocessors,
which is surpassed by asymmetric configurations at
certain power ranges. Our emulations show that
asymmetric multiprocessors can reduce power consumption
by more than two thirds with similar performance
compared to symmetric multiprocessors",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ACCMP; Application software; asymmetric cluster chip
multiprocessors; Chip Multiprocessors; Emulation;
Frequency; microprocessor chips; multi-threading;
multiprocessing systems; multithreaded program;
Optimized production technology; Parallel processing;
parallel processing; power consumption reduction; power
efficiency; Power Efficiency; Power system modeling;
Queueing analysis; Scalability; Upper bound; Voltage",
}
@Article{Riley:2006:PCU,
author = "N. Riley and C. Zilles",
title = "Probabilistic counter updates for predictor hysteresis
and bias",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "18--21",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Hardware predictor designers have incorporated
hysteresis and/or bias to achieve desired behavior by
increasing the number of bits per counter. Some
resulting proposed predictor designs are currently
impractical because their counter tables are too large.
We describe a method for dramatically reducing the
amount of storage required for a predictor's counter
table with minimal impact on prediction accuracy.
Probabilistic updates to counter state are implemented
using a hardware pseudo-random number generator to
increment or decrement counters a fraction of the time,
meaning fewer counter bits are required. We demonstrate
the effectiveness of probabilistic updates in the
context of Fields et al.'s critical path predictor,
which employs a biased 6-bit counter. Averaged across
the SPEC CINT2000 benchmarks, our 2-bit and 3-bit
probabilistic counters closely approximate a 6-bit
deterministic one (achieving speedups of 7.75\% and
7.91\% compared to 7.94\%) when used for
criticality-based scheduling in a clustered machine.
Performance degrades gracefully, enabling even a 1-bit
probabilistic counter to outperform the best 3-bit
deterministic counter we found",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accuracy; clustered machine; computer architecture;
Computer science; Costs; Counting circuits; critical
path predictor; criticality-based scheduling;
Degradation; Hardware; hardware predictor design;
hardware pseudorandom number generator; Hysteresis;
Microarchitecture; Pipelines; predictor bias; predictor
hysteresis; predictors counter table; probabilistic
counter update; probability; Processor scheduling;
processor scheduling; random number generation",
}
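
For illustration, a minimal Python sketch of the probabilistic counter-update idea summarized in the abstract above: a narrow saturating counter is incremented or decremented only a fraction of the time, so on average it tracks what a wider deterministic counter would. The class name, bit widths, and the use of Python's random module in place of a hardware pseudo-random number generator are assumptions made here for illustration, not details taken from the paper.

import random

class ProbabilisticCounter:
    """k-bit saturating counter whose updates are applied only with
    probability 1/2**shift, approximating a (k + shift)-bit deterministic
    counter on average (illustrative sketch only)."""

    def __init__(self, bits=2, shift=4, seed=0):
        self.max_value = (1 << bits) - 1   # saturation point
        self.shift = shift                 # accept 1 update out of 2**shift
        self.value = 0
        self.rng = random.Random(seed)     # stands in for a hardware PRNG

    def update(self, increment):
        # Randomly drop most updates; the stored state changes only rarely.
        if self.rng.randrange(1 << self.shift) != 0:
            return
        if increment:
            self.value = min(self.value + 1, self.max_value)
        else:
            self.value = max(self.value - 1, 0)

    def predict(self):
        # Interpret the upper half of the counter range as the biased outcome.
        return self.value > self.max_value // 2

Fed a long, biased update stream, a 2-bit counter with shift=4 behaves on average like a 6-bit deterministic counter fed the same stream, which is the kind of substitution the abstract evaluates.
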
@Article{Zhou:2006:CFT,
author = "Huiyang Zhou",
title = "A case for fault tolerance and performance enhancement
using chip multi-processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "22--25",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper makes a case for using multi-core
processors to simultaneously achieve transient-fault
tolerance and performance enhancement. Our approach is
extended from a recent latency-tolerance proposal,
dual-core execution (DCE). In DCE, a program is
executed twice in two processors, named the front and
back processors. The front processor pre-processes
instructions in a very fast yet highly accurate way and
the back processor re-executes the instruction stream
retired from the front processor. The front processor
runs faster as it has no correctness constraints
whereas its results, including timely prefetching and
prompt branch misprediction resolution, help the back
processor make faster progress. In this paper, we
propose to entrust the speculative results of the front
processor and use them to check the un-speculative
results of the back processor. A discrepancy, either
due to a transient fault or a mispeculation, is then
handled with the existing mispeculation recovery
mechanism. In this way, both transient-fault tolerance
and performance improvement can be delivered
simultaneously with little hardware overhead",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "back processor; chip multiprocessors; Computer aided
software engineering; dual-core execution; Error
analysis; Fault tolerance; fault tolerant computing;
front processor; Hardware; latency-tolerance proposal;
microprocessor chips; mispeculation recovery mechanism;
Multicore processing; multiprocessing systems;
prefetching; Prefetching; prompt branch misprediction
resolution; Proposals; Redundancy; storage management;
Throughput; transient-fault tolerance; Transistors",
}
@Article{Lee:2006:ASC,
author = "Moon-Sang Lee and Sang-Kwon Lee and Joonwon Lee and
Seung-Ryoul Maeng",
title = "Adopting system call based address translation into
user-level communication",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "26--29",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "User-level communication alleviates the software
overhead of the communication subsystem by allowing
applications to access the network interface directly.
For that purpose, efficient address translation of
virtual address to physical address is critical. In
this study, we propose a system call based address
translation scheme where every translation is done by
the kernel instead of a translation cache on a network
interface controller as in the previous cache based
address translation. According to our experiments, our
scheme achieves up to 4.5\% reduction in application
execution time compared to the previous cache based
approach",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application software; cache based approach; cache
storage; Communication system software; Control
systems; Costs; Delay; Electronic mail; Hardware;
Kernel; network interface controller; network
interfaces; Network interfaces; operating system
kernels; Protocols; software overhead; system call
based address translation; user-level communication",
}
@Article{Ahn:2006:DPA,
author = "Jung Ho Ahn and W. J. Dally",
title = "Data parallel address architecture",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "30--33",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Data parallel memory systems must maintain a large
number of outstanding memory references to fully use
increasing DRAM bandwidth in the presence of increasing
latency. At the same time, the throughput of modern
DRAMs is very sensitive to access patterns due to the
time required to precharge and activate banks and to
switch between read and write access. To achieve memory
reference parallelism a system may simultaneously issue
references from multiple reference threads.
Alternatively multiple references from a single thread
can be issued in parallel. In this paper, we examine
this tradeoff and show that allowing only a single
thread to access DRAM at any given time significantly
improves performance by increasing the locality of the
reference stream and hence reducing precharge/activate
operations and read/write turnaround. Simulations of
scientific and multimedia applications show that
generating multiple references from a single thread
gives, on average, 17\% better performance than
generating references from two parallel threads",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Computer architecture; data parallel
address architecture; data parallel memory systems;
Delay; DRAM bandwidth; DRAM chips; Memory management;
parallel architectures; parallel memories; Parallel
processing; Random access memory; read access;
Scheduling; Streaming media; Switches; write access",
}
@Article{Eisley:2006:NCC,
author = "N. Eisley and Li-Shiuan Peh and Li Shang",
title = "In-network cache coherence",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "34--37",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We propose implementing cache coherence protocols
within the network, demonstrating how an in-network
implementation of the MSI directory-based protocol
allows for in-transit optimizations of read and write
delay. Our results show 15\% and 24\% savings on
average in memory access latency for SPLASH-2 parallel
benchmarks running on a $4 \times 4$ and a $16 \times 16$
multiprocessor respectively",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Access protocols; benchmark testing; cache coherence;
cache storage; Coherence; Delay; delays; Fabrics;
interconnection network; memory access latency; Memory
architecture; memory architecture; memory protocols;
Moore's Law; MSI directory-based protocol;
Multiprocessor interconnection networks; network cache
coherence protocols; parallel processing; read delay;
SPLASH-2 parallel benchmarks; write delay",
}
@Article{Srinivasan:2006:PMU,
author = "R. Srinivasan and J. Cook and O. Lubeck",
title = "Performance modeling using {Monte Carlo} simulation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "38--41",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.10",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/intel-ia-64.bib",
abstract = "Cycle accurate simulation has long been the primary
tool for micro-architecture design and evaluation.
Though accurate, the slow speed often imposes
constraints on the extent of design exploration. In
this work, we propose a fast, accurate Monte-Carlo
based model for predicting processor performance. We
apply this technique to predict the CPI of in-order
architectures and validate it against the Itanium-2.
The Monte Carlo model uses micro-architecture-independent
application characteristics, together with cache and
branch predictor statistics, to predict CPI with an
average error of less than 7\%. Since prediction is
achieved in a few seconds, the model can be used for
fast design space exploration that can efficiently cull
the space for cycle-accurate simulations. Besides
accurately predicting CPI, the model also breaks down
CPI into various components, where each component
quantifies the effect of a particular stall condition
(branch misprediction, cache miss, etc.) on overall
CPI. Such a CPI decomposition can help processor
designers quickly identify and resolve critical
performance bottlenecks",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "branch predictor statistics; Computational modeling;
Computer architecture; CPI decomposition; design space
exploration; Error analysis; Itanium-2; Laboratories;
Mathematical analysis; memory architecture;
microarchitecture design; microarchitecture evaluation;
Monte Carlo methods; Monte Carlo simulation;
performance evaluation; Predictive models; Process
design; processor performance modeling; program
processors; Sampling methods; Space exploration",
}
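
A toy Monte Carlo sketch in the spirit of the abstract above, showing how per-instruction stall events drawn from measured statistics can be sampled to predict CPI and to break it down into components. The event list, probabilities, penalties, and the additive-penalty assumption are invented for illustration; they are not the paper's model or its validated parameters.

import random

# event name: (probability per instruction, penalty in cycles) -- illustrative
STALL_MODEL = {
    "branch_mispredict": (0.01, 15),
    "l1_miss":           (0.05, 10),
    "l2_miss":           (0.01, 150),
}
BASE_CPI = 1.0  # assumed ideal in-order throughput

def monte_carlo_cpi(n_instructions=100_000, seed=0):
    """Sample stall events per instruction and accumulate CPI components."""
    rng = random.Random(seed)
    components = {name: 0.0 for name in STALL_MODEL}
    cycles = BASE_CPI * n_instructions
    for _ in range(n_instructions):
        for name, (prob, penalty) in STALL_MODEL.items():
            if rng.random() < prob:
                cycles += penalty
                components[name] += penalty
    cpi = cycles / n_instructions
    breakdown = {name: c / n_instructions for name, c in components.items()}
    return cpi, breakdown

if __name__ == "__main__":
    cpi, breakdown = monte_carlo_cpi()
    print(f"predicted CPI ~ {cpi:.3f}")
    for name, contribution in sorted(breakdown.items()):
        print(f"  {name}: {contribution:.3f} CPI")

Because each sampled instruction is cheap to evaluate, a prediction of this kind completes in seconds rather than the hours of cycle-accurate simulation, which is the trade-off the abstract emphasizes.
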
@Article{Ergin:2006:ENV,
author = "O. Ergin and O. Unsal and X. Vera and A. Gonzalez",
title = "Exploiting Narrow Values for Soft Error Tolerance",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "2",
pages = "12--12",
month = feb,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.12",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Soft errors are an important challenge in contemporary
microprocessors. Particle hits on the components of a
processor are expected to create an increasing number
of transient errors with each new microprocessor
generation. In this paper we propose simple mechanisms
that effectively reduce the vulnerability to soft
errors in a processor. Our designs are generally
motivated by the fact that many of the produced and
consumed values in the processors are narrow and their
upper order bits are meaningless. Soft errors caused by
any particle strike to these higher order bits can be
avoided by simply identifying these narrow values.
Alternatively, soft errors can be detected or corrected
on the narrow values by replicating the vulnerable
portion of the value inside the storage space provided
for the upper order bits of these operands. We offer a
variety of schemes that make use of narrow values and
analyze their efficiency in reducing the soft error
vulnerability of the level-1 data cache of the processor"
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache storage; Cache storage; contemporary
microprocessors; data cache; Data Cache; Error
correction; error correction; Error Correction; error
correction; error detection; Hardware; Impurities;
Manufacturing; microprocessor chips; Microprocessors;
Multithreading; Narrow Values; narrow values; Neutrons;
particle strike; Process design; radiation effects;
Random access memory; soft error tolerance; Soft
Errors; system recovery; transient errors; transients",
}
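
A minimal sketch of the value-replication idea for narrow values described in the abstract above, assuming a 64-bit word whose upper half is meaningless (zero) for narrow, zero-extended values. With plain duplication a corrupted half can be detected by comparing the two copies; correcting it would need additional information such as per-half parity. All names and widths here are illustrative assumptions, not the paper's schemes.

WIDTH = 64
HALF = WIDTH // 2
HALF_MASK = (1 << HALF) - 1

def is_narrow(value):
    """True if the (zero-extended) value fits entirely in the lower half."""
    return (value >> HALF) == 0

def protect(value):
    """Replicate the live lower half into the otherwise-unused upper half."""
    assert is_narrow(value)
    return (value & HALF_MASK) | ((value & HALF_MASK) << HALF)

def check(stored):
    """Compare the two copies; returns (copies_agree, lower_half_value).
    A disagreement signals a soft error in one of the halves."""
    lo = stored & HALF_MASK
    hi = (stored >> HALF) & HALF_MASK
    return lo == hi, lo

Example: protect(0x1234) stores 0x0000123400001234; flipping any single bit in either half makes check() report a mismatch, which is the detection case the abstract describes for the L1 data cache.
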
@Article{Li:2006:PBH,
author = "W. Li and S. Mohanty and K. Kavi",
title = "A Page-based Hybrid (Software--Hardware) Dynamic
Memory Allocator",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "2",
pages = "13--13",
month = feb,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.13",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/java2000.bib",
abstract = "Modern programming languages often include complex
mechanisms for dynamic memory allocation and garbage
collection. These features drive the need for more
efficient implementation of memory management
functions, both in terms of memory usage and execution
performance. In this paper, we introduce a software and
hardware co-design to improve the speed of the software
allocator used in FreeBSD systems. The hardware
complexity of our design is independent of the dynamic
memory size, thus making the allocator suitable for any
memory size. Our design improves the performance of
memory management intensive benchmarks by as much as
43\%. To our knowledge, this is the first-ever work of
this kind, introducing a ``hybrid memory allocator''",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application software; Computer languages; Computer
science; Costs; Delay; Dynamic programming; garbage
collection; Hardware; hardware complexity;
hardware-software codesign; hybrid dynamic memory
allocator; Java; memory allocator; memory architecture;
memory management; Memory management; modern
programming languages; software allocator; Software
performance; software-hardware co-design;
software/hardware co-design; storage allocation;
storage management",
}
@Article{Donald:2006:EPP,
author = "J. Donald and M. Martonosi",
title = "An Efficient, Practical Parallelization Methodology
for Multicore Architecture Simulation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "2",
pages = "14--14",
month = feb,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.14",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Multiple core designs have become commonplace in the
processor market, and are hence a major focus in modern
computer architecture research. Thus, for both product
development and research, multiple core processor
simulation environments are necessary. A well-known
positive feedback property of computer design is that
we use today's computers to design tomorrow's. Thus,
with the emergence of chip multiprocessors, it is
natural to re-examine simulation environments written
to exploit parallelism. In this paper we present a
programming methodology for directly converting
existing uniprocessor simulators into parallelized
multiple-core simulators. Our method not only takes
significantly less development effort compared to some
previously used programming techniques, but also possesses
advantages by retaining a modular and comprehensible
programming structure. We demonstrate our case with
actual developed products after applying this method to
two different simulators, one developed from IBM
Ibrandot and the other from the SimpleScalar tool set.
Our SimpleScalar-based framework achieves a parallel
speedup of $2.2\times$ on a dual-CPU dual-core (4-way)
Opteron server",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "chip multiprocessors; comprehensible programming
structure; Computational modeling; Computer
architecture; Computer simulation; Feedback; IBM
Ibrandot; logic simulation; microcomputers; modern
computer architecture; modular programming structure;
multicore; multicore architecture simulation; Multicore
processing; multiple core processor simulation;
multiprocessing systems; Object oriented modeling;
parallel architectures; Parallel processing; Parallel
programming; parallelism; parallelization method;
parallelized multiple-core simulators; positive
feedback property; Process planning; Product
development; programming methodology; SimpleScalar tool
set; simulation",
}
@Article{Bracy:2006:DAC,
author = "A. Bracy and K. Doshi and Q. Jacobson",
title = "Disintermediated Active Communication",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "2",
pages = "15--15",
month = feb,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.15",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Disintermediated active communication (DAC) is a new
paradigm of communication in which a sending thread
actively engages a receiving thread when sending it a
message via shared memory. DAC is different than
existing approaches that use passive communication
through shared-memory --- based on intermittently
checking for messages --- or that use preemptive
communication but must rely on intermediaries such as
the operating system or dedicated interrupt channels.
An implementation of DAC builds on existing cache
coherency support and exploits light-weight user-level
interrupts. Inter-thread communication occurs via
monitored memory locations where the receiver thread
responds to invalidations of monitored addresses with a
light-weight user-level software-defined handler.
Address monitoring is supported by cache line
user-bits, or CLUbits. CLUbits reside in the cache next
to the coherence state, are private per thread, and
maintain user-defined per-cache-line state. A
light-weight software library can demultiplex asynchronous
notifications and handle exceptional cases. In
DAC-based programs threads coordinate with one another
by explicit signaling and implicit resource monitoring.
With the simple and direct communication primitives of
DAC, multi-threaded workloads synchronize at a finer
granularity and more efficiently utilize the hardware
of upcoming multi-core designs. This paper introduces
DAC, presents several signaling models for DAC-based
programs, and describes a simple memory-based framework
that supports DAC by leveraging existing
cache-coherency models. Our framework is general enough
to support uses beyond DAC",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "address monitoring; cache coherency; cache line
user-bits; cache storage; CLUbits; Computer aided
instruction; Concurrent computing; disintermediated
active communication; Hardware; High performance
computing; interrupts; interthread communication;
memory locations; Monitoring; multi-threading;
multicore designs; Operating systems; Processor
scheduling; Programming profession; resource
monitoring; shared memory; shared memory systems;
signaling models; software libraries; Software
libraries; software library; storage allocation;
user-level interrupts",
}
@Article{Mallik:2006:UDF,
author = "A. Mallik and B. Lin and G. Memik and P. Dinda and R.
P. Dick",
title = "User-Driven Frequency Scaling",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "2",
pages = "16--16",
month = feb,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.16",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We propose and evaluate user-driven frequency scaling
(UDFS) for improved power management on processors that
support dynamic voltage and frequency scaling (DVFS),
e.g., those used in current laptop and desktop
computers. UDFS dynamically adapts CPU frequency to the
individual user and the workload through a simple user
feedback mechanism, unlike currently-used DVFS methods
which rely only on CPU utilization. Our UDFS algorithms
dramatically reduce typical operating frequencies while
maintaining performance at satisfactory levels for each
user. We evaluated our techniques through user studies
conducted on a Pentium M laptop running Windows
applications. The UDFS scheme reduces measured system
power by 22.1\%, averaged across all our users and
applications, compared to the Windows XP DVFS scheme",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Central Processing Unit; computer power supplies; CPU
frequency; DVFS; dynamic frequency scaling; Dynamic
voltage scaling; dynamic voltage scaling; Energy
consumption; Energy management; Engineering management;
Feedback; Frequency control; improved power management;
microprocessor chips; Pentium M laptop; Portable
computers; power aware computing; Power engineering
computing; Power Management; Power measurement; user
feedback mechanism; User-aware computing; user-driven
frequency scaling; Windows XP DVFS scheme",
}
@Article{Blundell:2006:STM,
author = "C. Blundell and E. C. Lewis and M. M. K. Martin",
title = "Subtleties of transactional memory atomicity
semantics",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "2",
pages = "17--17",
month = feb,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.18",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Transactional memory has great potential for
simplifying multithreaded programming by allowing
programmers to specify regions of the program that must
appear to execute atomically. Transactional memory
implementations then optimistically execute these
transactions concurrently to obtain high performance.
This work shows that the same atomic guarantees that
give transactions their power also have unexpected and
potentially serious negative effects on programs that
were written assuming narrower scopes of atomicity. We
make four contributions: (1) we show that a direct
translation of lock-based critical sections into
transactions can introduce deadlock into otherwise
correct programs, (2) we introduce the terms strong
atomicity and weak atomicity to describe the
interaction of transactional and non-transactional
code, (3) we show that code that is correct under weak
atomicity can deadlock under strong atomicity, and (4)
we demonstrate that sequentially composing
transactional code can also introduce deadlocks. These
observations invalidate the intuition that transactions
are strictly safer than lock-based critical sections,
that strong atomicity is strictly safer than weak
atomicity, and that transactions are always
composable",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer languages; Computer Systems Organization;
Concurrent distributed and parallel languages;
deadlock; direct translation; Hardware; Information
science; Interference; Interleaved codes; Language
Classifications; Law; lock-based critical sections;
Multi-core/single-chip multiprocessors;
multi-threading; Multiple Data Stream Architectures
(Multiprocessors); multithreaded programming;
nontransactional code; operating systems (computers);
Parallel Architectures; Processor Architectures;
program verification; Programming Languages;
Programming profession; sequentially composing
transactional code; Software performance;
Software/Software Engineering; strong atomicity; System
recovery; Transaction databases; transaction
processing; transactional memory atomicity semantics;
weak atomicity",
}
@Article{Price:2006:CCT,
author = "G. Price and M. Vachharajani",
title = "A Case for Compressing Traces with {BDDs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "2",
pages = "18--18",
month = feb,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.17",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Instruction-level traces are widely used for program
and hardware analysis. However, program traces for just
a few seconds of execution are enormous, up to several
terabytes in size, uncompressed. Specialized
compression can shrink traces to a few gigabytes, but
trace analyzers typically stream the decompressed trace
through the analysis engine. Thus, the complexity of
analysis depends on the decompressed trace size (even
though the decompressed trace is never stored to disk).
This makes many global or interactive analyses
infeasible. This paper presents a method to compress
program traces using binary decision diagrams (BDDs).
BDDs intrinsically support operations common to many
desirable program analyses and these analyses operate
directly on the BDD. Thus, they are often polynomial in
the size of the compressed representation. The paper
presents mechanisms to represent a variety of trace
data using BDDs and shows that BDDs can store, in 1 GB
of RAM, the entire data-dependence graph of traces with
over 1 billion instructions. This allows rapid
computation of global analyses such as heap-object
liveness and dynamic slicing",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "binary decision diagrams; Binary decision diagrams;
Boolean functions; Data analysis; Data structures;
data-dependence graph; dynamic slicing; Engines; global
analyses; Hardware; hardware analysis; heap-object
liveness; instruction-level traces; Performance
analysis; Polynomials; program analysis; program
slicing; program traces; rapid computation; Read-write
memory; Software Engineering; Software Processor
validation Engineering; Software/Program Verification;
Software/Software; Software/Software Engineering;
specialized compression; Testing and Debugging; trace
analyzers; traces compression; Tracing; Validation;
Visualization",
}
@Article{MoretoPlanas:2007:EDC,
author = "M. {Moreto Planas} and F. Cazorla and A. Ramirez and
M. Valero",
title = "Explaining Dynamic Cache Partitioning Speed Ups",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "1",
pages = "1--4",
month = jan,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Cache partitioning has been proposed as an interesting
alternative to traditional eviction policies of shared
cache levels in modern CMP architectures: throughput is
improved at the expense of a reasonable cost. However,
these new policies present different behaviors
depending on the applications that are running in the
architecture. In this paper, we introduce some metrics
that characterize applications and allow us to give a
clear and simple model to explain final throughput
speed ups.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "B Hardware; B.3 Memory Structures; B.3.2 Design
Styles; B.3.2.b Cache memories; B.3.3 Performance
Analysis and Design Aids; C Computer Systems
Organization; C.1 Processor Architectures; C.1.4
Parallel Architectures; C.1.4.e Multi-core/single-chip
multiprocessors; C.1.5 Micro-architecture
implementation considerations; C.1.5.e Memory
hierarchy; C.4 Performance of Systems; C.4.d Modeling
techniques; cache storage; chip multiprocessing;
Computer architecture; Counting circuits; dynamic cache
partitioning; microprocessor chips; Parallel
processing; Process design; Resource management; shared
cache levels; Streaming media; Surface-mount
technology; Throughput; Uninterruptible power systems",
}
@Article{Jerger:2007:CSC,
author = "N. Enright Jerger and M. Lipasti and L. Peh",
title = "Circuit-Switched Coherence",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "1",
pages = "5--8",
month = jan,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Circuit-switched networks can significantly lower the
communication latency between processor cores, when
compared to packet-switched networks, since once
circuits are set up, communication latency approaches
pure interconnect delay. However, if circuits are not
frequently reused, the long set up time and poorer
interconnect utilization can hurt overall performance.
To combat this problem, we propose a hybrid router
design which intermingles packet-switched flits with
circuit-switched flits. Additionally, we co-design a
prediction-based coherence protocol that leverages the
existence of circuits to optimize pair-wise sharing
between cores. The protocol allows pair-wise sharers to
communicate directly with each other via circuits and
drives up circuit reuse. Circuit-switched coherence
provides overall system performance improvements of up
to 17\% with an average improvement of 10\% and reduces
network latency by up to 30\%.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; C Computer Systems Organization; C.1
Processor Architectures; C.1.4 Parallel Architectures;
C.1.4.e Multi-core/single-chip multiprocessors; C.1.4.g
On-chip interconnection networks; C.1.5
Micro-architecture implementation considerations;
C.1.5.e Memory hierarchy; circuit switching;
circuit-switched network; Coupling circuits; Delay;
Fabrics; hybrid router design; Integrated circuit
interconnections; multiprocessor interconnection
networks; network latency; Network-on-a-chip; packet
switching; Packet switching; packet switching;
pair-wise sharing; Pipelines; prediction-based
coherence protocol; processor core; Protocols; routing
protocols; System performance",
}
@Article{Kodakara:2007:CRM,
author = "S. Kodakara and J. Kim and D. Lilja and D. Hawkins and
W. Hsu and P. Yew",
title = "{CIM}: a Reliable Metric for Evaluating Program Phase
Classifications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "1",
pages = "9--12",
month = jan,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We propose the use of the confidence interval of
estimated mean (CIM), a metric based on statistical
sampling theory, to evaluate the quality of a given
phase classification and for comparing different phase
classification schemes. Previous research on phase
classification used the weighted average of coefficient
of variation (CoVwa) to estimate phase classification
quality. We found that the phase quality indicated by
CoVwa could be inconsistent across different phase
classifications. We explain the reasons behind this
inconsistency and demonstrate the inconsistency using
data from several SPEC CPU2000 benchmark programs. We
show that the confidence interval of estimated mean
(CIM) correctly estimates the quality of phase
classification with a meaningful statistical
interpretation.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Acceleration; Benchmark Analysis; Clustering
algorithms; Computer architecture; computer
architecture; Computer integrated manufacturing;
confidence interval; estimated mean; estimation theory;
pattern classification; Phase Classification; Phase
detection; Phase estimation; Phase measurement; phase
quality estimation; program compilers; program
diagnostics; program phase classification; Quality
Metric; reliable metric; Sampling methods; sampling
methods; SPEC CPU2000 benchmark program; statistical
interpretation; Statistical Sampling; statistical
sampling theory; Statistics; Surges",
}
@Article{Dieter:2007:LCM,
author = "W. R. Dieter and A. Kaveti and H. G. Dietz",
title = "Low-Cost Microarchitectural Support for Improved
Floating-Point Accuracy",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "1",
pages = "13--16",
month = jan,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Some processors designed for consumer applications,
such as graphics processing units (GPUs) and the CELL
processor, promise outstanding floating-point
performance for scientific applications at commodity
prices. However, IEEE single precision is the most
precise floating-point data type these processors
directly support in hardware. Pairs of native
floating-point numbers can be used to represent a base
result and a residual term to increase accuracy, but
the resulting order of magnitude slowdown dramatically
reduces the price/performance advantage of these
systems. By adding a few simple microarchitectural
features, acceptable accuracy can be obtained with
relatively little performance penalty. To reduce the
cost of native-pair arithmetic, a residual register is
used to hold information that would normally have been
discarded after each floating-point computation. The
residual register dramatically simplifies the code,
providing both lower latency and better
instruction-level parallelism.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application software; B Hardware; B.2 Arithmetic and
Logic Structures; B.2.4 High-Speed Arithmetic; B.2.4.b
Cost/performance; C Computer Systems Organization; C.0
General; C.0.b Hardware/software interfaces; C.1
Processor Architectures; C.1.5 Micro-architecture
implementation considerations; CELL processor; computer
architecture; Costs; floating point arithmetic;
floating-point accuracy; Floating-point arithmetic; G
Mathematics of Computing; G.1 Numerical Analysis; G.1.0
General; G.1.0.e Multiple precision arithmetic;
Graphics; graphics processing units; Hardware; I
Computing Methodologies; I.3 Computer Graphics; I.3.1
Hardware Architecture; I.3.1.a Graphics processors;
IEEE single precision; instruction-level parallelism;
microarchitectural support; Microarchitecture; parallel
processing; Pipelines; Registers; Software algorithms;
Software performance",
}
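
The residual register described in the abstract above targets software "native-pair" arithmetic, whose basic building block is the error-free addition below (Knuth's two-sum). The sketch shows why the software version costs several extra floating-point operations per add, the overhead a residual register is meant to remove; native_pair_add is a generic textbook double-single style sum, not the paper's exact scheme, and all function names are chosen here for illustration.

def two_sum(a, b):
    """Error-free addition: returns (s, err) with a + b == s + err exactly,
    where s is the rounded sum and err is the rounding residual that a
    residual register would expose directly in hardware."""
    s = a + b
    b_virtual = s - a
    a_virtual = s - b_virtual
    err = (a - a_virtual) + (b - b_virtual)
    return s, err

def native_pair_add(hi1, lo1, hi2, lo2):
    """Add two native-pair values (hi + lo); illustrative double-single sum."""
    s, e = two_sum(hi1, hi2)
    e += lo1 + lo2
    hi = s + e            # renormalize so |lo| stays small relative to |hi|
    lo = e - (hi - s)
    return hi, lo

# The residual recovers precision that the rounded sum alone loses:
s, err = two_sum(1.0, 1e-17)
assert s == 1.0 and err == 1e-17
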
@Article{Etsion:2007:PPT,
author = "Y. Etsion and D. G. Feitelson",
title = "Probabilistic Prediction of Temporal Locality",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "1",
pages = "17--20",
month = jan,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The increasing gap between processor and memory
speeds, as well as the introduction of multi-core CPUs,
have exacerbated the dependency of CPU performance on
the memory subsystem. This trend motivates the search
for more efficient caching mechanisms, enabling both
faster service of frequently used blocks and decreased
power consumption. In this paper we describe a novel,
random sampling based predictor that can distinguish
transient cache insertions from non-transient ones. We
show that this predictor can identify a small set of
data cache resident blocks that service most of the
memory references, thus serving as a building block for
new cache designs and block replacement policies.
Although we only discuss the L1 data cache, we have
found this predictor to be efficient also when handling
L1 instruction caches and shared L2 caches.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "B Hardware; B.3 Memory Structures; B.3.2 Design
Styles; B.3.2.b Cache memories; B.3.3 Performance
Analysis and Design Aids; cache storage; Computer
science; Data analysis; data cache; Distributed
computing; Energy consumption; Extraterrestrial
phenomena; memory subsystem; multi-core CPU; power
aware computing; probabilistic prediction; random
sampling; Sampling methods; temporal locality;
transient cache insertions; Visualization",
}
@Article{Guz:2007:NCO,
author = "Z. Guz and I. Keidar and A. Kolodny and U. Weiser",
title = "{Nahalal}: Cache Organization for Chip
Multiprocessors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "1",
pages = "21--24",
month = jan,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper addresses cache organization in chip
multiprocessors (CMPs). We show that in CMP systems it
is valuable to distinguish between shared data, which
is accessed by multiple cores, and private data
accessed by a single core. We introduce Nahalal, an
architecture whose novel floorplan topology partitions
cached data according to its usage (shared versus
private data), and thus enables fast access to shared
data for all processors while preserving the vicinity
of private data to each processor. Nahalal exhibits
significant improvements in cache access latency
compared to a traditional cache design.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Cache memories; cache organization; cache
storage; chip multiprocessors; circuit layout; CMP
systems; Computer integrated manufacturing; Computer
Systems Organization; Design Styles; floorplan topology
partitions; Hardware; Memory Structures; microprocessor
chips; Multi-core/single-chip multiprocessors; Nahalal;
Parallel Architectures; Processor Architectures;
Writing",
}
@Article{Joao:2007:DPI,
author = "J. A. Joao and O. Mutlu and H. Kim and Y. N. Patt",
title = "Dynamic Predication of Indirect Jumps",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "2",
pages = "25--28",
month = feb,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Indirect jumps are used to implement
increasingly-common programming language constructs
such as virtual function calls, switch-case statements,
jump tables, and interface calls. Unfortunately, the
prediction accuracy of indirect jumps has remained low
because many indirect jumps have multiple targets that
are difficult to predict even with specialized
hardware. This paper proposes a new way of handling
hard-to-predict indirect jumps: dynamically predicating
them. The compiler identifies indirect jumps that are
suitable for predication along with their control-flow
merge (CFM) points. The microarchitecture predicates
the instructions between different targets of the jump
and its CFM point if the jump turns out to be
hard-to-predict at run time. We describe the new
indirect jump predication architecture, provide code
examples showing why it could reduce the performance
impact of jumps, derive an analytical cost-benefit
model for deciding which jumps and targets to
predicate, and present preliminary evaluation
results.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accuracy; Analytical models; and statically-scheduled
implementation; Computer languages; Computer Systems
Organization; control-flow merge point;
dynamically-scheduled; dynamically-scheduled and
statically-scheduled implementation; hard-to-predict
indirect jump handling; Hardware; Instruction fetch;
Instruction sets; interface call; jump table;
Micro-architecture implementation considerations;
Microarchitecture; microarchitecture dynamic
predication; Object oriented modeling; parallel
architectures; Performance analysis; Pipeline
processors; Pipelines; Processor Architectures; program
compiler; program compilers; program control
structures; programming language construct; Single Data
Stream Architectures; Superscalar; switch-case
statement; Switches; system monitoring; virtual
function call",
}
@Article{Das:2007:MMC,
author = "A. Das and S. Ozdemir and G. Memik and J. Zambreno and
A. Choudhary",
title = "Microarchitectures for Managing Chip Revenues under
Process Variations",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "2",
pages = "29--32",
month = feb,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "As transistor feature sizes continue to shrink into
the sub-90 nm range and beyond, the effects of process
variations on critical path delay and chip yields have
amplified. A common concept to remedy the effects of
variation is speed-binning, by which chips from a
single batch are rated by a discrete range of
frequencies and sold at different prices. In this
paper, we discuss strategies to modify the number of
chips in different bins and hence enhance the profits
obtained from them. Particularly, we propose a scheme
that introduces a small Substitute Cache associated
with each cache way to replicate the data elements that
will be stored in the high latency lines. Assuming a
fixed pricing model, this method increases the revenue
by as much as 13.8\% without any impact on the
performance of the chips.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Cache Memories; cache memory; cache storage; Circuits;
Computer Architecture; computer architecture; Computer
Architecture; Computer architecture; critical path
delay; Fabrication; Fault-tolerant Computing.; fixed
pricing model; Frequency; Logic arrays;
Microarchitecture; microarchitecture chip;
microprocessor chips; Microprocessors; optimisation;
process variation; Process Variations; Registers; Size
control; Voltage control",
}
@Article{Zebchuk:2007:BBC,
author = "J. Zebchuk and A. Moshovos",
title = "A Building Block for Coarse-Grain Optimizations in the
On-Chip Memory Hierarchy",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "2",
pages = "33--36",
month = feb,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Current on-chip block-centric memory hierarchies
exploit access patterns at the fine-grain scale of
small blocks. Several recently proposed memory
hierarchy enhancements for coherence traffic reduction
and prefetching suggest that additional useful patterns
emerge with a macroscopic, coarse-grain view. This
paper presents RegionTracker, a dual-grain, on-chip
cache design that exposes coarse-grain behavior while
maintaining block-level communication. RegionTracker
eliminates the extraneous, often imprecise coarse-grain
tracking structures of previous proposals. It can be
used as the building block for coarse-grain
optimizations, reducing their overall cost and easing
their adoption. Using full-system simulation of a
quad-core chip multiprocessor and commercial workloads,
we demonstrate that RegionTracker overcomes the
inefficiencies of previous coarse-grain cache designs.
We also demonstrate how RegionTracker boosts the
benefits and reduces the cost of a previously proposed
snoop reduction technique.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "access patterns; Bandwidth; cache storage; Cache
storage; coarse-grain optimizations; coherence traffic
reduction; Cost function; Design optimization;
Explosions; Information management; Memory management;
Multithreading; on-chip memory hierarchy; optimising
compilers; Prefetching; prefetching; Proposals;
quad-core chip multiprocessor; RegionTracker dual-grain
on-chip cache design; system-on-chip",
}
@Article{Kim:2007:FBT,
author = "J. Kim and J. Balfour and W. J. Dally",
title = "Flattened Butterfly Topology for On-Chip Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "2",
pages = "37--40",
month = feb,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.10",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "With the trend towards an increasing number of cores in
multicore processors, the on-chip network that connects
the cores needs to scale efficiently. In this work, we
propose the use of high-radix networks in on-chip
networks and describe how the flattened butterfly
topology can be mapped to on-chip networks. By using
high-radix routers to reduce the diameter of the
network, the flattened butterfly offers lower latency
and energy consumption than conventional on-chip
topologies. In addition, by properly using bypass
channels in the flattened butterfly network,
non-minimal routing can be employed without increasing
latency or the energy consumption.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Computer networks; Delay; Energy
consumption; flattened butterfly; flattened butterfly
topology; high-radix networks; high-radix routers;
Laboratories; Multicore processing; multicore
processors; Multiprocessor interconnection networks;
Network topology; network topology; Network-on-a-chip;
network-on-chip; on-chip networks; Routing; topology",
}
@Article{Xiao:2007:NPD,
author = "X. Xiao and J. Lee",
title = "A Novel Parallel Deadlock Detection Algorithm and
Hardware for Multiprocessor System-on-a-Chip",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "2",
pages = "41--44",
month = feb,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.11",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Given the projected dramatic increase in the number of
processors and resources in a system-on-a-chip, a
quadratic increase in the likelihood of deadlock is
predicted due to complex system behavior. To deal with
this issue, we here present a novel parallel
hardware-oriented deadlock detection algorithm with $
O(1) $ deadlock detection and $ O(\min (m, n)) $
preparation, where $m$ and $n$ are the numbers of
processes and resources, respectively. Our
contributions are (i) the first $ O(1)$ deadlock
detection hardware implementation and (ii) a new
algorithmic method of achieving $ O(\min (m, n))$
overall run-time complexity. We implement our algorithm
in Verilog HDL and demonstrate that deadlock detection
always takes only two clock cycles regardless of the
size of a system (i.e., $m$ and $n$).",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Algorithms implemented in hardware; computational
complexity; deadlock detection hardware; Deadlocks;
Detection algorithms; Hardware design languages;
microprocessor chips; Multiprocessing systems;
multiprocessing systems; multiprocessor
system-on-a-chip; operating systems (computers);
Parallel algorithms; parallel algorithms; parallel
deadlock detection algorithm; Processor scheduling;
Real time systems; Real-time and embedded systems;
Resource management; run-time complexity; Runtime;
Software performance; System recovery; system-on-chip",
}
@Article{August:2007:UOS,
author = "D. August and J. Chang and S. Girbal and D.
Gracia-Perez and G. Mouchard and D. A. Penry and O.
Temam and N. Vachharajani",
title = "{UNISIM}: an Open Simulation Environment and Library
for Complex Architecture Design and Collaborative
Development",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "2",
pages = "45--48",
month = feb,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.12",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Simulator development is already a huge burden for
many academic and industry research groups; future
complex or heterogeneous multi-cores, as well as the
multiplicity of performance metrics and required
functionality, will make matters worse. We present a
new simulation environment, called UNISIM, which is
designed to rationalize simulator development by making
it possible and efficient to distribute the overall
effort over multiple research groups, even without
direct cooperation. UNISIM achieves this goal with a
combination of modular software development,
distributed communication protocols, multilevel
abstract modeling, interoperability capabilities, a set
of simulator services APIs, and an open
library/repository for providing a consistent set of
simulator modules.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "API; application program interfaces; Collaboration;
collaborative development; complex architecture design;
Computational modeling; Computer architecture; Computer
industry; Computer science; Design engineering;
distributed communication protocols; groupware;
interoperability capability; Libraries; Measurement;
modular software development; multilevel abstract
modeling; open library; open repository; open
simulation environment; open systems; Operating
systems; Performance and Reliability; Processor
Architectures; Programming; simulator development;
simulator modules; simulator services; software
architecture; UNISIM",
}
@Article{Sendag:2007:BMP,
author = "R. Sendag and J. Yi and P. Chuang",
title = "Branch Misprediction Prediction: Complementary Branch
Predictors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "2",
pages = "49--52",
month = feb,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.13",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In this paper, we propose a new class of branch
predictors, complementary branch predictors, which can
be easily added to any branch predictor to improve the
overall prediction accuracy. This mechanism differs
from conventional branch predictors in that it focuses
only on mispredicted branches. As a result, this
mechanism has the advantages of scalability and
flexibility (it can be implemented with any branch
predictor) and is not on the critical path. More
specifically, this mechanism improves the branch
prediction accuracy by predicting which future branch
will be mispredicted next and when that will occur, and
then it changes the predicted direction at the
predicted time. Our results show that a branch
predictor with the branch misprediction predictor
achieves the same prediction accuracy as a conventional
branch predictor that is 4 to 16 times larger, but
without significantly increasing the overall complexity
or lengthening the critical path.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accuracy; branch misprediction prediction; branch
predictor; computational complexity; Computer networks;
Costs; Delay; Emerging technologies; History; parallel
architectures; Performance loss; Pipeline processors;
Pipelines; Prediction algorithms; Scalability;
Testing",
}
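
As a rough illustration of the mechanism summarized above, the sketch below
keeps a small side table that learns, per branch, how many branches retire
between consecutive mispredictions of the base predictor and inverts the base
prediction when that distance comes due again. This is a hypothetical minimal
sketch; the table organization, indexing, and every name in it are assumptions
for illustration, not the authors' design.

    class ComplementaryPredictor:
        """Hedged sketch: guess WHICH branch the base predictor will miss
        next and WHEN, then flip the base prediction at that point."""

        def __init__(self, entries=256):
            self.entries = entries
            self.table = {}                # idx -> {"distance": d, "countdown": c}
            self.retired_since_miss = 0    # branches retired since the last base miss

        def predict(self, pc, base_prediction):
            entry = self.table.get(pc % self.entries)
            if entry is not None and entry["countdown"] == 0:
                entry["countdown"] = entry["distance"]   # re-arm for the next occurrence
                return not base_prediction               # predicted misprediction: invert
            return base_prediction

        def retire(self, pc, base_was_wrong):
            for entry in self.table.values():            # one more branch has retired
                if entry["countdown"] > 0:
                    entry["countdown"] -= 1
            self.retired_since_miss += 1
            if base_was_wrong:                           # learn (branch, distance) of the miss
                d = self.retired_since_miss
                self.table[pc % self.entries] = {"distance": d, "countdown": d}
                self.retired_since_miss = 0

Because the table only tracks branches the base predictor has already missed,
it can stay small and sit off the critical prediction path, which is the
property the abstract emphasizes.
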
@Article{Yalcin:2007:UTM,
author = "G. Yalcin and O. Ergin",
title = "Using tag-match comparators for detecting soft
errors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "2",
pages = "53--56",
month = feb,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.14",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Soft errors caused by high energy particle strikes are
becoming an increasingly important problem in
microprocessor design. With increasing transistor
density and die sizes, soft errors are expected to be a
larger problem in the near future. Recovering from
these unexpected faults may be possible by reexecuting
some part of the program only if the error can be
detected. Therefore it is important to come up with new
techniques to detect soft errors and increase the
number of errors that are detected. Modern
microprocessors employ out-of-order execution and
dynamic scheduling logic. Comparator circuits, which
are used to keep track of data dependencies, are
usually idle. In this paper, we propose various schemes
to exploit on-chip comparators to detect transient
faults. Our results show that around 50\% of the errors
on the wakeup logic can be detected with minimal
hardware overhead by using the proposed techniques.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "and Fault-Tolerance; Broadcasting; Circuit faults;
comparators (circuits); Computer errors; Control
Structure Reliability; dynamic scheduling logic;
Electrical fault detection; Fault detection;
identification technology; Logic; logic design; logic
testing; microprocessor chips; microprocessor design;
Microprocessors; Out of order; out-of-order execution;
Pipelines; Processor Architectures; Registers;
scheduling; soft error detection; tag-match comparator;
Testing; Testing and Fault-Tolerance",
}
@Article{Joao:2008:DPI,
author = "J. A. Joao and O. Mutlu and H. Kim and Y. N. Patt",
title = "Dynamic Predication of Indirect Jumps",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "1",
pages = "1--4",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Indirect jumps are used to implement increasingly
common programming language constructs such as virtual
function calls, switch-case statements, jump tables,
and interface calls. Unfortunately, the prediction
accuracy of indirect jumps has remained low because
many indirect jumps have multiple targets that are
difficult to predict even with specialized hardware.
This paper proposes a new way of handling
hard-to-predict indirect jumps: dynamically predicating
them. The compiler identifies indirect jumps that are
suitable for predication along with their control-flow
merge (CFM) points. The microarchitecture predicates
the instructions between different targets of the jump
and its CFM point if the jump turns out to be
hard-to-predict at run time. We describe the new
indirect jump predication architecture, provide code
examples showing why it could reduce the performance
impact of jumps, derive an analytical cost-benefit
model for deciding which jumps and targets to
predicate, and present preliminary evaluation
results.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accuracy; Analytical models; B Hardware; B.3 Memory
Structures; Cache memories; Computer languages;
Computer Systems Organization; Design Styles; Hardware;
Instruction sets; Microarchitecture;
Multi-core/single-chip multiprocessors; Object oriented
modeling; Parallel Architectures; Performance analysis;
Pipelines; Processor Architectures; Switches",
}
@Article{Das:2008:MMC,
author = "A. Das and S. Ozdemir and G. Memik and J. Zambreno and
A. Choudhary",
title = "Microarchitectures for Managing Chip Revenues under
Process Variations",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "1",
pages = "5--8",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "As transistor feature sizes continue to shrink into
the sub-90nm range and beyond, the effects of process
variations on critical path delay and chip yields have
amplified. A common concept to remedy the effects of
variation is speed-binning, by which chips from a
single batch are rated by a discrete range of
frequencies and sold at different prices. In this
paper, we discuss strategies to modify the number of
chips in different bins and hence enhance the profits
obtained from them. Particularly, we propose a scheme
that introduces a small substitute cache associated
with each cache way to replicate the data elements that
will be stored in the high latency lines. Assuming a
fixed pricing model, this method increases the revenue
by as much as 13.8\% without any impact on the
performance of the chips.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Cache Memories; Computer Architecture; Computer
architecture; Cost function; Delay effects; Design
optimization; Fabrication; Fault-tolerant Computing.;
Frequency; Manufacturing; Microarchitecture; Pricing;
Process Variations; Transistors",
}
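
The revenue argument in the entry above is simple bin arithmetic: a
yield-repair mechanism that hides slow cache lines lets some dies qualify for
a higher frequency bin, and revenue is the price-weighted sum over bins. A toy
computation under a fixed pricing model (every number below is invented for
illustration):

    # Hedged toy example of speed-binning revenue; all figures are made up.
    bins = {3.0: 100, 2.8: 250, 2.6: 400}      # frequency bin (GHz) -> chips in bin
    price = {3.0: 300, 2.8: 220, 2.6: 150}     # frequency bin (GHz) -> selling price ($)
    baseline = sum(n * price[f] for f, n in bins.items())

    # Suppose the substitute-cache scheme lets 60 dies move up one bin.
    repaired = dict(bins)
    repaired[2.6] -= 60
    repaired[2.8] += 60
    improved = sum(n * price[f] for f, n in repaired.items())
    print(f"revenue gain: {improved / baseline - 1:.1%}")
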
@Article{Roth:2008:PRR,
author = "A. Roth",
title = "Physical register reference counting",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "1",
pages = "9--12",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.15",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Several proposed techniques including CPR (checkpoint
processing and recovery) and NoSQ (no store queue) rely
on reference counting to manage physical registers.
However, the register reference counting mechanism
itself has received surprisingly little attention. This
paper fills this gap by describing potential register
reference counting schemes for NoSQ, CPR, and a
hypothetical NoSQ/CPR hybrid. Although previously
described in terms of binary counters, we find that
reference counts are actually more naturally
represented as matrices. Binary representations can be
used as an optimization in specific situations.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "and statically-scheduled implementation; binary
representations; checkpoint processing; checkpointing;
Counting circuits; dynamically-scheduled;
dynamically-scheduled and statically-scheduled
implementation; Engines; Information science; matrices;
Micro-architecture implementation considerations;
Microarchitecture; no store queue; physical register
reference counting; Physics computing; Proposals;
recovery technique; Registers; shift registers;
Superscalar",
}
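
The matrix view mentioned in the entry above can be pictured with one bit per
(physical register, reference source) pair: a register is free exactly when
its row holds no set bits, and squashing a checkpoint clears an entire column
at once. The sketch below is only a hypothetical reading of that idea; the
row/column roles and all names are assumptions, not the paper's mechanism.

    class RegisterRefMatrix:
        """Hedged sketch: reference counts held as a bit matrix rather than
        binary counters; bits[r][s] set means source s (e.g., a checkpoint
        or a renamed consumer) still references physical register r."""

        def __init__(self, num_regs, num_sources):
            self.bits = [[False] * num_sources for _ in range(num_regs)]

        def add_ref(self, reg, source):
            self.bits[reg][source] = True

        def drop_ref(self, reg, source):
            self.bits[reg][source] = False

        def release_source(self, source):
            # Bulk release, e.g., reclaiming every reference held by a squashed checkpoint.
            for row in self.bits:
                row[source] = False

        def is_free(self, reg):
            return not any(self.bits[reg])

The bulk column clear is where the matrix form is more natural than a
saturating counter, since no per-register decrement sequence is needed.
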
@Article{Flich:2008:LBD,
author = "J. Flich and J. Duato",
title = "Logic-Based Distributed Routing for {NoCs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "1",
pages = "13--16",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.16",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "The design of scalable and reliable interconnection
networks for multicore chips (NoCs) introduces new
design constraints like power consumption, area, and
ultra low latencies. Although 2D meshes are usually
proposed for NoCs, heterogeneous cores, manufacturing
defects, hard failures, and chip virtualization may
lead to irregular topologies. In this context,
efficient routing becomes a challenge. Although
switches can be easily configured to support most
routing algorithms and topologies by using routing
tables, this solution does not scale in terms of
latency and area. We propose a new circuit that removes
the need for using routing tables. The new mechanism,
referred to as logic-based distributed routing (LBDR),
enables the implementation in NoCs of many routing
algorithms for most of the practical topologies we
might find in the near future in a multicore chip. From
an initial topology and routing algorithm, a set of
three bits per switch output port is computed. By using
a small logic block, LBDR mimics (as demonstrated by our
evaluation) the behavior of routing algorithms
implemented with routing tables. This result is
achieved both in regular and irregular topologies.
Therefore, LBDR removes the need for using routing
tables for distributed routing, thus enabling flexible,
fast and power-efficient routing in NoCs.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "chip virtualization; circuit reliability; Circuit
topology; Delay; Energy consumption; heterogeneous
cores; interconnection network reliability;
interconnections; logic circuits; logic-based
distributed routing; Manufacturing; manufacturing
defects; Multi-core/single-chip multiprocessors;
Multicore processing; Multiprocessor interconnection
networks; network routing; network topology; Network
topology; Network-on-a-chip; network-on-chip; networks
for multicore chips; NoC; On-chip interconnection
networks; Routing; Switches",
}
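
To give a flavor of how a few configuration bits plus a small logic block can
replace a routing table, the sketch below makes a table-free routing decision
for a 2D layout from one connectivity bit and two turn-restriction bits per
output port. The exact bit semantics, the names, and the coordinate convention
are assumptions chosen for illustration; they are not copied from the letter.

    def lbdr_style_route(cur, dst, conn, turn):
        """Hedged sketch of an LBDR-style, table-free routing decision.

        cur, dst : (x, y) of the current switch and the destination,
                   with y growing towards north and x towards east
        conn     : connectivity bits, e.g. conn['N'] is True if the
                   north link exists (covers irregular topologies)
        turn     : turn-restriction bits, e.g. turn['NE'] is True if a
                   packet sent north may still need to turn east later
        Returns the set of admissible output ports.
        """
        if dst == cur:
            return {'LOCAL'}
        north, south = dst[1] > cur[1], dst[1] < cur[1]
        east, west = dst[0] > cur[0], dst[0] < cur[0]

        outputs = set()
        if conn['N'] and north and (not east or turn['NE']) and (not west or turn['NW']):
            outputs.add('N')
        if conn['E'] and east and (not north or turn['EN']) and (not south or turn['ES']):
            outputs.add('E')
        if conn['S'] and south and (not east or turn['SE']) and (not west or turn['SW']):
            outputs.add('S')
        if conn['W'] and west and (not north or turn['WN']) and (not south or turn['WS']):
            outputs.add('W')
        return outputs

With every bit set the function simply returns every productive direction
towards the destination quadrant; clearing turn or connectivity bits restricts
the choice to implement a particular routing algorithm or to route around a
missing link.
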
@Article{Yoon:2008:CHP,
author = "J. H. Yoon and E. H. Nam and Y. J. Seong and H. Kim
and B. Kim and S. L. Min and Y. Cho",
title = "{Chameleon}: a High Performance Flash\slash {FRAM}
Hybrid Solid State Disk Architecture",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "1",
pages = "17--20",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.17",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Flash memory solid state disk (SSD) is gaining
popularity and replacing hard disk drive (HDD) in
mobile computing systems such as ultra mobile PCs
(UMPCs) and notebook PCs because of lower power
consumption, faster random access, and higher shock
resistance. One of the key challenges in designing a
high-performance flash memory SSD is an efficient
handling of small random writes to non-volatile data
whose performance suffers from the inherent limitation
of flash memory that prohibits in-place update. In this
paper, we propose a high performance Flash/FRAM hybrid
SSD architecture called Chameleon. In Chameleon,
metadata used by the flash translation layer (FTL), a
software layer in the flash memory SSD, is maintained
in a small FRAM since this metadata is a target of
intensive small random writes, whereas the bulk data is
kept in the flash memory. Performance evaluation based
on an FPGA implementation of the Chameleon architecture
shows that the use of FRAM in Chameleon improves the
performance by 21.3\%. The results also show that even
for bulk data that cannot be maintained in FRAM because
of the size limitation, the use of fine-grained write
buffering is critically important because of the
inability of flash memory to perform in-place update of
data.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Chameleon; Computer architecture; Design studies; disc
drives; Energy consumption; Ferroelectric films; field
programmable gate arrays; flash memories; Flash memory;
flash memory solid state disk; flash translation layer;
flash-FRAM hybrid SSD architecture; FPGA
implementation; FTL; hard discs; hard disk drive; Hard
disks; HDD; Mass storage; memory architecture; Mobile
computing; mobile computing systems; Nonvolatile
memory; notebook PCs; Personal communication networks;
Random access memory; random-access storage; Solid
state circuits; SSD; ultra mobile PCs; UMPC",
}
@Article{Biswas:2008:CAA,
author = "A. Biswas and P. Racunas and J. Emer and S.
Mukherjee",
title = "Computing Accurate {AVFs} using {ACE} Analysis on
Performance Models: a Rebuttal",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "1",
pages = "21--24",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.19",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "ACE (architecturally correct execution) analysis
computes AVFs (architectural vulnerability factors) of
hardware structures. AVF expresses the fraction of
radiation-induced transient faults that result in
user-visible errors. Architects usually perform this
analysis on a high-level performance model to quickly
compute per-structure AVFs. If, however, low-level
details of a microarchitecture are not modeled
appropriately, then their effects may not be reflected
in the per-structure AVFs. In this paper we refute
Wang et al.'s (2007) claim that this detail is
difficult to model and imposes a practical threshold on
ACE analysis that forces its estimates to have a high
error margin. We show that carefully choosing a small
amount of additional detail can result in a much
tighter AVF bound than Wang et al. were able to
achieve in their refined ACE analysis. Even the
inclusion of small details, such as read/write pointers
and appropriate inter-structure dependencies, can
increase the accuracy of the AVF computation by 40\% or
more. We argue that this is no different than modeling
the IPC (instructions per cycle) of a microprocessor
pipeline. A less detailed performance model will
provide less accurate IPCs. AVFs are no different.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "and Fault-Tolerance; architectural vulnerability
factors; architecturally correct execution analysis;
Computational modeling; Hardware; hardware structures;
High performance computing; instructions per cycle;
inter-structure dependencies; Microarchitecture;
microprocessor pipeline; Microprocessors; Performance
analysis; Performance and Reliability; performance
evaluation; performance models; Pipelines; Protection;
radiation-induced transient faults; read pointers;
Reliability; Target tracking; Testing; Testing and
Fault-Tolerance; user-visible errors; write pointers",
}
@Article{Cho:2008:CAL,
author = "S. Cho and R. Melhem",
title = "Corollaries to {Amdahl's Law} for Energy",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "1",
pages = "25--28",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.18",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper studies the important interaction between
parallelization and energy consumption in a
parallelizable application. Given the ratio of serial
and parallel portion in an application and the number
of processors, we first derive the optimal frequencies
allocated to the serial and parallel regions in the
application to minimize the total energy consumption,
while the execution time is preserved (i.e., speedup =
1). We show that the dynamic energy improvement due to
parallelization rises faster with the number of
processors than the speed improvement function given by
the well-known Amdahl's Law. Furthermore, we determine
the conditions under
which one can obtain both energy and speed improvement,
as well as the amount of improvement. The formulas we
obtain capture the fundamental relationship between
parallelization, speedup, and energy consumption and
can be directly utilized in energy aware processor
resource management. Our results form a basis for
several interesting research directions in the area of
power and energy aware parallel processing.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Amdahl's Law; Application software; Computer science;
Concurrent computing; dynamic energy improvement;
energy aware processor resource management; Energy
capture; energy consumption; Energy consumption; energy
consumption; Energy management; Equations; Hardware;
Parallel Architectures; parallel processing; Parallel
processing; parallelization; Power Management; Radio
spectrum management; Resource management",
}
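
The qualitative claim in the entry above, that the dynamic-energy improvement
grows faster with the number of processors than the Amdahl speedup, can be
seen in a simplified special case (a hedged illustration, not the paper's
general derivation): take a workload with serial fraction $s$ and parallel
fraction $1 - s$, assume dynamic energy per unit of work scales as $f^2$
(voltage tracking frequency), leave the serial region at full frequency, and
slow only the parallel region so that total execution time is unchanged.
Running the parallel region on $N$ processors at frequency $f_p$ preserves the
execution time when $f_p = 1/N$, giving

\[
  E(N) = s \cdot 1^2 + (1 - s)\, f_p^{2} = s + \frac{1 - s}{N^{2}},
  \qquad
  \frac{E(1)}{E(N)} = \frac{1}{\,s + (1 - s)/N^{2}\,},
\]

whose parallel term shrinks as $N^{2}$, whereas Amdahl's speedup
$S(N) = 1 / \bigl( s + (1 - s)/N \bigr)$ improves only as $N$.
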
@Article{Balfour:2008:EEP,
author = "J. Balfour and W. Dally and D. Black-Schaffer and V.
Parikh and J. Park",
title = "An Energy-Efficient Processor Architecture for
Embedded Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "1",
pages = "29--32",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We present an efficient programmable architecture for
compute-intensive embedded applications. The processor
architecture uses instruction registers to reduce the
cost of delivering instructions, and a hierarchical and
distributed data register organization to deliver data.
Instruction registers capture instruction reuse and
locality in inexpensive storage structures that are
located near the functional units. The data register
organization captures reuse and locality in different
levels of the hierarchy to reduce the cost of
delivering data. Exposed communication resources
eliminate pipeline registers and control logic, and
allow the compiler to schedule efficient instruction
and data movement. The architecture keeps a significant
fraction of instruction and data bandwidth local to the
functional units, which reduces the cost of supplying
instructions and data to large numbers of functional
units. This architecture achieves an energy efficiency
that is 23x greater than that of an embedded RISC processor.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Communication system control; compute-intensive
embedded applications; Computer applications; computer
architecture; Computer architecture; Costs; data
movement; distributed data register organization;
Embedded computing; embedded RISC processor; Embedded
system; embedded systems; Energy efficiency;
energy-efficient processor architecture; hierarchical
organization; inexpensive storage structures;
instruction registers; instruction sets; Logic; Mobile
processors; pipeline processing; pipeline registers;
Pipelines; Registers",
}
@Article{Anonymous:2008:FC,
author = "Anonymous",
title = "[{Front} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "c1--c1",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.15",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Presents the front cover for this issue of the
publication.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2008:EBC,
author = "Anonymous",
title = "Editorial Board [Cover2]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "c2--c2",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.16",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Provides a listing of current society officers.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Pao:2008:PAM,
author = "D. Pao and W. Lin and B. Liu",
title = "Pipelined Architecture for Multi-String Matching",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "33--36",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This letter presents a new oblivious routing algorithm
for 3D mesh networks called randomized
partially-minimal (RPM) routing that provably achieves
optimal worst- case throughput for 3D meshes when the
network radix fc is even and within a factor of 1/k2 of
optimal when k is odd. Although this optimality result
has been achieved with the minimal routing algorithm
OITURN for the 2D case, the worst-case throughput of
OITURN degrades tremendously in higher dimensions.
Other existing routing algorithms suffer from either
poor worst-case throughput (DOR, ROMM) or poor latency
(VAL). RPM on the other hand achieves near optimal
worst-case and good average-case throughput as well as
good latency performance.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "3D mesh networks; Automata; computer architecture;
Computer architecture; Computer science; Costs;
deterministic finite automaton; Hardware; Intrusion
detection; network intrusion detection; network radix;
OITURN; Partial response channels; pipelined
processing; Pipelines; randomized partially-minimal
routing; string matching; Table lookup;
three-dimensional mesh networks; Throughput",
}
@Article{Ramanujam:2008:RPM,
author = "R. Sunkam Ramanujam and B. Lin",
title = "Randomized Partially-Minimal Routing on
Three-Dimensional Mesh Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "37--40",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This letter presents a new oblivious routing algorithm
for 3D mesh networks called Randomized
Partially-Minimal (RPM) routing that provably achieves
optimal worst-case throughput for 3D meshes when the
network radix $k$ is even and within a factor of
$ 1/k^2 $ of optimal when $k$ is odd. Although this
optimality result has been
achieved with the minimal routing algorithm O1TURN [9]
for the 2D case, the worst-case throughput of O1TURN
degrades tremendously in higher dimensions. Other
existing routing algorithms suffer from either poor
worst-case throughput (DOR [10], ROMM [8]) or poor
latency (VAL [14]). RPM on the other hand achieves near
optimal worst-case and good average-case throughput as
well as good latency performance.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Degradation; Delay; Emerging technologies; Fabrics;
Interconnection architectures; Mesh networks; Network
communications; Network topology; On-chip
interconnection networks; Packet-switching networks;
Routing; Silicon; Technological innovation;
Telecommunication traffic; Throughput",
}
@Article{Black-Schaffer:2008:HIR,
author = "D. Black-Schaffer and J. Balfour and W. Dally and V.
Parikh and J. Park",
title = "Hierarchical Instruction Register Organization",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "41--44",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper analyzes a range of architectures for
efficient delivery of VLIW instructions for embedded
media kernels. The analysis takes an efficient filter
cache as a baseline and examines the benefits from (1)
removing the tag overhead, (2) distributing the
storage, (3) adding indirection, (4) adding efficient
NOP generation, and (5) sharing instruction memory. The
result is a hierarchical instruction register
organization that provides a 56\% energy and 40\% area
savings over an already efficient filter cache.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache storage; Cache storage; Computer aided
instruction; Computer architecture; Computer integrated
manufacturing; distributed shared memory systems;
Embedded computing; embedded media kernel; embedded
processor architecture; embedded systems; filter cache;
Filters; hierarchical instruction register
organization; Instruction fetch; instruction memory
sharing; instruction sets; Kernel; Laboratories;
Low-power design; NOP generation; parallel
architectures; Registers; RISC/CISC; VLIW; VLIW
architectures; VLIW instruction delivery",
}
@Article{Lee:2008:PDD,
author = "J. Lee and X. Xiao",
title = "A Parallel Deadlock Detection Algorithm with {$ O(1)
$} Overall Run-time Complexity",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "45--48",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This article proposes a novel parallel,
hardware-oriented deadlock detection algorithm for
multiprocessor systems-on-chip. The proposed algorithm
takes full advantage of hardware parallelism in
computation and maintains information needed by
deadlock detection through classifying all resource
allocation events and performing class specific
operations, which together make the overall run-time
complexity of the new method $ O(1) $. We implement the
proposed algorithm in Verilog HDL and demonstrate in
the simulation that each algorithm invocation takes at
most four clock cycles in hardware.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Algorithms implemented in hardware; clock cycle;
Computational modeling; Concurrent computing;
Deadlocks; Detection algorithms; Event detection;
hardware description languages; Hardware design
languages; hardware-oriented deadlock detection;
Multiprocessing systems; multiprocessing systems;
multiprocessor system-on-chips; operating systems
(computers); parallel deadlock detection; Parallel
processing; Real-time and embedded systems; resource
allocation; Resource management; run-time complexity;
Runtime; System recovery; system-on-chip; Verilog HDL",
}
@Article{GomezRequena:2008:BFT,
author = "C. {Gomez Requena} and F. Gilabert Villamon and M.
Gomez and P. Lopez and J. Duato",
title = "Beyond Fat-tree: Unidirectional Load--Balanced
Multistage Interconnection Network",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "49--52",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
note = "See comment \cite{Antelo:2009:CBF}.",
abstract = "The fat-tree is one of the most widely-used topologies
by interconnection network manufacturers. Recently, it
has been demonstrated that a deterministic routing
algorithm that optimally balances the network traffic
can not only achieve almost the same performance as an
adaptive routing algorithm but can also outperform it.
On the other hand, fat-trees require a high number of
switches with a non-negligible wiring complexity. In
this paper, we propose replacing the fat-tree by a
unidirectional multistage interconnection network
(UMIN) that uses a traffic balancing deterministic
routing algorithm. As a consequence, switch hardware is
almost halved, which in turn decreases
the power consumption, the arbitration complexity, the
switch size itself, and the network cost. Preliminary
evaluation results show that the UMIN with the load
balancing scheme obtains lower latency than fat-tree
for low and medium traffic loads. Furthermore, in
networks with a high number of stages or with high
radix switches, it obtains the same, or even higher,
throughput than fat-tree.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "adaptive routing algorithm; Butterfly Network;
computational complexity; Cost-efficiency; Costs;
Deterministic Routing; Energy consumption; Fat-trees;
Hardware; interconnection network manufacturers;
Manufacturing; Multiprocessor interconnection networks;
Multistage Interconnection Networks; Network
Architecture and Design; Network topology; network
traffic; nonnegligible wiring complexity; power
consumption; radix switches; Routing; Switches;
telecommunication network routing; telecommunication
switching; Telecommunication traffic; telecommunication
traffic; Traffic Balancing; traffic balancing
deterministic routing algorithm; trees (mathematics);
unidirectional load-balanced multistage interconnection
network; Wiring",
}
@Article{Li:2008:TAN,
author = "Z. Li and C. Zhu and L. Shang and R. Dick and Y. Sun",
title = "Transaction-Aware Network-on-Chip Resource
Reservation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "53--56",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Performance and scalability are critically-important
for on-chip interconnect in many-core
chip-multiprocessor systems. Packet-switched
interconnect fabric, widely viewed as the de facto
on-chip data communication backplane in the many-core
era, offers high throughput and excellent scalability.
However, these benefits come at the price of router
latency due to run-time multi-hop data buffering and
resource arbitration. The network accounts for a
majority of on-chip data transaction latency. In this
work, we propose dynamic in-network resource
reservation techniques to optimize run-time on-chip
data transactions. This idea is motivated by the need
to preserve existing abstraction and general-purpose
network performance while optimizing for
frequently-occurring network events such as data
transactions. Experimental studies using multithreaded
benchmarks demonstrate that the proposed techniques can
reduce on-chip data access latency by 28.4\% on average
in a 16-node system and 29.2\% on average in a 36-node
system.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Backplanes; buffer storage; Computer buffers; data
communication; Data communication; de facto on-chip
data communication backplane; Delay; dynamic in-network
resource reservation techniques; Fabrics;
frequently-occurring network events; Interconnection
architectures; Interconnections (Subsystems); many-core
chip-multiprocessor systems; multiprocessor
interconnection networks; Network-on-a-chip; on-chip
data transaction latency; On-chip interconnection
networks; packet switching; packet-switched
interconnect fabric; Parallel Architectures; resource
allocation; router latency; run-time multihop data
buffering; Runtime; Scalability; System-on-a-chip;
telecommunication network routing; Throughput;
transaction-aware network-on-chip resource
reservation",
}
@Article{Fide:2008:PUS,
author = "S. Fide and S. Jenks",
title = "Proactive Use of Shared {L3} Caches to Enhance Cache
Communications in Multi-Core Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "57--60",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.10",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The software and hardware techniques to exploit the
potential of multi-core processors are falling behind,
even though the number of cores and cache levels per
chip is increasing rapidly. There is no explicit
communications support available, and hence inter-core
communications depend on cache coherence protocols,
resulting in demand-based cache line transfers with
their inherent latency and overhead. In this paper, we
present software controlled eviction (SCE) to improve
the performance of multithreaded applications running
on multi-core processors by moving shared data to
shared cache levels before it is demanded from remote
private caches. Simulation results show that SCE offers
significant performance improvement (8-28\%) and
reduces L3 cache misses by 88-98\%.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache coherence protocol; cache communication; cache
storage; Concurrent computing; Control systems;
Degradation; Delay; demand-based cache line transfer;
Hardware; intercore communications; microprocessor
chips; Multi-core/single-chip multiprocessors;
multi-threading; Multicore processing; multicore
processors; multithreaded application; Parallel
processing; Protocols; shared L3 cache; shared memory
systems; software controlled eviction; Software
performance; Support for multi-threaded execution",
}
@Article{Walter:2008:BBE,
author = "I. Walter and I. Cidon and A. Kolodny",
title = "{BENoC}: a Bus-Enhanced Network on-Chip for a Power
Efficient {CMP}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "61--64",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.11",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Network-on-chips (NoCs) outperform buses in terms of
scalability, parallelism and system modularity and
therefore are considered as the main interconnect
infrastructure in future chip multi-processor (CMP).
However, while NoCs are very efficient for delivering
high throughput point-to-point data from sources to
destinations, their multi-hop operation is too slow for
latency-sensitive signals. In addition, current NoCs
are inefficient for broadcast operations and
centralized control of CMP resources. Consequently,
state-of-the-art NoCs may not meet the needs of
future CMP systems. In this paper, the benefit of
adding a low latency, customized shared bus as an
internal part of the NoC architecture is explored.
BENoC (bus-enhanced network on-chip) possesses two main
advantages: First, the bus is inherently capable of
performing broadcast transmission in an efficient
manner. Second, the bus has lower and more predictable
propagation latency. In order to demonstrate the
potential benefit of the proposed architecture, an
analytical comparison of the power saving in BENoC
versus a standard NoC providing similar services is
presented. Then, simulation is used to evaluate BENoC
in a dynamic non-uniform cache access (DNUCA)
multiprocessor system.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "broadcast transmission; Broadcasting; bus-enhanced
network-on-chip; Centralized control; chip
multiprocessor; Delay; dynamic nonuniform cache access;
integrated circuit interconnections; interconnect
infrastructure; Interconnection architectures;
low-power electronics; microprocessor chips;
multiprocessing systems; Multiprocessing systems;
Multiprocessor interconnection networks;
Network-on-a-chip; network-on-chip; NoC; On-chip
interconnection networks; power efficient CMP; Power
system interconnection; propagation latency;
Scalability; system buses; System-on-a-chip;
Throughput",
}
@Article{Golander:2008:DDS,
author = "A. Golander and S. Weiss and R. Ronen",
title = "{DDMR}: Dynamic and Scalable Dual Modular Redundancy
with Short Validation Intervals",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "65--68",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.12",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "DMR (dual modular redundancy) was suggested for
increasing reliability. Classical DMR consists of pairs
of cores that check each other and are pre-connected
during manufacturing by dedicated links. In this paper
we introduce the dynamic dual modular redundancy (DDMR)
architecture. DDMR supports run-time scheduling of
redundant threads, which has significant benefits
relative to static binding. To allow dynamic pairing,
DDMR replaces the special links with a novel ring
architecture. DDMR uses short instruction sequences for
validation, smaller than the processor reorder buffer.
Such short sequences reduce latencies in parallel
programs and save resources needed to buffer
uncommitted data. DDMR scales with the number of cores
and may be used in large multicore architectures.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "buffer storage; DDMR; Delay; dynamic dual modular
redundancy; Job shop scheduling; Joining processes;
Manufacturing; Multi-core/single-chip multiprocessors;
multicore architectures; Multicore processing; parallel
architectures; parallel programs; processor reorder
buffer; processor scheduling; Processor scheduling;
Proposals; Redundancy; Redundant design; ring
architecture; run-time scheduling; scalable dual
modular redundancy; short validation intervals;
Transistors",
}
@Article{Anonymous:2008:IA,
author = "Anonymous",
title = "Information for authors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "c3--c3",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.17",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Provides instructions and guidelines to prospective
authors who wish to submit manuscripts.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2008:ICS,
author = "Anonymous",
title = "{IEEE Computer Society} [Cover 4]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "c4--c4",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.18",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Provides a listing of current society officers.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Ramanujam:2009:WRR,
author = "Rohit Sunkam Ramanujam and Bill Lin",
title = "Weighted Random Routing on Torus Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.14",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In this paper, we introduce a new closed-form
oblivious routing algorithm called W2TURN that is
worst-case throughput optimal for 2D-torus networks.
W2TURN is based on a weighted random selection of paths
that contain at most two turns. In terms of average hop
count, W2TURN outperforms the best previously known
closed-form worst-case throughput optimal routing
algorithm called IVAL [7]. In addition, we present a
new optimal weighted random routing algorithm for rings
called WRD.",
acknowledgement = ack-nhfb,
affiliation = "Ramanujam, RS (Reprint Author), Univ Calif San Diego,
San Diego, CA 92103 USA. Ramanujam, Rohit Sunkam; Lin,
Bill, Univ Calif San Diego, San Diego, CA 92103 USA.",
author-email = "rsunkamr@ucsd.edu billlin@ucsd.edu",
da = "2019-06-20",
doc-delivery-number = "V17GC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "2D-torus networks; Algorithm design and analysis;
closed-form oblivious routing algorithm; Data
communications; Delay; Interconnection network;
internetworking; IVAL; latency; Measurement;
Multiprocessor interconnection networks;
Network-on-a-chip; oblivious routing; Oblivious
Routing; On-chip interconnection networks; optimal
weighted random routing algorithm; Routing; Runtime;
System recovery; telecommunication network routing;
throughput; Throughput; torus network; Torus Network;
W2TURN; weighted random path selection",
number-of-cited-references = "8",
ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X",
research-areas = "Computer Science",
researcherid-numbers = "Lin, Binshan/A-9772-2009",
times-cited = "2",
unique-id = "Ramanujam:2009:WRR",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Ahn:2009:MDE,
author = "Jung Ho Ahn and Jacob Leverich and Robert S. Schreiber
and Norman P. Jouppi",
title = "Multicore {DIMM}: an Energy Efficient Memory Module
with Independently Controlled {DRAMs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.13",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Demand for memory capacity and bandwidth keeps
increasing rapidly in modern computer systems, and
memory power consumption is becoming a considerable
portion of the system power budget. However, the
current DDR DIMM standard is not well suited to
effectively serve CMP memory requests from both a power
and performance perspective. We propose a new memory
module called a Multicore DIMM, where DRAM chips are
grouped into multiple virtual memory devices, each of
which has its own data path and receives separate
commands (address and control signals). The Multicore
DIMM is designed to improve the energy efficiency of
memory systems with small impact on system performance.
Dividing each memory module into 4 virtual memory
devices brings a simultaneous 22\%, 7.6\%, and 18\%
improvement in memory power, IPC, and system
energy-delay product respectively on a set of
multithreaded applications and consolidated
workloads.",
acknowledgement = ack-nhfb,
affiliation = "Ahn, JH (Reprint Author), Hewlett Packard Labs,
Mississauga, ON, Canada. Ahn, Jung Ho; Schreiber,
Robert S.; Jouppi, Norman P., Hewlett Packard Labs,
Mississauga, ON, Canada. Leverich, Jacob, Stanford
Univ, Stanford, CA 94305 USA.",
da = "2019-06-20",
doc-delivery-number = "V17GC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; CMP memory requests; Control systems; DDR
DIMM standard; DRAM; DRAM chips; Energy consumption;
Energy efficiency; energy efficiency; energy efficient
memory module; Energy-aware systems; Error correction
codes; independently controlled DRAM; Jacobian
matrices; memory capacity; memory module; memory power
consumption; Memory Structures; memory system;
microprocessor chips; Multicore; multicore DIMM;
Multicore processing; Proposals; Random access memory;
System performance; system power budget; virtual memory
devices",
number-of-cited-references = "16",
ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394",
research-areas = "Computer Science",
researcherid-numbers = "Ahn, Jung Ho/D-1298-2013",
times-cited = "26",
unique-id = "Ahn:2009:MDE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Wang:2009:PST,
author = "Po-Han Wang and Yen-Ming Chen and Chia-Lin Yang and
Yu-Jung Cheng",
title = "A Predictive Shutdown Technique for {GPU} Shader
Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "As technology continues to shrink, reducing leakage is
critical to achieve energy efficiency. Previous works
on low-power GPU (Graphics Processing Unit) focus on
techniques for dynamic power reduction, such as DVFS
(Dynamic Voltage/Frequency Scaling) and clock gating.
In this paper, we explore the potential of adopting
architecture-level power gating techniques for leakage
reduction on GPU. In particular, we focus on the most
power-hungry components, shader processors. We observe
that, due to different scene complexity, the required
shader resources to satisfy the target frame rate
actually vary across frames. Therefore, we propose the
Predictive Shader Shutdown technique to exploit
workload variation across frames for leakage reduction
on shader processors. The experimental results show
that Predictive Shader Shutdown achieves up to 46\%
leakage reduction on shader processors with negligible
performance degradation.",
acknowledgement = ack-nhfb,
affiliation = "Wang, PH (Reprint Author), Natl Taiwan Univ, Dept Comp
Sci \& Informat Engn, Taipei 10764, Taiwan. Wang,
Po-Han; Chen, Yen-Ming; Yang, Chia-Lin, Natl Taiwan
Univ, Dept Comp Sci \& Informat Engn, Taipei 10764,
Taiwan. Cheng, Yu-Jung, Natl Taiwan Univ, Grad Inst
Networking \& Multimedia, Taipei 10764, Taiwan.",
author-email = "r96002@csie.ntu.edu.tw r95125@csie.ntu.edu.tw
yangc@csie.ntu.edu.tw d96944002@ntu.edu.tw",
da = "2019-06-20",
doc-delivery-number = "V17GC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Institute for Information Industry of
Taiwan [97-FS-C03]; National Taiwan University
[97R0062-05]",
funding-text = "This work was partially supported by the Institute for
Information Industry of Taiwan under project No.
97-FS-C03, and by the Excellent Research Projects of
National Taiwan University, 97R0062-05.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architecture-level power gating techniques; Central
Processing Unit; Circuits; clock gating; Clocks;
computer architecture; computer graphic equipment;
Computer science; coprocessors; Degradation; dynamic
power reduction; Dynamic voltage scaling; dynamic
voltage-frequency scaling; Energy efficiency;
Energy-aware systems; Frequency; GPU; GPU shader
processors; Graphics; graphics processing unit; Layout;
leakage; Low-power design; power aware computing; power
gating; predictive shader shutdown technique",
number-of-cited-references = "15",
ORCID-numbers = "YANG, CHIA-LIN/0000-0003-0091-5027",
research-areas = "Computer Science",
times-cited = "10",
unique-id = "Wang:2009:PST",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Barnes:2009:XBA,
author = "Christopher Barnes and Pranav Vaidya and Jaehwan John
Lee",
title = "An {XML}-Based {ADL} Framework for Automatic
Generation of Multithreaded Computer Architecture
Simulators",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "13--16",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Computer architecture simulation has always played a
pivotal role in continuous innovation of computers.
However, constructing or modifying a high quality
simulator is time consuming and error-prone. Thus,
often Architecture Description Languages (ADLs) are
used to provide an abstraction layer for describing the
computer architecture and automatically generating
corresponding simulators. Along this line of
research, we present a novel XML-based ADL, its
compiler, and a generation methodology to automatically
generate multithreaded simulators for computer
architecture. We utilize the industry-standard
extensible markup language XML to describe the
functionality and architecture of a modeled processor.
Our ADL framework allows users to easily and quickly
modify the structure, register set, and execution of a
modeled processor. To prove its validity, we have
generated several multithreaded simulators with
different configurations based on the MIPS five-stage
processor, and successfully tested them with two programs.",
acknowledgement = ack-nhfb,
da = "2019-06-20",
doc-delivery-number = "V17GC",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "IUPUI RSFG",
funding-text = "This research was funded by the IUPUI RSFG grant.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "abstraction layer; Architecture description languages;
automatic generation; C.0.d Modeling of computer
architecture; C.1.1.b Pipeline processors;
Computational modeling; computer architecture; Computer
architecture; Computer simulation; Concurrent
computing; extensible markup language-architecture
description language; Kernel; MIPS five-stage
processor; Modeling of computer architecture;
multi-threading; multithreaded computer architecture
simulator; Object oriented modeling; Pipeline
processors; Pipelines; program compilers; program
verification; Testing; validity testing; XML; XML-based
ADL framework",
number-of-cited-references = "14",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Barnes:2009:XBA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Luque:2009:CAC,
author = "Carlos Luque and Miquel Moreto and Francisco J.
Cazorla and Roberto Gioiosa and Alper Buyuktosunoglu
and Mateo Valero",
title = "{CPU} Accounting in {CMP} Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Chip-MultiProcessors (CMP) introduce complexities when
accounting CPU utilization to processes because the
progress done by a process during an interval of time
highly depends on the activity of the other processes
it is co-scheduled with. We propose a new hardware
accounting mechanism to improve the accuracy when
measuring the CPU utilization in CMPs and compare it
with the previous accounting mechanisms. Our results
show that currently known mechanisms could lead to a
12\% average error when it comes to CPU utilization
accounting. Our proposal reduces this error to less
than 1\% in a modeled 4-core processor system.",
acknowledgement = ack-nhfb,
affiliation = "Luque, C (Reprint Author), Univ Politecn Cataluna,
E-08028 Barcelona, Spain. Luque, Carlos; Moreto,
Miquel; Valero, Mateo, Univ Politecn Cataluna, E-08028
Barcelona, Spain. Cazorla, Francisco J.; Valero, Mateo,
Barcelona Supercomp Ctr, Barcelona, Spain.",
da = "2019-06-20",
doc-delivery-number = "V17GC",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Ministry of Science and Technology of Spain
[TIN-2007-60625, BES-2008-003683, AP-2005-3318]; HiPEAC
Network of Excellence [IST-004408]; IBM Research; IBM
Deep Computing organizations",
funding-text = "This work has been supported by the Ministry of
Science and Technology of Spain under contract
TIN-2007-60625 and grants BES-2008-003683 and
AP-2005-3318, by the HiPEAC Network of Excellence
(IST-004408) and a Collaboration Agreement between IBM
and BSC with funds from IBM Research and IBM Deep
Computing organizations. The authors would like to
thank Pradip Bose and Chen-Yong Cher from IBM for their
technical support.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "4-core processor system; Bandwidth; Cache memory;
chip-multiprocessor architecture; Clocks; CMP processor
system; CPU utilization accounting; data center;
General; Hardware; hardware accounting mechanism;
Hardware/software interfaces; Kernel; microprocessor
chips; Multi-core/single-chip multiprocessors;
multiprocessing systems; operating system task
scheduling; Operating systems; process scheduling;
processor scheduling; Proposals; resource allocation;
Semiconductor device measurement; Switches",
number-of-cited-references = "11",
oa = "Green Published",
ORCID-numbers = "Moreto Planas, Miquel/0000-0002-9848-8758 Cazorla,
Francisco/0000-0002-3344-376X Luque,
Carlos/0000-0003-0442-0785 Valero,
Mateo/0000-0003-2917-2482 Gioiosa,
Roberto/0000-0001-9430-2656",
research-areas = "Computer Science",
researcherid-numbers = "Moreto Planas, Miquel/C-1823-2016 Cazorla,
Francisco/D-7261-2016 Luque, Carlos/E-2110-2019 Valero,
Mateo/L-5709-2014",
times-cited = "5",
unique-id = "Luque:2009:CAC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
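An aside on the accounting problem described in the abstract above: the toy Python sketch below (the standalone CPU time and the slowdown factor are invented for illustration, not figures from the letter) shows how naively billing raw co-scheduled CPU time overstates a process's true utilization.

# Toy illustration of the CMP CPU-accounting error (illustrative numbers only).
solo_cpu_s = 10.0   # CPU time the task would need running alone (assumed)
slowdown = 1.4      # slowdown caused by a cache-hungry co-runner (assumed)

charged_naive = solo_cpu_s * slowdown   # what a plain per-thread cycle counter bills
error = (charged_naive - solo_cpu_s) / solo_cpu_s
print(f"naive charge: {charged_naive:.1f} s vs. {solo_cpu_s:.1f} s of real work "
      f"(accounting error: {error:.0%})")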
@Article{Soteriou:2009:HTD,
author = "Vassos Soteriou and Rohit Sunkam Ramanujam and Bill
Lin and Li-Shiuan Peh",
title = "A High-Throughput Distributed Shared-Buffer {NoC}
Router",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "21--24",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Microarchitectural configurations of buffers in
routers have a significant impact on the overall
performance of an on-chip network (NoC). This buffering
can be at the inputs or the outputs of a router,
corresponding to an input-buffered router (IBR) or an
output-buffered router (OBR). OBRs are attractive
because they have higher throughput and lower queuing
delays under high loads than IBRs. However, a direct
implementation of OBRs requires a router speedup equal
to the number of ports, making such a design
prohibitive given the aggressive clocking and power
budgets of most NoC applications. In this letter, we
propose a new router design that aims to emulate an OBR
practically based on a distributed shared-buffer (DSB)
router architecture. We introduce innovations to
address the unique constraints of NoCs, including
efficient pipelining and novel flow control. Our DSB
design can achieve significantly higher bandwidth at
saturation, with an improvement of up to 20\% when
compared to a state-of-the-art pipelined IBR with the
same amount of buffering, and our proposed
microarchitecture can achieve up to 94\% of the ideal
saturation throughput.",
acknowledgement = ack-nhfb,
affiliation = "Ramanujam, Rohit Sunkam; Lin, Bill, Univ Calif San
Diego, San Diego, CA 92103 USA. Peh, Li-Shiuan,
Princeton Univ, Princeton, NJ 08544 USA.",
author-email = "vassos.soteriou@cut.ac.cy rsunkamr@ucsd.edu
billlin@ucsd.edu peh@princeton.edu",
da = "2019-06-20",
doc-delivery-number = "V17GC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; buffer circuits; Clocks; Computer
architecture; configuration management; Delay;
distributed shared-buffer; Interconnection
architectures; Internet; microarchitectural
configurations; Microarchitecture; network routing;
Network-on-a-chip; network-on-chip; NoC router; On-chip
interconnection networks; output-buffered router;
Pipeline processing; router architecture; Router
micro-architecture; Technological innovation;
Throughput",
keywords-plus = "ARCHITECTURE",
number-of-cited-references = "16",
ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X Soteriou,
Vassos/0000-0002-2818-0459",
research-areas = "Computer Science",
researcherid-numbers = "Lin, Binshan/A-9772-2009 Soteriou,
Vassos/H-4603-2014",
times-cited = "15",
unique-id = "Soteriou:2009:HTD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Guz:2009:MCV,
author = "Zvika Guz and Evgeny Bolotin and Idit Keidar and
Avinoam Kolodny and Avi Mendelson and Uri C. Weiser",
title = "Many-Core vs. Many-Thread Machines: Stay Away From the
Valley",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "25--28",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We study the tradeoffs between Many-Core machines like
Intel's Larrabee and Many-Thread machines like Nvidia
and AMD GPGPUs. We define a unified model describing a
superposition of the two architectures, and use it to
identify operation zones for which each machine is more
suitable. Moreover, we identify an intermediate zone in
which both machines deliver inferior performance. We
study the shape of this ``performance valley'' and
provide insights on how it can be avoided.",
acknowledgement = ack-nhfb,
affiliation = "Guz, Z (Reprint Author), Technion Israel Inst Technol,
EE Dept, IL-32000 Haifa, Israel. Guz, Zvika; Keidar,
Idit; Kolodny, Avinoam; Weiser, Uri C., Technion Israel
Inst Technol, EE Dept, IL-32000 Haifa, Israel. Bolotin,
Evgeny, Intel Corp, Santa Clara, CA 95051 USA.
Mendelson, Avi, Microsoft Corp, Redmond, WA 98052
USA.",
author-email = "zguz@tx.technion.ac.il evgeny.bolotin@intel.com
idish@ee.technion.ac.il kolodny@ee.technion.ac.il
avim@microsoft.com uri.weiser@ee.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "V17GC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Semiconductors Research Corporation (SRC);
Intel; Israeli Ministry of Science Knowledge Center on
Chip MultiProcessors",
funding-text = "We thank Ronny Ronen, Michael Behar, and Roni Rosner.
This work was partially supported by Semiconductors
Research Corporation (SRC), Intel, and the Israeli
Ministry of Science Knowledge Center on Chip
MultiProcessors.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "AMD GPGPU; architecture superposition; Bandwidth; Chip
Multiprocessors; Computer Systems; coprocessors; Delay;
Engines; Equations; GPGPU; Graphics; Intel's
Larrabee; many-core machines; many-thread machines;
Multi-core/single-chip multiprocessors;
multi-threading; multiprocessing systems; Nvidia GPGPU;
Parallel Architectures; parallel architectures;
Parallel processing; performance valley; Processor
Architectures; Shape",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "27",
unique-id = "Guz:2009:MCV",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
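To make the ``performance valley'' of the abstract above concrete, the following toy Python model (my own simplification with invented parameters, not the authors' unified model) assumes a shared cache whose per-thread slice shrinks as the thread count grows, and multithreaded cores that hide memory latency only when enough threads are available; intermediate thread counts lose on both fronts.

# Toy many-core vs. many-thread throughput model (illustrative parameters only).
def chip_ipc(n_threads, cores=32, total_cache_kb=16384, working_set_kb=512,
             cpi_exe=1.0, mem_refs_per_instr=0.2, mem_lat=200):
    # Shared cache: each thread's slice shrinks as the thread count grows.
    hit = min(1.0, (total_cache_kb / n_threads) / working_set_kb)
    stall = mem_refs_per_instr * (1.0 - hit) * mem_lat  # stall cycles per instruction
    threads_per_core = n_threads / cores
    # A multithreaded in-order core either saturates its pipeline (1/cpi_exe)
    # or is limited by how much memory latency its threads can cover.
    ipc_core = min(1.0 / cpi_exe, threads_per_core / (cpi_exe + stall))
    return cores * ipc_core

if __name__ == "__main__":
    for n in (32, 128, 512, 2048, 8192):
        print(f"{n:5d} threads -> chip IPC {chip_ipc(n):5.1f}")

With these made-up numbers the model peaks at both ends (32 threads, where every working set fits in cache, and 2048 or more threads, where latency is fully hidden) and dips in between, which is the valley the letter warns against.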
@Article{Desai:2009:AIC,
author = "Aniruddha Desai and Jugdutt Singh",
title = "Architecture Independent Characterization of Embedded
{Java} Workloads",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "29--32",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/java2000.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "This paper presents architecture independent
characterization of embedded Java workloads based on
the industry standard GrinderBench benchmark which
includes different classes of real world embedded Java
applications. This work is based on a custom-built
embedded Java Virtual Machine (JVM) simulator
specifically designed for embedded JVM modeling and
embodies domain specific details such as thread
scheduling, algorithms used for native CLDC APIs and
runtime data structures optimized for use in embedded
systems. The results presented include dynamic
execution characteristics, dynamic bytecode instruction
mix, application and API workload distribution, object
allocation statistics, instruction-set coverage, memory
usage statistics and method code and stack frame
characteristics.",
acknowledgement = ack-nhfb,
affiliation = "Desai, A (Reprint Author), La Trobe Univ, Bundoora,
Vic 3086, Australia. Desai, Aniruddha; Singh, Jugdutt,
La Trobe Univ, Bundoora, Vic 3086, Australia.",
author-email = "desai@ieee.org",
da = "2019-06-20",
doc-delivery-number = "V17GC",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Algorithm design and analysis; application program
interfaces; architecture independent characterization;
CLDC API; custom built embedded Java virtual machine
simulator; data structures; Data structures; Design
optimization; dynamic bytecode instruction mix; dynamic
execution characteristics; embedded Java workload;
Embedded Systems; embedded systems;
industry standard GrinderBench benchmark; instruction
sets; instruction-set coverage; Java; Java bytecode;
Job shop scheduling; JVM; memory usage statistics;
method code characteristics; multi-threading; object
allocation statistics; Runtime; runtime data structure;
scheduling; Scheduling algorithm; stack frame
characteristics; Statistical distributions; storage
allocation; thread scheduling; virtual machines;
Virtual machining; Workload Characterization",
number-of-cited-references = "8",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Desai:2009:AIC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Antelo:2009:CBF,
author = "Elisardo Antelo",
title = "A Comment on {``Beyond Fat-tree: Unidirectional
Load-Balanced Multistage Interconnection Network''}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "33--34",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
note = "See \cite{GomezRequena:2008:BFT}.",
abstract = "A recent work proposed to simplify fat-trees with
adaptive routing by means of a load-balancing
deterministic routing algorithm. The resultant network
has performance figures comparable to the more complex
adaptive routing fat-trees when packets need to be
delivered in order. In a second work by the same
authors published in IEEE CAL, they propose to simplify
the fat-tree to a unidirectional multistage
interconnection network (UMIN), using the same
load-balancing deterministic routing algorithm. They
show that comparable performance figures are achieved
with much lower network complexity. In this comment we
show that the proposed load-balancing deterministic
routing is in fact the routing scheme used by the
butterfly network. Moreover, we show that the properties
of the simplified UMIN network proposed by them are
intrinsic to the standard butterfly and other existing
UMINs.",
acknowledgement = ack-nhfb,
affiliation = "Antelo, E (Reprint Author), Univ Santiago de
Compostela, Dept Elect \& Comp Sci, Santiago De
Compostela, Spain. Univ Santiago de Compostela, Dept
Elect \& Comp Sci, Santiago De Compostela, Spain.",
da = "2019-06-20",
doc-delivery-number = "V17GC",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "adaptive routing fat-trees; Bismuth; butterfly
network; Computer science; deterministic algorithms;
fat-tree; hypercube networks; Interconnection networks;
Interconnections (Subsystems); load balancing
deterministic routing algorithm; Logic functions;
Multiprocessor interconnection networks; Multistage
Interconnection networks; network complexity; Network
topology; packets; resource allocation; Routing;
Switches; Technological innovation; Topology;
unidirectional load-balanced multistage interconnection
network; unidirectional multistage interconnection
network",
number-of-cited-references = "7",
ORCID-numbers = "Antelo, Elisardo/0000-0003-3743-3689",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Antelo:2009:CBF",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2009:Aa,
author = "Anonymous",
title = "{[Advertisement]}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "35--35",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.38",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:AIC,
author = "Anonymous",
title = "Ad --- {IEEE Computer Society Digital Library}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "36--36",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.39",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:EBCa,
author = "Anonymous",
title = "Editorial Board [Cover2]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "c2--c2",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.41",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:FCa,
author = "Anonymous",
title = "[{Front} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "c1--c1",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.40",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:IAa,
author = "Anonymous",
title = "Information for authors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "c3--c3",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.42",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:ICSa,
author = "Anonymous",
title = "{IEEE Computer Society} [Cover4]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "c4--c4",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.43",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Gaudiot:2009:INE,
author = "Jean-Luc Gaudiot",
title = "Introducing the New {Editor-in-Chief} of
{{\booktitle{IEEE Computer Architecture Letters}}}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "37--38",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.60",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
da = "2019-06-20",
doc-delivery-number = "V17GD",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
number-of-cited-references = "0",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Gaudiot:2009:INE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Skadron:2009:LE,
author = "K. Skadron",
title = "Letter from the {Editor}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "39--39",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.61",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Skadron:2009:U,
author = "Kevin Skadron",
title = "Untitled",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "39--39",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.61",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
da = "2019-06-20",
doc-delivery-number = "V17GD",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
number-of-cited-references = "0",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Skadron:2009:U",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Xin:2009:ELI,
author = "Jing Xin and Russ Joseph",
title = "Exploiting Locality to Improve Circuit-level Timing
Speculation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "40--43",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.50",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Circuit-level timing speculation has been proposed as
a technique to reduce dependence on design margins,
eliminating power and performance overheads. Recent
work has proposed microarchitectural methods to
dynamically detect and recover from timing errors in
processor logic. That work has not evaluated or
exploited the disparity of error rates at the level of
static instructions. In this paper, we demonstrate
pronounced locality in error rates at the level of
static instructions. We propose timing error prediction
to dynamically anticipate timing errors at the
instruction-level and reduce the costly recovery
penalty. This allows us to achieve 43.6\% power savings
when compared to a baseline policy while incurring only a
6.9\% performance penalty.",
acknowledgement = ack-nhfb,
affiliation = "Xin, J (Reprint Author), Northwestern Univ, Evanston,
IL 60208 USA. Xin, Jing; Joseph, Russ, Northwestern
Univ, Evanston, IL 60208 USA.",
da = "2019-06-20",
doc-delivery-number = "V17GD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CCF-0644332, CNS-0720820]",
funding-text = "Manuscript submitted: 17-Sep-2009. Manuscript
accepted: 08-Oct-2009. Final manuscript received:
15-Oct-2009. We thank the anonymous reviewers for their
constructive feedback. This work was supported by NSF
awards CAREER CCF-0644332 and CNS-0720820.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Circuit faults; circuit reliability; circuit-level
timing speculation; Costs; Delay; Dynamic voltage
scaling; Error analysis; Error locality; Frequency;
Hardware; instruction sets; Logic; logic design;
low-power design; Low-power design; microarchitectural
methods; microprocessor chips; Pipelines; power
elimination; processor logic; reliability; Reliability;
static instruction level; Testing and Fault-Tolerance;
Timing; timing error prediction; timing speculation",
number-of-cited-references = "12",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Xin:2009:ELI",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Sudarsanam:2009:PPD,
author = "Arvind Sudarsanam and Ramachandra Kallam and Aravind
Dasu",
title = "{PRR--PRR} Dynamic Relocation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "44--47",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.49",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Partial bitstream relocation (PBR) on FPGAs has been
gaining attention in recent years as a potentially
promising technique to scale parallelism of accelerator
architectures at run time, enhance fault tolerance,
etc. PBR techniques to date have focused on reading
inactive bitstreams stored in memory, on-chip or
off-chip, whose contents are generated for a specific
partial reconfiguration region (PRR) and modified on
demand for configuration into a PRR at a different
location. As an alternative, we propose a PRR-PRR
relocation technique to generate source and destination
addresses, read the bitstream from an active PRR
(source) in a non-intrusive manner, and write it to
destination PRR. We describe two options of realizing
this on Xilinx Virtex 4 FPGAs: (a) hardware-based
accelerated relocation circuit (ARC) and (b) a software
solution executed on Microblaze. A comparative
performance analysis to highlight the speed-up obtained
using ARC is presented. For real test cases,
the performance of our implementations is compared to
the estimated performance of two state-of-the-art
methods.",
acknowledgement = ack-nhfb,
affiliation = "Sudarsanam, A (Reprint Author), Utah State Univ, Dept
Elect \& Comp Engn, Logan, UT 84321 USA. Sudarsanam,
Arvind; Kallam, Ramachandra; Dasu, Aravind, Utah State
Univ, Dept Elect \& Comp Engn, Logan, UT 84321 USA.",
author-email = "arvind.sudarsanam@aggiemail.usu.edu
ramachandra.kallam@aggiemail.usu.edu
dasu@engineering.usu.edu",
da = "2019-06-20",
doc-delivery-number = "V17GD",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NASA; Micron Research Center",
funding-text = "Manuscript submitted: 03-Aug-2009. Manuscript
accepted: 16-Sep-2009. Final manuscript received:
24-Sep-2009. This work was supported by NASA and Micron
Research Center.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Acceleration; Accelerator architectures; accelerator
architectures; Bioreactors; Circuits; destination
address; Emerging technologies; Fault tolerance; fault
tolerance; field programmable gate arrays; Field
programmable gate arrays; Filters; FPGAs; Hardware;
hardware-based accelerated relocation circuit; parallel
architecture; parallel architectures; Parallel
processing; partial bitstream relocation; Partial
dynamic reconfiguration; Partial dynamic relocation;
partial reconfiguration region; PBR techniques;
Performance analysis; Performance Analysis and Design
Aids; PRR-PRR dynamic relocation technique; PRR-PRR
relocation technique; Reconfigurable computing;
Reconfigurable hardware; source address; Xilinx Virtex
4 FPGA",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "9",
unique-id = "Sudarsanam:2009:PPD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Leverich:2009:PMD,
author = "Jacob Leverich and Matteo Monchiero and Vanish Talwar
and Partha Ranganathan and Christos Kozyrakis",
title = "Power Management of Datacenter Workloads Using
Per-Core Power Gating",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "48--51",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.46",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "While modern processors offer a wide spectrum of
software-controlled power modes, most datacenters only
rely on Dynamic Voltage and Frequency Scaling (DVFS,
a.k.a. P-states) to achieve energy efficiency. This
paper argues that, in the case of datacenter workloads,
DVFS is not the only option for processor power
management. We make the case for per-core power gating
(PCPG) as an additional power management knob for
multi-core processors. PCPG is the ability to cut the
voltage supply to selected cores, thus reducing to
almost zero the leakage power for the gated cores.
Using a testbed based on a commercial 4-core chip and a
set of real-world application traces from enterprise
environments, we have evaluated the potential of PCPG.
We show that PCPG can significantly reduce a
processor's energy consumption (up to 40\%) without
significant performance overheads. When compared to
DVFS, PCPG is highly effective, saving up to 30\% more
energy than DVFS. When DVFS and PCPG operate together,
they can save up to almost 60\%.",
acknowledgement = ack-nhfb,
affiliation = "Leverich, J (Reprint Author), Hewlett Packard Labs,
Mississauga, ON, Canada. Leverich, Jacob; Monchiero,
Matteo; Talwar, Vanish; Ranganathan, Partha, Hewlett
Packard Labs, Mississauga, ON, Canada. Leverich, Jacob;
Kozyrakis, Christos, Stanford Univ, Stanford, CA 94305
USA.",
da = "2019-06-20",
doc-delivery-number = "V17GD",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application software; computer centres; Costs; data
center workloads; dynamic voltage and frequency
scaling; Dynamic voltage scaling; Energy consumption;
energy efficiency; Energy management; Energy-aware
systems; enterprise environments; Frequency;
integration and modeling; Jacobian matrices; leakage
power; microprocessor chips; Multicore processing;
multicore processors; per-core power gating; power
consumption; Power supplies; processor energy
consumption; processor power management;
software-controlled power modes; System architectures;
Testing",
number-of-cited-references = "10",
oa = "Green Published",
research-areas = "Computer Science",
times-cited = "43",
unique-id = "Leverich:2009:PMD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
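For a rough sense of the savings per-core power gating (PCPG) targets, here is a minimal sketch with invented per-core power figures and an invented 24-hour utilization trace (not the testbed measurements from the letter).

# Toy energy comparison for a 4-core chip over one low-utilization day.
ACTIVE_W = 20.0   # power of a busy core (assumed)
IDLE_W = 6.0      # leakage of an idle but powered core (assumed)
GATED_W = 0.2     # residual power of a power-gated core (assumed)

# Hourly trace: how many of the 4 cores the workload actually needs.
cores_needed = [1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 3, 2,
                2, 3, 4, 4, 3, 2, 2, 1, 1, 1, 1, 1]

def energy_wh(gating):
    idle_power = GATED_W if gating else IDLE_W
    # Each trace step is one hour, so summing watts gives watt-hours.
    return sum(busy * ACTIVE_W + (4 - busy) * idle_power for busy in cores_needed)

baseline = energy_wh(gating=False)
gated = energy_wh(gating=True)
print(f"baseline: {baseline:.0f} Wh, with PCPG: {gated:.0f} Wh "
      f"({100 * (1 - gated / baseline):.0f}% saved)")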
@Article{Musoll:2009:PVA,
author = "Enric Musoll",
title = "A Process-Variation Aware Technique for Tile-Based,
Massive Multicore Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "52--55",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.48",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Process variations in advanced nodes introduce
significant core-to-core performance differences in
single-chip multicore architectures. Isolating each
core with its own frequency and voltage island helps
improve the performance of the multi-core
architecture by operating at the highest frequency
possible rather than operating all the cores at the
frequency of the slowest core. However, inter-core
communication suffers from additional
cross-clock-domain latencies that can offset the
performance benefits. This work proposes the concept of
the configurable, variable-size frequency and voltage
domain, which is described in the context of a
tile-based, massive multi-core architecture.",
acknowledgement = ack-nhfb,
da = "2019-06-20",
doc-delivery-number = "V17GD",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Clocks; computer architecture; Context;
cross-clock-domain latency; Delay; Frequency; intercore
communication; massive multi-core; massive multicore
processors; Multi-core/single-chip multiprocessors;
multicore architecture; Multicore processing;
Network-on-a-chip; network-on-chip; On-chip
interconnection networks; Performance gain; Process
design; process-variation aware architecture;
process-variation aware technique; Runtime; single-chip
multicore architectures; tile-base architecture;
tile-based multicore processors; variable-size
frequency domain; Voltage; voltage domain",
number-of-cited-references = "5",
research-areas = "Computer Science",
times-cited = "3",
unique-id = "Musoll:2009:PVA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Baldassin:2009:CEC,
author = "Alexandro Baldassin and Felipe Klein and Guido Araujo
and Rodolfo Azevedo and Paulo Centoducatte",
title = "Characterizing the Energy Consumption of Software
Transactional Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "56--59",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.47",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The well-known drawbacks imposed by lock-based
synchronization have forced researchers to devise new
alternatives for concurrent execution, of which
transactional memory is a promising one. Extensive
research has been carried out on Software Transaction
Memory (STM), most of all concentrated on program
performance, leaving unattended other metrics of great
importance like energy consumption. This letter
presents a thorough evaluation of energy consumption in
a state-of-the-art STM. We show that energy and
performance results do not always follow the same trend
and, therefore, it might be appropriate to consider
different strategies depending on the focus of the
optimization. We also introduce a novel strategy based
on dynamic voltage and frequency scaling for contention
managers, revealing important energy and energy-delay
product improvements in high-contended scenarios. This
work is a first study towards a better understanding of
the energy consumption behavior of STM systems, and
could prompt STM designers to research new
optimizations in this area, paving the way for an
energy-aware transactional memory.",
acknowledgement = ack-nhfb,
affiliation = "Baldassin, A (Reprint Author), Univ Estadual Campinas,
Inst Comp, Campinas, SP, Brazil. Baldassin, Alexandro;
Klein, Felipe; Araujo, Guido; Azevedo, Rodolfo;
Centoducatte, Paulo, Univ Estadual Campinas, Inst Comp,
Campinas, SP, Brazil.",
author-email = "alebal@ic.unicamp.br klein@ic.unicamp.br
guido@ic.unicamp.br rodolfo@ic.unicamp.br
ducatte@ic.unicamp.br",
da = "2019-06-20",
doc-delivery-number = "V17GD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "FAPESP [2005/02565-9]",
funding-text = "Manuscript submitted: 02-Jul-2009. Manuscript
accepted: 23-Jul-2009. Final manuscript received:
05-Aug-2009. This work was supported in part by FAPESP
(2005/02565-9).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Concurrent computing; Concurrent Programming; Content
management; Costs; Dynamic voltage scaling; Energy
Consumption; Energy consumption; energy consumption;
Energy management; Energy-aware systems; energy-delay
product improvements; frequency scaling; Frequency
synchronization; Hardware; lock-based synchronization;
Measurement techniques; Memory management;
multiprocessing systems; Multiprocessor Systems;
multiprocessor systems;
Parallel Architectures; parallel architectures; Power
Management; Software performance; software
transactional memory; synchronisation; transaction
processing; Transactional Memory",
number-of-cited-references = "13",
ORCID-numbers = "Azevedo, Rodolfo/0000-0002-8803-0401",
research-areas = "Computer Science",
researcherid-numbers = "Azevedo, Rodolfo/F-3008-2012",
times-cited = "3",
unique-id = "Baldassin:2009:CEC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Balfour:2009:ORE,
author = "James Balfour and R. Curtis Harting and William J.
Dally",
title = "Operand Registers and Explicit Operand Forwarding",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "60--63",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.45",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Operand register files are small, inexpensive register
files that are integrated with function units in the
execute stage of the pipeline, effectively extending
the pipeline operand registers into register files.
Explicit operand forwarding lets software
opportunistically orchestrate the routing of operands
through the forwarding network to avoid writing
ephemeral values to registers. Both mechanisms let
software capture short-term reuse and locality close to
the function units, improving energy efficiency by
allowing a significant fraction of operands to be
delivered from inexpensive registers that are
integrated with the function units. An evaluation shows
that capturing operand bandwidth close to the function
units allows operand registers to reduce the energy
consumed in the register files and forwarding network
of an embedded processor by 61\%, and allows explicit
forwarding to reduce the energy consumed by 26\%.",
acknowledgement = ack-nhfb,
affiliation = "Balfour, J (Reprint Author), Stanford Univ, Comp Syst
Lab, Stanford, CA 94305 USA. Balfour, James; Harting,
R. Curtis; Dally, William J., Stanford Univ, Comp Syst
Lab, Stanford, CA 94305 USA.",
author-email = "jbalfour@cva.stanford.edu dally@cva.stanford.edu",
da = "2019-06-20",
doc-delivery-number = "V17GD",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Code generation; Computer aided
instruction; Computer System Implementation; Computer
Systems Organizat; embedded processor; Energy capture;
energy consumption; energy efficient register
organization; explicit operand forwarding; explicit
operand forwarding network; Fixed-point arithmetic;
impact of technology trends; Impact of VLSI on system
design; Laboratories; Logic; low-power programmable
processors; Memory hierarchy; microprocessor chips;
operand bandwidth; operand register files; operand
registers; Optimization; Physically aware
micro-architecture: power; Pipelines; Real-time and
embedded systems; Registers; Routing; software
reusability; thermal; VLSI Systems; Writing",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "9",
unique-id = "Balfour:2009:ORE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Chiou:2009:AFF,
author = "Derek Chiou and Hari Angepat and Nikhil A. Patil and
Dam Sunwoo",
title = "Accurate Functional-First Multicore Simulators",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "64--67",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.44",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Fast and accurate simulation of multicore systems
requires a parallelized simulator. This paper describes
a novel method to build parallelizable and
cycle-accurate-capable functional-first simulators of
multicore targets.",
acknowledgement = ack-nhfb,
affiliation = "Chiou, D (Reprint Author), Univ Texas Austin, Dept
Elect \& Comp Engn, Austin, TX 78712 USA. Chiou, Derek;
Angepat, Hari; Patil, Nikhil A.; Sunwoo, Dam, Univ
Texas Austin, Dept Elect \& Comp Engn, Austin, TX 78712
USA.",
author-email = "derek@ece.utexas.edu angepat@ece.utexas.edu
npatil@ece.utexas.edu sunwoo@ece.utexas.edu",
da = "2019-06-20",
doc-delivery-number = "V17GD",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [0615352,
0747438]",
funding-text = "This material is based upon work supported by the
National Science Foundation under Grants No. 0615352
and No. 0747438 and gifts from Intel and IBM. We thank
the anonymous reviewers for their comments.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "circuit simulation; Computational modeling; Computer
simulation; field programmable gate arrays;
FPGA-accelerated simulation technologies;
functional-first multicore simulators; Instruction
sets; integration and modeling; Microarchitecture;
Modeling and Visualization; Modeling of computer
architecture; Modeling techniques;
Multi-core/single-chip multiprocessors; Multicore
processing; multicore system simulation; Parallel;
Parallel Architectures; parallelized simulator;
Performance Analysis and Design Aids; Predictive
models; Simulation; Software prototyping; System
architectures; Timing; Virtual machining; Virtual
prototyping",
number-of-cited-references = "17",
research-areas = "Computer Science",
times-cited = "7",
unique-id = "Chiou:2009:AFF",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2009:Ab,
author = "Anonymous",
title = "{[Advertisement]}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "68--68",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.52",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:Ac,
author = "Anonymous",
title = "{[Advertisement]}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "69--69",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.53",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:Ad,
author = "Anonymous",
title = "{[Advertisement]}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "70--70",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.55",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:Ae,
author = "Anonymous",
title = "{[Advertisement]}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "71--71",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.54",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:Af,
author = "Anonymous",
title = "{[Advertisement]}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "72--72",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.51",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:EBCb,
author = "Anonymous",
title = "Editorial Board [Cover2]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "c2--c2",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.57",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:FCb,
author = "Anonymous",
title = "[{Front} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "c1--c1",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.56",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:IAb,
author = "Anonymous",
title = "Information for authors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "c3--c3",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.58",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:ICSb,
author = "Anonymous",
title = "{IEEE Computer Society} [Cover4]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "c4--c4",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.59",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Patil:2010:URT,
author = "Shruti Patil and David J. Lilja",
title = "Using Resampling Techniques to Compute Confidence
Intervals for the Harmonic Mean of Rate-Based
Performance Metrics",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Rate-based metrics such as floating point operations
per second, instructions per cycle and so forth are
commonly used to measure computer performance. In
addition to the average or mean performance of the
metric, indicating the precision of the mean using
confidence intervals helps to make informed decisions
and comparisons with the data. In this paper, we
discuss the determination of confidence intervals for
the harmonic mean of rate-based metrics using two
statistical resampling techniques, Jackknife and
Bootstrap. We show, using Monte Carlo simulations, that
resampling indeed works as expected, and can be used
for generating confidence intervals for the harmonic
mean.",
acknowledgement = ack-nhfb,
affiliation = "Patil, S (Reprint Author), Univ Minnesota Twin Cities,
Dept Elect \& Comp Engn, St Paul, MN USA. Patil,
Shruti; Lilja, David J., Univ Minnesota Twin Cities,
Dept Elect \& Comp Engn, St Paul, MN USA.",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [CCF-0541162]",
funding-text = "This work was supported in part by the National
Science Foundation grant no. CCF-0541162. Any opinions,
findings and conclusions or recommendations expressed
in this material are those of the authors and do not
necessarily reflect the views of the NSF. The authors
also thank the University of Minnesota Statistical
Consulting Service for their helpful insights.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Arithmetic; bootstrap; bootstrap technique; Cities and
towns; Computer errors; Computer performance; computer
performance measurement; Confidence intervals;
confidence intervals; Electric variables measurement;
Equations; floating point operations; Harmonic
analysis; harmonic mean; jackknife; jackknife
technique; Monte Carlo methods; Monte Carlo
simulations; Nonparametric statistics; Performance
analysis; performance evaluation; Performance of
Systems; Probability distribution; rate-based
performance metrics; resampling; statistical analysis;
statistical resampling techniques; Statistics",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "9",
unique-id = "Patil:2010:URT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
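For readers who want to reproduce the kind of interval the abstract above describes, a minimal Python sketch follows; the sample IPC values, the 95 percent level, and the normal-approximation jackknife interval are illustrative assumptions, not the paper's exact procedure.

import random
import statistics

def harmonic_mean(rates):
    # Harmonic mean of positive rate-based measurements (e.g., per-benchmark IPC).
    return len(rates) / sum(1.0 / r for r in rates)

def bootstrap_ci(rates, reps=10000, alpha=0.05):
    # Percentile bootstrap: resample with replacement and recompute the statistic.
    resampled = sorted(
        harmonic_mean([random.choice(rates) for _ in rates]) for _ in range(reps)
    )
    return resampled[int(reps * alpha / 2)], resampled[int(reps * (1 - alpha / 2)) - 1]

def jackknife_ci(rates, z=1.96):
    # Leave-one-out jackknife standard error with a normal-approximation interval.
    n = len(rates)
    loo = [harmonic_mean(rates[:i] + rates[i + 1:]) for i in range(n)]
    mean_loo = statistics.mean(loo)
    se = ((n - 1) / n * sum((x - mean_loo) ** 2 for x in loo)) ** 0.5
    hm = harmonic_mean(rates)
    return hm - z * se, hm + z * se

if __name__ == "__main__":
    ipc = [1.2, 0.8, 1.5, 0.9, 1.1, 1.3]   # illustrative per-benchmark IPC values
    print("harmonic mean:", round(harmonic_mean(ipc), 3))
    print("bootstrap CI :", tuple(round(x, 3) for x in bootstrap_ci(ipc)))
    print("jackknife CI :", tuple(round(x, 3) for x in jackknife_ci(ipc)))

Note that the percentile bootstrap makes no normality assumption, while the jackknife interval as written here relies on one; both are standard resampling constructions.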
@Article{Seznec:2010:PCM,
author = "Andre Seznec",
title = "A Phase Change Memory as a Secure Main Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/prng.bib",
abstract = "Phase change memory (PCM) technology appears as more
scalable than DRAM technology. As PCM exhibits access
time slightly longer but in the same range as DRAMs,
several recent studies have proposed to use PCMs for
designing main memory systems. Unfortunately PCM
technology suffers from a limited write endurance;
typically each memory cell can be only be written a
large but still limited number of times (10(7) to 10(9)
writes are reported for current technology). Till now,
research proposals have essentially focused their
attention on designing memory systems that will survive
to the average behavior of conventional applications.
However PCM memory systems should be designed to
survive worst-case applications, i.e., malicious
attacks targeting the physical destruction of the
memory through overwriting a limited number of memory
cells.",
acknowledgement = ack-nhfb,
affiliation = "Seznec, A (Reprint Author), INRIA Rennes Bretagne
Atlantique, Ctr Rech, Campus Beaulieu, F-35042 Rennes,
France. INRIA Rennes Bretagne Atlantique, Ctr Rech,
F-35042 Rennes, France.",
author-email = "seznec@irisa.fr",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "European Commission [27648]",
funding-text = "This work was partially supported by the European
Commission in the context of the SARC integrated
project \#27648 (FP6).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application software; DRAM technology; Energy
consumption; memory cells; Memory Structures; PCM
memory systems; Phase change materials; phase change
memories; phase change memory; Phase change memory;
Physics computing; Proposals; Random access memory;
Random number generation; Random processes;
Scalability; secure PCM-based main memory;
Semiconductor Memories",
keywords-plus = "TECHNOLOGY",
number-of-cited-references = "8",
oa = "Green Published",
research-areas = "Computer Science",
times-cited = "17",
unique-id = "Seznec:2010:PCM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Park:2010:EIP,
author = "Seon-yeong Park and Euiseong Seo and Ji-Yong Shin and
Seungryoul Maeng and Joonwon Lee",
title = "Exploiting Internal Parallelism of Flash-based
{SSDs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "For the last few years, the major driving force behind
the rapid performance improvement of SSDs has been the
increment of parallel bus channels between a flash
controller and flash memory packages inside the
solid-state drives (SSDs). However, there are other
internal parallelisms inside SSDs yet to be explored.
In order to improve performance further by utilizing
the parallelism, this paper suggests request
rescheduling and dynamic write request mapping.
Simulation results with real workloads have shown that
the suggested schemes improve the performance of the
SSDs by up to 15\% without any additional hardware
support.",
acknowledgement = ack-nhfb,
affiliation = "Park, SY (Reprint Author), Korea Adv Inst Sci \&
Technol, Taejon, South Korea. Park, Seon-yeong; Shin,
Ji-Yong; Maeng, Seungryoul, Korea Adv Inst Sci \&
Technol, Taejon, South Korea. Seo, Euiseong, Ulsan Natl
Inst Sci \& Technol, Ulsan, South Korea. Lee, Joonwon,
Sungkyunkwan Univ, Seoul, South Korea.",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Korea government(MEST) [2009-0080381]",
funding-text = "This work was supported by the Korea Science and
Engineering Foundation (KOSEF) grant funded by the
Korea government (MEST), (No. 2009-080381)",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Delay; Drives; exploiting internal parallelism; flash
based SSD; flash controller; flash memories; Flash
memory; flash memory packages; Force control; Hard
disks; I/O scheduling; Input/Output Devices; Packaging;
parallel bus channels; parallel processing; Parallel
systems; parallelism; pipeline processing; Pipeline
processing; Secondary storage; Simulation; Solid state
circuits; solid state drives; Solid-State Drives
(SSDs); Space technology; Storage Management; system
buses; Throughput",
number-of-cited-references = "6",
research-areas = "Computer Science",
researcherid-numbers = "Maeng, Seungryoul/C-1882-2011",
times-cited = "35",
unique-id = "Park:2010:EIP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Subramoni:2010:ISI,
author = "Hari Subramoni and Fabrizio Petrini and Virat Agarwal
and Davide Pasetto",
title = "Intra-Socket and Inter-Socket Communication in
Multi-core Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "13--16",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The increasing computational and communication demands
of the scientific and industrial communities require a
clear understanding of the performance trade-offs
involved in multi-core computing platforms. Such
analysis can help application and toolkit developers in
designing better, topology-aware communication
primitives intended to suit the needs of various
high-end computing applications. In this paper, we take on
the challenge of designing and implementing a portable
intra-core communication framework for streaming
computing and evaluate its performance on some popular
multi-core architectures developed by Intel, AMD and
Sun. Our experimental results, obtained on the Intel
Nehalem, AMD Opteron and Sun Niagara 2 platforms, show
that we are able to achieve an intra-socket small
message latency between 120 and 271 nanoseconds, while
the inter-socket small message latency is between 218
and 320 nanoseconds. The maximum intra-socket
communication bandwidth ranges from 0.179 (Sun Niagara
2) to 6.5 (Intel Nehalem) Gbytes/second. We were also
able to obtain an inter-socket communication
performance of 1.2 and 6.6 Gbytes/second on the AMD
Opteron and Intel Nehalem, respectively.",
acknowledgement = ack-nhfb,
affiliation = "Subramoni, H (Reprint Author), IBM TJ Watson, Yorktown
Hts, NY 10598 USA. Subramoni, Hari; Petrini, Fabrizio;
Agarwal, Virat, IBM TJ Watson, Yorktown Hts, NY 10598
USA. Pasetto, Davide, IBM Computat Sci Ctr, Dublin,
Ireland. Subramoni, Hari, Ohio State Univ, Columbus, OH
43210 USA.",
author-email = "subramon@cse.ohio-state.edu fpetrin@us.ibm.com
viratagarwal@us.ibm.com pasetto\_davide@ie.ibm.com",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "AMD Opteron; Bandwidth; Communication industry;
communication primitives; Communication Protocols;
Computer applications; Computer architecture; Computer
industry; Delay; General; Hardware; High Performance
Computing; industrial communities; Intel Nehalem;
intersocket communication; Intrasocket communication;
multicore architectures; Multicore Processors;
multicore systems; multiprocessing systems; parallel
architectures; Performance of Systems; Portable
computers; streaming computing; Sun; toolkit
developers; Topology; topology aware",
keywords-plus = "NETWORK",
number-of-cited-references = "8",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Subramoni:2010:ISI",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Hoang:2010:CAN,
author = "Giang Hoang and Chang Bae and John Lange and Lide
Zhang and Peter Dinda and Russ Joseph",
title = "A Case for Alternative Nested Paging Models for
Virtualized Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "Address translation often emerges as a critical
performance bottleneck for virtualized systems and has
recently been the impetus for hardware paging
mechanisms. These mechanisms apply similar translation
models for both guest and host address translations. We
make an important observation that the model employed
to translate from guest physical addresses (GPAs) to
host physical addresses (HPAs) is in fact orthogonal to
the model used to translate guest virtual addresses
(GVAs) to GPAs. Changing this model requires VMM
cooperation, but has no implications for guest OS
compatibility. As an example, we consider a hashed page
table approach for GPA -> HPA translation. Nested
paging, widely considered the most promising approach,
uses unhashed multi-level forward page tables for both
GVA -> GPA and GPA -> HPA translations, resulting in a
potential $O(n^2)$ page walk cost on a TLB miss, for
n-level page tables. In contrast, the hashed page table
approach results in an expected $O(n)$ cost. Our
simulation results show that when a hashed page table
is used in the nested level, the performance of the
memory system is not worse, and sometimes even better
than a nested forward-mapped page table due to reduced
page walks and cache pressure. This showcases the
potential for alternative paging mechanisms.",
acknowledgement = ack-nhfb,
affiliation = "Hoang, GA (Reprint Author), Northwestern Univ,
Evanston, IL 60208 USA. Hoang, Giang; Bae, Chang;
Lange, John; Dinda, Peter; Joseph, Russ, Northwestern
Univ, Evanston, IL 60208 USA. Zhang, Lide, Univ
Michigan, Ann Arbor, MI 48109 USA.",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "address translation; Computer Architecture; Computer
architecture; Computer Architecture; Computer displays;
Control systems; Costs; Emerging technologies; file
organisation; guest physical addresses; guest virtual
addresses; Hardware; hardware paging mechanisms;
Hardware/software interfaces; host physical addresses;
Instruction sets; Nested Paging; nested paging models;
Operating systems; OS compatibility; paged storage;
Platform virtualization; Software performance; storage
allocation; unhashed multilevel forward page tables;
virtual machine monitors; Virtual machine monitors;
virtual machines; Virtual Memory; Virtualization;
virtualized systems; VMM cooperation",
number-of-cited-references = "11",
research-areas = "Computer Science",
researcherid-numbers = "Joseph, Russell/B-7230-2009 Dinda,
Peter/B-7142-2009",
times-cited = "5",
unique-id = "Hoang:2010:CAN",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Krimer:2010:SNT,
author = "Evgeni Krimer and Robert Pawlowski and Mattan Erez and
Patrick Chiang",
title = "{Synctium}: a Near-Threshold Stream Processor for
Energy-Constrained Parallel Applications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "21--24",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "While Moore's law scaling continues to double
transistor density every technology generation, supply
voltage reduction has essentially stopped, increasing
both power density and total energy consumed in
conventional microprocessors. Therefore, future
processors will require an architecture that can: (a)
take advantage of the massive amount of transistors
that will be available; and (b) operate these
transistors in the near-threshold supply domain,
thereby achieving near optimal energy/computation by
balancing the leakage and dynamic energy consumption.
Unfortunately, this optimality is typically achieved
while running at very low frequencies (i.e.,
0.1--10MHz) and with only one computation executing per
cycle, such that performance is limited. Further,
near-threshold designs suffer from severe process
variability that can introduce extremely large delay
variations. In this paper, we propose a
near-energy-optimal stream processor family that relies on
massively parallel, near-threshold VLSI circuits and
interconnect, incorporating cooperative
circuit/architecture techniques to tolerate the
expected large delay variations. Initial estimations
from circuit simulations show that it is possible to
achieve greater than 1 Giga-Operations per second
(1GOP/s) with less than 1mW total power consumption,
enabling a new class of energy-constrained,
high-throughput computing applications.",
acknowledgement = ack-nhfb,
affiliation = "Krimer, E (Reprint Author), UT Austin, ECE, Austin, TX
USA. Krimer, Evgeni; Erez, Mattan, UT Austin, ECE,
Austin, TX USA. Pawlowski, Robert; Chiang, Patrick,
Oregon State Univ, EECS, Corvallis, OR 97331 USA.",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Circuits; Computer architecture; conventional
microprocessors; Delay; double transistor density;
dynamic energy consumption; energy constrained parallel
applications; Energy consumption; etc.; Frequency;
impact of technology trends; Low-power design;
Microprocessors; Mobile processors; Moore's Law; near
threshold stream processor; optimisation; parallel
programming; Physically aware micro-architecture:
power; pipeline processing; Power generation; SIMD
processors; supply voltage reduction; Synctium;
thermal; Very large scale integration; VLSI circuits;
Voltage",
keywords-plus = "CIRCUITS; TOLERANCE; CMOS",
number-of-cited-references = "19",
oa = "Green Published",
research-areas = "Computer Science",
times-cited = "22",
unique-id = "Krimer:2010:SNT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Hilton:2010:SDE,
author = "Andrew Hilton and Amir Roth",
title = "{SMT-Directory}: Efficient Load-Load Ordering for
{SMT}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "25--28",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Memory models like SC, TSO, and PC enforce load-load
ordering, requiring that loads from any single thread
appear to occur in program order to all other threads.
Out-of-order execution can violate load-load ordering.
Conventional multi-processors with out-of-order cores
detect load-load ordering violations by snooping an
age-ordered load queue on cache invalidations or
evictions, events that act as proxies for the completion
of remote stores. This mechanism becomes less efficient
in an SMT processor, as every completing store must
search the load queue segments of all other threads.
This inefficiency exists because store completions from
other threads in the same core are not filtered by the
cache and coherence protocol: thread 0 observes all of
thread 1's stores, not only the first store to every
cache line. SMT-Directory eliminates this overhead by
implementing the filtering traditionally provided by
the cache in the cache itself. SMT-Directory adds a
per-thread ``read'' bit to every data cache line. When
a load executes, it sets the bit corresponding to its
thread. When a store completes and writes to the cache,
it checks the SMT-Directory bits of its cache line and
searches the load queue segments only of those threads
whose bits are set. As a result, local store
completions trigger searches only for data that is
actually shared.",
acknowledgement = ack-nhfb,
affiliation = "Hilton, A (Reprint Author), Univ Penn, Philadelphia,
PA 19104 USA. Hilton, Andrew; Roth, Amir, Univ Penn,
Philadelphia, PA 19104 USA.",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CCF-0541292]",
funding-text = "We thank Arun Raghavan for the address traces and Milo
Martin for comments on early versions of this work. The
anonymous reviewers provided valuable feedback. This
work was supported by NSF award CCF-0541292.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "age-ordered load queue; Buffer storage; cache
invalidations; cache protocol; cache storage; coherence
protocol; consistency models; data cache line;
directory; Filtering; Load modeling; load queue search;
load queue segments; load-load ordering; Memory
hierarchy; multi-threading; multiprocessing systems;
Multithreaded processors; Multithreading; Out of order;
Protocols; Read-write memory; Simultaneous
multithreading; SMT processor; Surface-mount
technology; Writing",
keywords-plus = "CONSISTENCY",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Hilton:2010:SDE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Hammoud:2010:DPA,
author = "Mohammad Hammoud and Sangyeun Cho and Rami G. Melhem",
title = "A Dynamic Pressure-Aware Associative Placement
Strategy for Large Scale Chip Multiprocessors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "29--32",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper describes dynamic pressure-aware
associative placement (DPAP), a novel distributed cache
management scheme for large-scale chip multiprocessors.
Our work is motivated by the large non-uniform
distribution of memory accesses across cache sets in
different L2 banks. DPAP decouples the physical
locations of cache blocks from their addresses for the
sake of reducing misses caused by destructive
interference. Temporal pressure at the on-chip
last-level cache is continuously collected at a group
granularity (a group comprises local cache sets) and
periodically recorded at the memory controller(s) to
guide the placement process. An incoming block is
consequently placed at a cache group that exhibits the
minimum pressure. Simulation results using a
full-system simulator demonstrate that DPAP outperforms
the baseline shared NUCA scheme by an average of 8.3\%
and by as much as 18.9\% for the benchmark programs we
examined. Furthermore, evaluations showed that DPAP
outperforms related cache designs.",
acknowledgement = ack-nhfb,
affiliation = "Hammoud, M (Reprint Author), Univ Pittsburgh, Dept
Comp Sci, Pittsburgh, PA 15260 USA. Hammoud, Mohammad;
Cho, Sangyeun; Melhem, Rami G., Univ Pittsburgh, Dept
Comp Sci, Pittsburgh, PA 15260 USA.",
author-email = "mhh@cs.pitt.edu cho@cs.pitt.edu melhem@cs.pitt.edu",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CCF-0952273]",
funding-text = "This work was supported in part by NSF grant
CCF-0952273.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Aggregate Cache Sets; Aggregates; Associative
Placement; cache storage; Chip Multiprocessors;
Computer architecture; Computer science; destructive
interferences; distributed cache management; DPAP;
dynamic pressure aware associative placement strategy;
Interference; large scale chip multiprocessors;
Large-scale systems; Local Cache Sets; memory access
distribution; memory controllers; microprocessor chips;
Network-on-a-chip; NUCA scheme; Pressure control;
Pressure-Aware Placement; Random access memory",
number-of-cited-references = "19",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Hammoud:2010:DPA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kim:2010:LUC,
author = "Hyungjun Kim and Paul V. Gratz",
title = "Leveraging Unused Cache Block Words to Reduce Power in
{CMP} Interconnect",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "33--36",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Power is of paramount importance in modern computer
system design. In particular, the cache interconnect in
future CMP designs is projected to consume up to half
of the system power for cache fills and spills [8].
Despite the power consumed by spills and fills, a
significant percentage of each cache line is unused
prior to eviction from the cache. If unused cache block
words can be identified, this information can be used
to improve CMP interconnect power and energy
consumption. We propose a new method of CMP
interconnect packet composition, leveraging unused data
to reduce power. These methods are well suited to
interconnection networks with high-bandwidth wires, and
do not require expensive multi-ported memory systems.
Assuming perfect prediction, our techniques achieve an
average of similar to 37\% savings in total dynamic
link power consumption. With our current best
prediction mechanism, our techniques reduce dynamic
power consumption by similar to 23\% on average.",
acknowledgement = ack-nhfb,
affiliation = "Kim, H (Reprint Author), Texas A\&M Univ, Dept Elect
\& Comp Engn, College Stn, TX 77843 USA. Kim, Hyungjun;
Gratz, Paul V., Texas A\&M Univ, Dept Elect \& Comp
Engn, College Stn, TX 77843 USA.",
author-email = "hyungjuk@tamu.edu pgratz@tamu.edu",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; cache fills; cache interconnect; Cache
memories; cache spills; cache storage; CMP
interconnect; computer system design; Delay; dynamic
power; Energy consumption; energy consumption; flit
encoding; integrated circuit design; Interconnection
architectures; Low-power design; memory system;
microprocessor chips; Multicore; Multiprocessor
interconnection networks; Network-on-a-chip; NoC; power
aware computing; Power engineering computing; power
reduction; Power system interconnection; Random access
memory; total dynamic link power consumption; unused
cache block words; Very large scale integration;
Wires",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Kim:2010:LUC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2010:EBCa,
author = "Anonymous",
title = "Editorial Board [Cover2]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "c2--c2",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.11",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:FCa,
author = "Anonymous",
title = "[{Front} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "c1--c1",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.10",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:IAa,
author = "Anonymous",
title = "Information for authors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "c3--c3",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.12",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:ICSa,
author = "Anonymous",
title = "{IEEE Computer Society} [Cover4]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "c4--c4",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.13",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Skadron:2010:ELE,
author = "K. Skadron",
title = "Editorial: Letter from the {Editor-in-Chief}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "37--44",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.27",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Skadron:2010:U,
author = "Kevin Skadron",
title = "Untitled",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "37--44",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.27",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
da = "2019-06-20",
doc-delivery-number = "731BX",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
number-of-cited-references = "0",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Skadron:2010:U",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Iqbal:2010:POS,
author = "Syed Muhammad Zeeshan Iqbal and Yuchen Liang and Hakan
Grahn",
title = "{ParMiBench} --- an Open-Source Benchmark for Embedded
Multiprocessor Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "45--48",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.14",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/gnu.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Multicore processors are the main computing platform
in laptops, desktop, and servers today, and are making
their way into the embedded systems market also. Using
benchmarks is a common approach to evaluate the
performance of a system. However, benchmarks for
embedded systems have so far been either targeted for a
uni-processor environment, e.g., MiBench, or have been
commercial, e.g., MultiBench by EEMBC. In this paper,
we propose and implement an open source benchmark,
ParMiBench, targeted for multiprocessor-based embedded
systems. ParMiBench consists of parallel
implementations of seven compute intensive algorithms
from the uni-processor benchmark suite MiBench. The
applications are selected from four domains: Automation
and Industry Control, Network, Office, and Security.",
acknowledgement = ack-nhfb,
affiliation = "Iqbal, SMZ (Reprint Author), Blekinge Inst Technol,
Sch Comp, SE-37179 Karlskrona, Sweden. Iqbal, Syed
Muhammad Zeeshan; Liang, Yuchen; Grahn, Hakan, Blekinge
Inst Technol, Sch Comp, SE-37179 Karlskrona, Sweden.",
author-email = "mzeeshan01@gmail.com yuchen9760@gmail.com
hakan.grahn@bth.se",
da = "2019-06-20",
doc-delivery-number = "731BX",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "benchmark testing; Benchmark testing; Concurrent
Programming; desktop; embedded multiprocessor system;
Embedded system; embedded system market; embedded
systems; intensive algorithm; laptop; Load management;
Multicore processing; multiprocessing systems;
Multiprocessor Systems; open-source benchmark; parallel
architectures; parallel implementation; ParMiBench;
Performance Evaluation; Performance evaluation;
Performance Evaluation; Program processors; public
domain software; Security; uniprocessor benchmark
suite",
number-of-cited-references = "9",
ORCID-numbers = "Grahn, Hakan/0000-0001-9947-1088",
research-areas = "Computer Science",
researcherid-numbers = "Grahn, Hakan/G-9720-2011",
times-cited = "32",
unique-id = "Iqbal:2010:POS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Fang:2010:BRP,
author = "Zhen Fang and Erik G. Hallnor and Bin Li and Michael
Leddige and Donglai Dai and Seung Eun Lee and Srihari
Makineni and Ravi Iyer",
title = "{Boomerang}: Reducing Power Consumption of Response
Packets in {NoCs} with Minimal Performance Impact",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "49--52",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.15",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Most power reduction mechanisms for NoC channel
buffers rely on on-demand wakeup to transition from a
low-power state to the active state. Two drawbacks of
on-demand wakeup limit its effectiveness: (1)
performance impact caused by wakeup delays, and (2)
energy and area cost of sleep circuitry itself. What
makes the problem harder to solve is that solutions to
either problem tend to exacerbate the other. For
example, faster wakeup from a power-gated state
requires greater charge/discharge current for the sleep
transistors while using nimbler sleep transistors
implies long wakeup delays. As a result, powerdowns
have to be conservatively prescribed, missing many
power-saving opportunities. We propose Boomerang, a
novel power-saving method that overcomes the above
drawbacks. Specifically, based on the observation that
a response is always preceded by a request, we let the
request trigger wakeup of the buffer that is to be used
by its response in the (near) future, instead of using
on-demand wakeups. Hiding the wakeup delay completely,
Boomerang allows us to employ aggressive sleep policies
and use low-cost power gating circuits on response
buffers.",
acknowledgement = ack-nhfb,
affiliation = "Fang, Z (Reprint Author), Intel Corp, Santa Clara, CA
95051 USA. Fang, Zhen; Hallnor, Erik G.; Li, Bin;
Leddige, Michael; Dai, Donglai; Makineni, Srihari;
Iyer, Ravi, Intel Corp, Santa Clara, CA 95051 USA. Lee,
Seung Eun, Seoul Natl Univ Sci \& Technol, Seoul, South
Korea.",
author-email = "zhen.fang@intel.com",
da = "2019-06-20",
doc-delivery-number = "731BX",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Boomerang; buffer circuits; charge-discharge current;
Delay; Interconnection networks; Leakage currents;
leakage power; low-cost power gating circuits;
low-power design; Mobile communication;
network-on-chip; nimbler sleep transistors; NoC channel
buffers; packet-switching networks; power aware
computing; power consumption reduction mechanism;
power-gated state; power-saving method; response
packets; Routing; Switches; System-on-a-chip;
Transistors; wakeup delay",
number-of-cited-references = "8",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Fang:2010:BRP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Lyons:2010:ASF,
author = "Michael J. Lyons and Mark Hempstead and Gu-Yeon Wei
and David Brooks",
title = "The Accelerator Store framework for high-performance,
low-power accelerator-based systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "53--56",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.16",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Hardware acceleration can increase performance and
reduce energy consumption. To maximize these benefits,
accelerator-based systems that emphasize computation on
accelerators (rather than on general purpose cores)
should be used. We introduce the ``accelerator store,''
a structure for sharing memory between accelerators in
these accelerator-based systems. The accelerator store
simplifies accelerator I/O and reduces area by mapping
memory to accelerators when needed at runtime.
Preliminary results demonstrate a 30\% system area
reduction with no energy overhead and less than 1\%
performance overhead in contrast to conventional DMA
schemes.",
acknowledgement = ack-nhfb,
affiliation = "Lyons, MJ (Reprint Author), Harvard Univ, Sch Engn \&
Appl Sci, Cambridge, MA 02138 USA. Lyons, Michael J.;
Brooks, David, Harvard Univ, Sch Engn \& Appl Sci,
Cambridge, MA 02138 USA.",
author-email = "mjlyons@eecs.harvard.edu mhempstead@coe.drexel.edu
guyeon@eecs.harvard.edu dbrooks@eecs.harvard.edu",
da = "2019-06-20",
doc-delivery-number = "731BX",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [IIS-0926148];
Gigascale Systems Research Center",
funding-text = "This material is based upon work supported by the
National Science Foundation under Grant No.
IIS-0926148. The authors acknowledge the support of the
Gigascale Systems Research Center, one of six research
centers funded under the Focus Center Research Program
(FCRP), a Semiconductor Research Corporation entity.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Acceleration; accelerator store framework; energy
consumption; General; hardware acceleration;
Heterogeneous (hybrid) systems; high-performance
low-power accelerator-based system; low-power
electronics; memory architecture; Memory management;
memory mapping; memory sharing; Program processors;
Random access memory; Real time systems; Real-time and
embedded systems; shared memory systems; storage
management; Throughput; Transform coding",
number-of-cited-references = "10",
research-areas = "Computer Science",
times-cited = "13",
unique-id = "Lyons:2010:ASF",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Manevich:2010:CAR,
author = "Ran Manevich and Israel Cidon and Avinoam Kolodny and
Isask'har Walter",
title = "Centralized Adaptive Routing for {NoCs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "57--60",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.17",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "As the number of applications and programmable units
in CMPs and MPSoCs increases, the Network-on-Chip (NoC)
encounters diverse and time dependent traffic loads.
This trend motivates the introduction of NoC
load-balanced, adaptive routing mechanisms that achieve
higher throughput as compared with traditional
oblivious routing schemes that are perceived as better
suited for hardware implementations. However, an
efficient adaptive routing scheme should base its
decisions on the global state of the system rather than
on local or regional congestion signals as is common in
current adaptive routing schemes. In this paper we
introduce a novel paradigm of NoC centralized adaptive
routing, and a specific design for mesh topology. Our
scheme continuously monitors the global traffic load in
the network and modifies the routing of packets to
improve load balancing accordingly. In our specific
mesh-based design, XY or YX routes are adaptively
selected for each source-destination pair. We show that
while our implementation is scalable and lightweight in
hardware costs, it outperforms distributed adaptive
routing schemes in terms of load balancing and
throughput.",
acknowledgement = ack-nhfb,
affiliation = "Manevich, R (Reprint Author), Technion Israel Inst
Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
Manevich, Ran; Cidon, Israel; Kolodny, Avinoam; Walter,
Isask'har, Technion Israel Inst Technol, Dept Elect
Engn, IL-32000 Haifa, Israel.",
author-email = "ranman@tx.technion.ac.il cidon@ee.technion.ac.il
kolodny@ee.technion.ac.il zigi@tx.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "731BX",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "adaptive routing; Adaptive systems; centralized
adaptive routing; Computer architecture; distributed
adaptive routing; global state; load balanced adaptive
routing; load balancing; Load control; Load management;
mesh based design; mesh topology; network on chip;
Network on Chip; network routing; Network-on-Chip;
network-on-chip; NoC; packet routing; programmable
unit; regional congestion signal; routing algorithms;
Routing protocols; Telecommunication traffic;
Throughput; time dependent traffic load",
number-of-cited-references = "7",
research-areas = "Computer Science",
times-cited = "9",
unique-id = "Manevich:2010:CAR",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Zhang:2010:FCA,
author = "Meng Zhang and Alvin R. Lebeck and Daniel J. Sorin",
title = "Fractal Consistency: Architecting the Memory System to
Facilitate Verification",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "61--64",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.18",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "One of the most challenging problems in developing a
multicore processor is verifying that the design is
correct, and one of the most difficult aspects of
pre-silicon verification is verifying that the memory
system obeys the architecture's specified memory
consistency model. To simplify the process of
pre-silicon design verification, we propose a system
model called the Fractally Consistent Model (FCM). We
prove that systems that adhere to the FCM can be
verified to obey the memory consistency model in three
simple, scalable steps. The procedure for verifying FCM
systems contrasts sharply with the difficult,
non-scalable procedure required to verify non-FCM
systems. We show that FCM systems do not necessarily
sacrifice performance, compared to non-FCM systems,
despite being simpler to verify.",
acknowledgement = ack-nhfb,
affiliation = "Zhang, M (Reprint Author), Duke Univ, Dept Elect \&
Comp Engn, Durham, NC 27706 USA. Zhang, Meng; Sorin,
Daniel J., Duke Univ, Dept Elect \& Comp Engn, Durham,
NC 27706 USA. Lebeck, Alvin R., Duke Univ, Dept Comp
Sci, Durham, NC 27706 USA.",
da = "2019-06-20",
doc-delivery-number = "731BX",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [CCF-0702434,
CCF-0811290]",
funding-text = "This material is based upon work supported by the
National Science Foundation under grants CCF-0702434
and CCF-0811290.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Arithmetic and Logic Structures; Coherence;
Computational modeling; Computer architecture; Computer
Reliability; Fault-Tolerance; FCM systems; Formal
verification; fractal consistent model; Fractals;
Hardware; Memory; memory architecture; Memory
Consistency; memory consistency model; Memory
hierarchy; memory system architecture;
Micro-architecture implementation considerations;
microprocessor chips; Multicore; multicore processor;
multiprocessing systems; Performance Analysis and
Design Aids; presilicon verification; Processor
Architectures; Protocols; Testing; Validation;
Verification",
number-of-cited-references = "10",
oa = "Green Published",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Zhang:2010:FCA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2010:AIT,
author = "Anonymous",
title = "Advertisement --- {{\booktitle{IEEE Transactions on
Computers}}} Celebrates 60 Years",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "65--65",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.29",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:ICSb,
author = "Anonymous",
title = "2011 {IEEE Computer Society} Simulator Design
Competition",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "66--66",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.19",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:ASS,
author = "Anonymous",
title = "Advertisement --- Special Student Offer",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "67--67",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.30",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:ADY,
author = "Anonymous",
title = "Advertisement --- Distinguish Yourself With the
{CSDP}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "68--68",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.26",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:CPS,
author = "Anonymous",
title = "{Conference Proceedings Services (CPS)}
[advertisement]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "69--69",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.21",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:ICSc,
author = "Anonymous",
title = "{IEEE Computer Society} Jobs",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "70--70",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.28",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:ASC,
author = "Anonymous",
title = "Advertisement --- Stay Connected to the {IEEE Computer
Society}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "71--71",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.31",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:ACS,
author = "Anonymous",
title = "Advertisement --- {Computer Society Digital Library}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "72--72",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.20",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:EBCb,
author = "Anonymous",
title = "Editorial Board [Cover2]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "c2--c2",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.23",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:FCb,
author = "Anonymous",
title = "[{Front} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "c1--c1",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.22",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:IAb,
author = "Anonymous",
title = "Information for authors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "c3--c3",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.24",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:ICSd,
author = "Anonymous",
title = "{IEEE Computer Society} [Cover4]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "c4--c4",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.25",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Skadron:2011:ELE,
author = "K. Skadron",
title = "Editorial: Letter from the {Editor-in-Chief}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "1--3",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.13",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Skadron:2011:U,
author = "Kevin Skadron",
title = "Untitled",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "1--3",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.13",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
da = "2019-06-20",
doc-delivery-number = "773ZN",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
number-of-cited-references = "0",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Skadron:2011:U",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Vandierendonck:2011:FMM,
author = "Hans Vandierendonck and Andre Seznec",
title = "Fairness Metrics for Multi-Threaded Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "4--7",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Multi-threaded processors execute multiple threads
concurrently in order to increase overall throughput.
It is well documented that multi-threading affects
per-thread performance but, more importantly, some
threads are affected more than others. This is
especially troublesome for multi-programmed workloads.
Fairness metrics measure whether all threads are
affected equally. However defining equal treatment is
not straightforward. Several fairness metrics for
multi-threaded processors have been utilized in the
literature, although there does not seem to be a
consensus on what metric does the best job of measuring
fairness. This paper reviews the prevalent fairness
metrics and analyzes their main properties. Each metric
strikes a different trade-off between fairness in the
strict sense and throughput. We categorize the metrics
with respect to this property. Based on experimental
data for SMT processors, we suggest using the minimum
fairness metric in order to balance fairness and
throughput.",
acknowledgement = ack-nhfb,
affiliation = "Vandierendonck, H (Reprint Author), Univ Ghent, Dept
Elect \& Informat Syst, Ghent, Belgium. Vandierendonck,
Hans, Univ Ghent, Dept Elect \& Informat Syst, Ghent,
Belgium. Seznec, Andre, INRIA Rennes, Rennes, France.",
author-email = "hans.vandierendonck@elis.ugent.be
Andre.Seznec@inria.fr",
da = "2019-06-20",
doc-delivery-number = "773ZN",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Correlation; fairness; fairness metrics; Harmonic
analysis; Instruction sets; measurement; Measurement;
multi-programming; Multi-threaded processors;
multi-threading; multiprocessing systems;
multiprogrammed workloads; multithreaded processors;
Parallel Architectures; Performance of Systems;
quality-of-service; resource allocation; SMT
processors; software metrics; System-on-a-chip;
Throughput",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "13",
unique-id = "Vandierendonck:2011:FMM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Tang:2011:PEM,
author = "Jie Tang and Shaoshan Liu and Zhimin Gu and Chen Liu
and Jean-Luc Gaudiot",
title = "Prefetching in Embedded Mobile Systems Can Be
Energy-Efficient",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "8--11",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Data prefetching has been a successful technique in
high-performance computing platforms. However, the
conventional wisdom is that it significantly increases
energy consumption and is thus not suitable for embedded
mobile systems. On the other hand, as modern mobile
applications pose an increasing demand for high
performance, it becomes essential to implement
high-performance techniques, such as prefetching, in
these systems. In this paper, we study the impact of
prefetching on the performance and energy consumption
of embedded mobile systems. Contrary to the
conventional wisdom, our findings demonstrate that as
technology advances, prefetching can be
energy-efficient while improving performance.
Furthermore, we have developed a simple but effective
analytical model to help system designers identify
the conditions for energy efficiency.",
acknowledgement = ack-nhfb,
affiliation = "Tang, J (Reprint Author), Beijing Inst Technol,
Beijing 100081, Peoples R China. Tang, Jie; Gu, Zhimin,
Beijing Inst Technol, Beijing 100081, Peoples R China.
Liu, Shaoshan, Microsoft Corp, Redmond, WA 98052 USA.
Liu, Chen, Florida Int Univ, Miami, FL 33199 USA.
Gaudiot, Jean-Luc, Univ Calif Irvine, Irvine, CA USA.",
author-email = "tangjie.bit@gmail.com shaoliu@microsoft.com
zmgu@x263.net chen.liu@fiu.edu gaudiot@uci.edu",
da = "2019-06-20",
doc-delivery-number = "773ZN",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "data prefetching; embedded mobile systems; embedded
systems; energy consumption; energy efficiency
condition; energy-efficient prefetching;
high-performance computing platform; Low power
electronics; Low-power design; Memory management;
Memory Structures; mobile computing; Mobile computing;
Mobile Computing; storage management",
number-of-cited-references = "11",
ORCID-numbers = "Liu, Chen/0000-0003-1558-6836",
research-areas = "Computer Science",
times-cited = "19",
unique-id = "Tang:2011:PEM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Khan:2011:DDC,
author = "Omer Khan and Mieszko Lis and Yildiz Sinangil and
Srinivas Devadas",
title = "{DCC}: a Dependable Cache Coherence Multicore
Architecture",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "12--15",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Cache coherence lies at the core of
functionally-correct operation of shared memory
multicores. Traditional directory-based hardware
coherence protocols scale to large core counts, but
they incorporate complex logic and directories to track
coherence states. Technology scaling has reached
miniaturization levels where manufacturing
imperfections, device unreliability and occurrence of
hard errors pose a serious dependability challenge.
Broken or degraded functionality of the coherence
protocol can lead to a non-operational processor or
user visible performance loss. In this paper, we
propose a dependable cache coherence architecture (DCC)
that combines the traditional directory protocol with a
novel execution-migration-based architecture to ensure
dependability that is transparent to the programmer.
Our architecturally redundant execution migration
architecture only permits one copy of data to be cached
anywhere in the processor: when a thread accesses an
address not locally cached on the core it is executing
on, it migrates to the appropriate core and continues
execution there. Both coherence mechanisms can co-exist
in the DCC architecture and we present architectural
extensions to seamlessly transition between the
directory and execution migration protocols.",
acknowledgement = ack-nhfb,
affiliation = "Khan, O (Reprint Author), MIT, 77 Massachusetts Ave,
Cambridge, MA 02139 USA. Khan, Omer; Lis, Mieszko;
Sinangil, Yildiz; Devadas, Srinivas, MIT, Cambridge, MA
02139 USA. Khan, Omer, Univ Massachusetts, Lowell, MA
USA.",
da = "2019-06-20",
doc-delivery-number = "773ZN",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architecturally redundant execution migration
architecture; B.3.4 Reliability, Testing, and
Fault-Tolerance; B.8 Performance and Reliability;
broken functionality; C.4.b Fault tolerance; cache
coherence; cache storage; Coherence; coherence
mechanisms; coherence states; DCC architecture;
degraded functionality; dependability challenge;
Dependable architecture; dependable cache coherence
architecture; dependable cache coherence multicore
architecture; device unreliability; directory protocol;
directory-based hardware coherence protocols;
execution-migration-based architecture;
functionally-correct operation; Hardware; incorporate
complex logic; Instruction sets; large core counts;
manufacturing imperfections; memory architecture;
memory protocols; microprocessor chips; miniaturization
levels; Multicore processing; multicores;
nonoperational processor; Protocols; shared memory
multicores; shared memory systems; System-on-a-chip;
technology scaling; user visible performance loss",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "3",
unique-id = "Khan:2011:DDC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Rosenfeld:2011:DCA,
author = "Paul Rosenfeld and Elliott Cooper-Balis and Bruce
Jacob",
title = "{DRAMSim2}: a Cycle Accurate Memory System Simulator",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "16--19",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In this paper we present DRAMSim2, a cycle accurate
memory system simulator. The goal of DRAMSim2 is to be
an accurate and publicly available DDR2/3 memory system
model which can be used in both full system and
trace-based simulations. We describe the process of
validating DRAMSim2 timing against manufacturer Verilog
models in an effort to prove the accuracy of simulation
results. We outline the combination of DRAMSim2 with a
cycle-accurate x86 simulator that can be used to
perform full system simulations. Finally, we discuss
DRAMVis, a visualization tool that can be used to graph
and compare the results of DRAMSim2 simulations.",
acknowledgement = ack-nhfb,
affiliation = "Rosenfeld, P (Reprint Author), Univ Maryland, Dept
Elect \& Comp Engn, College Pk, MD 20742 USA.
Rosenfeld, Paul; Cooper-Balis, Elliott; Jacob, Bruce,
Univ Maryland, Dept Elect \& Comp Engn, College Pk, MD
20742 USA.",
author-email = "prosenf1@umd.edu ecc17@umd.edu blj@umd.edu",
da = "2019-06-20",
doc-delivery-number = "773ZN",
eissn = "1556-6064",
esi-highly-cited-paper = "Y",
esi-hot-paper = "N",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computational modeling; cycle accurate memory system
simulator; DDR2/3 memory system model; DRAM; DRAM
chips; DRAMSim2 simulation; DRAMSim2 timing; Driver
circuits; Hardware design languages; Load modeling;
memory architecture; memory cards; Object oriented
modeling; Primary memory; Random access memory;
Simulation; Timing; trace-based simulation; Verilog
model; visualization tool",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "270",
unique-id = "Rosenfeld:2011:DCA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Gou:2011:ESH,
author = "Chunyang Gou and Georgi N. Gaydadjiev",
title = "Exploiting {SPMD} Horizontal Locality",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "20--23",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In this paper, we analyze a particular spatial
locality case (called horizontal locality) inherent to
manycore accelerator architectures employing barrel
execution of SPMD kernels, such as GPUs. We then
propose an adaptive memory access granularity framework
to exploit and enforce the horizontal locality in order
to reduce the interference among accelerator cores'
memory accesses and hence improve DRAM efficiency. With
the proposed technique, DRAM efficiency grows by 1.42X
on average, resulting in 12.3\% overall performance
gain, for a set of representative memory intensive
GPGPU applications.",
acknowledgement = ack-nhfb,
affiliation = "Gou, C (Reprint Author), Delft Univ Technol, NL-2600
AA Delft, Netherlands. Gou, Chunyang; Gaydadjiev,
Georgi N., Delft Univ Technol, NL-2600 AA Delft,
Netherlands.",
author-email = "c.gou@tudelft.nl g.n.gaydadjiev@tudelft.nl",
da = "2019-06-20",
doc-delivery-number = "773ZN",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "accelerator core memory access; adaptive memory access
granularity; Bandwidth; barrel execution; DRAM chips;
DRAM efficiency; GPU; Graphics processing unit;
Instruction sets; interference; Kernel; manycore
accelerator architecture; Memory hierarchy;
microprocessor chips; Multi-core/single-chip
multiprocessors; parallel architectures; Pipelines;
Proposals; Random access memory; SIMD processors;
single program multiple data; spatial locality; SPMD
horizontal locality; SPMD kernel",
number-of-cited-references = "13",
ORCID-numbers = "Gaydadjiev, Georgi/0000-0002-3678-7007",
research-areas = "Computer Science",
researcherid-numbers = "Gaydadjiev, Georgi/F-1488-2010",
times-cited = "1",
unique-id = "Gou:2011:ESH",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Wang:2011:GGC,
author = "Xiaoqun Wang and Zhenzhou Ji and Chen Fu and Mingzeng
Hu",
title = "{GCMS}: a Global Contention Management Scheme in
Hardware Transactional Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "24--27",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Hardware Transactional Memory (HTM) is a promising
Transactional Memory (TM) implementation because of its
strong atomicity and high performance. Unfortunately,
most contention management approaches in HTMs are
dedicated to specific transaction conflict scenarios
and it is hard to choose a universal strategy for
different workloads. In addition, HTM performance
degrades sharply when there are severe transaction
conflicts. In this paper, we present a Global
Contention Management Scheme (GCMS) to resolve severe
transaction conflicts in HTMs. Our scheme depends on a
Deadlock and Livelock Detection Mechanism (DLDM) and a
Global Contention Manager (GCM) to resolve severe
transaction conflicts. This scheme is orthogonal to the
rest of the contention management policies. We have
incorporated GCMS into different HTMs and compared the
performance of the enhanced systems with that of the
original HTMs using the STAMP benchmark suite. The
results demonstrate that the performance of the
enhanced HTMs is improved.",
acknowledgement = ack-nhfb,
affiliation = "Wang, XQ (Reprint Author), Harbin Inst Technol, Sch
Comp Sci, Harbin 150006, Peoples R China. Wang,
Xiaoqun; Ji, Zhenzhou; Fu, Chen; Hu, Mingzeng, Harbin
Inst Technol, Sch Comp Sci, Harbin 150006, Peoples R
China.",
author-email = "wxiaoqun@gmail.com",
da = "2019-06-20",
doc-delivery-number = "773ZN",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bioinformatics; Concurrent Programming; Contention
Management; deadlock-and-livelock detection mechanism;
GCMS scheme; Genomics; global contention management
scheme; global contention manager; Hardware; Hardware
Transactional Memory; hardware transactional memory;
Multi-core/single-chip multiprocessors; Multicore
Processors; Parallel Programming; Program processors;
Radiation detectors; storage management; System
recovery; transaction conflict; transaction
processing",
number-of-cited-references = "14",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Wang:2011:GGC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2011:RL,
author = "Anonymous",
title = "2010 Reviewers List",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "28--28",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "IEEE publishing",
}
@Article{Anonymous:2011:AI,
author = "Anonymous",
title = "2010 Annual Index",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "??--??",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2011:Ca,
author = "Anonymous",
title = "Cover 2",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "c2--c2",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.10",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2011:Cb,
author = "Anonymous",
title = "Cover 3",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "c3--c3",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.11",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2011:Cc,
author = "Anonymous",
title = "Cover 4",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "c4--c4",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.12",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2011:FCa,
author = "Anonymous",
title = "[{Front} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "c1--c1",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Mars:2011:HHW,
author = "Jason Mars and Lingjia Tang and Robert Hundt",
title = "Heterogeneity in {``Homogeneous''} Warehouse-Scale
Computers: a Performance Opportunity",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "29--32",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.14",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The class of modern datacenters recently coined as
``warehouse scale computers'' (WSCs) has traditionally
been embraced as homogeneous computing platforms.
However, due to frequent machine replacements and
upgrades, modern WSCs are in fact composed of diverse
commodity microarchitectures and machine
configurations. Yet, current WSCs are designed with an
assumption of homogeneity, leaving a potentially
significant performance opportunity unexplored. In this
paper, we investigate the key factors impacting the
available heterogeneity in modern WSCs, and the benefit
of exploiting this heterogeneity to maximize overall
performance. We also introduce a new metric,
opportunity factor, which can be used to quantify an
application's sensitivity to the heterogeneity in a
given WSC. For applications that are sensitive to
heterogeneity, we observe a performance improvement of
up to 70\% when employing our approach. In a WSC
composed of state-of-the-art machines, we can improve
the overall performance of the entire datacenter by
16\% over the status quo.",
acknowledgement = ack-nhfb,
affiliation = "Mars, J (Reprint Author), Univ Virginia,
Charlottesville, VA 22903 USA. Mars, Jason; Tang,
Lingjia, Univ Virginia, Charlottesville, VA 22903
USA.",
author-email = "jom5x@cs.virginia.edu lt8f@cs.virginia.edu
rhundt@google.com",
da = "2019-06-20",
doc-delivery-number = "855NW",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Computer architecture; computer
centres; datacenters; Design studies; Distributed
architectures; diverse commodity microarchitectures;
Heterogeneous (hybrid) systems; homogeneous
warehouse-scale computers; integration and modeling;
machine configurations; mainframes; Microarchitecture;
Optimization; Scheduling and task partitioning; Super
(very large) computers; System architectures",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "22",
unique-id = "Mars:2011:HHW",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Michelogiannakis:2011:PCE,
author = "George Michelogiannakis and Nan Jiang and Daniel U.
Becker and William J. Dally",
title = "Packet Chaining: Efficient Single-Cycle Allocation for
On-Chip Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "33--36",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.15",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper introduces packet chaining, a simple and
effective method to increase allocator matching
efficiency and hence network performance, particularly
suited to networks with short packets and short cycle
times. Packet chaining operates by chaining packets
destined to the same output together, to reuse the
switch connection of a departing packet. This allows an
allocator to build up an efficient matching over a
number of cycles, like incremental allocation, but not
limited by packet length. For a 64-node 2D mesh at
maximum injection rate and with single-flit packets,
packet chaining increases network throughput by 15\%
compared to a conventional single-iteration separable
iSLIP allocator, outperforms a wavefront allocator, and
gives comparable throughput with an augmenting paths
allocator. Packet chaining achieves this performance
with a cycle time comparable to a single-iteration
separable allocator. Packet chaining also reduces
average network latency by 22.5\% compared to iSLIP.
Finally, packet chaining increases IPC up to 46\% (16\%
average) for application benchmarks because short
packets are critical in a typical cache-coherent CMP.
These are considerable improvements given the maturity
of network-on-chip routers and allocators.",
acknowledgement = ack-nhfb,
affiliation = "Michelogiannakis, G (Reprint Author), Stanford Univ,
Stanford, CA 94305 USA. Michelogiannakis, George;
Jiang, Nan; Becker, Daniel U.; Dally, William J.,
Stanford Univ, Stanford, CA 94305 USA.",
author-email = "mihelog@stanford.edu njiang37@stanford.edu
dub@stanford.edu dally@stanford.edu",
da = "2019-06-20",
doc-delivery-number = "855NW",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [CCF-0702341];
National Security Agency [H98230-08-C-0272-P007];
Robert Bosch Fellowship; Prof. Michael Farmwald
Fellowship; Prof. Michael J. Flynn Stanford Graduate
Fellowship",
funding-text = "This work was supported in part by the National
Science Foundation under Grant CCF-0702341, in part by
the National Security Agency under Contract
H98230-08-C-0272-P007 and in part by the Robert Bosch,
Prof. Michael Farmwald and Prof. Michael J. Flynn
Stanford Graduate Fellowships.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "allocator matching efficiency; Benchmark testing;
Interconnection architectures; network performance;
network-on-chip; network-on-chip routers; On-chip
interconnection networks; on-chip networks; packet
chaining; Resource management; single-iteration
separable iSLIP allocator; System-on-a-chip;
Throughput",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Michelogiannakis:2011:PCE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Ho:2011:EIB,
author = "Chen-Han Ho and Garret Staus and Aaron Ulmer and
Karthikeyan Sankaralingam",
title = "Exploring the Interaction Between Device Lifetime
Reliability and Security Vulnerabilities",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "37--40",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.16",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "As technology scales, device reliability is becoming a
fundamental problem. Even though manufacturing test can
guarantee product quality, due to various types of
wear-out and failure modes, permanent faults appearing
in the field are becoming an increasingly important and
real problem. Such wear-out creates permanent faults in
devices during their lifetime, after release to the
user. In this paper, we perform a formal
investigation of the impact of permanent faults on
security, examine empirical evidence, and demonstrate a
real attack. Our results show that permanent stuck-at
faults may leave security holes in microprocessors. We
show that an adversary with knowledge of a fault can
launch attacks which can obtain critical secrets such
as a private key in 30 seconds.",
acknowledgement = ack-nhfb,
affiliation = "Ho, CH (Reprint Author), Univ Wisconsin, Madison, WI
53706 USA. Ho, Chen-Han; Staus, Garret; Ulmer, Aaron;
Sankaralingam, Karthikeyan, Univ Wisconsin, Madison, WI
53706 USA.",
da = "2019-06-20",
doc-delivery-number = "855NW",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Arithmetic and Logic Structures; Circuit faults;
Computer bugs; Control Structures and Microprogramming;
Cryptography; device lifetime reliability; failure
mode; fault tolerant computing; Hardware reliability;
Logic programming; microprocessor chips;
microprocessors; Permanent Fault; permanent fault;
private key; product quality; Program processors;
public key cryptography; Reliability; Reliability
engineering; Security; security vulnerability; wear-out
type; wearout mode",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Ho:2011:EIB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Hernandez:2011:FTV,
author = "Carles Hernandez and Antoni Roca and Jose Flich and
Federico Silla and Jose Duato",
title = "Fault-Tolerant Vertical Link Design for Effective {3D}
Stacking",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "41--44",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.17",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Recently, 3D stacking has been proposed to alleviate
the memory bandwidth limitation arising in chip
multiprocessors (CMPs). As the number of integrated
cores in the chip increases the access to external
memory becomes the bottleneck, thus demanding larger
memory amounts inside the chip. The most accepted
solution to implement vertical links between stacked
dies is by using Through Silicon Vias (TSVs). However,
TSVs are exposed to misalignment and random defects
compromising the yield of the manufactured 3D chip. A
common solution to this problem is over-provisioning,
which impacts area and cost. In this paper, we propose a
fault-tolerant vertical link design. With its adoption,
fault-tolerant vertical links can be implemented in a 3D
chip design at low cost without the need to add
redundant TSVs (no over-provisioning). Preliminary
results are very promising, as the fault-tolerant
vertical link design increases switch area by only
6.69\% while the achieved interconnect yield approaches
100\%.",
acknowledgement = ack-nhfb,
affiliation = "Hernandez, C (Reprint Author), Univ Politecn Valencia,
C Cami de Vera S-N, Valencia 46022, Spain. Hernandez,
Carles; Roca, Antoni; Flich, Jose; Silla, Federico;
Duato, Jose, Univ Politecn Valencia, Valencia 46022,
Spain.",
author-email = "carherlu@gap.upv.es",
da = "2019-06-20",
doc-delivery-number = "855NW",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Spanish MEC; MICINN; European Commission
[CSD2006-00046, TIN2009-14475-C04]; NaNoC [248972]",
funding-text = "This work was supported by the Spanish MEC and MICINN,
as well as European Commission FEDER funds, under
Grants CSD2006-00046 and TIN2009-14475-C04. It was also
partly supported by the project NaNoC (project label
248972) which is funded by the European Commission
within the Research Programme FP7.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "3D chip; 3D Stacking; 3D stacking; chip
multiprocessors; CMP; effective 3D stacking; external
memory; Fault Tolerance; fault tolerance; Fault
Tolerance; Fault tolerant systems; fault-tolerant
vertical link design; memory bandwidth limitation;
Memory management; microprocessor chips;
network-on-chip; NoC; Stacking; storage management
chips; Three dimensional displays; three-dimensional
integrated circuits; through silicon vias; TSV",
number-of-cited-references = "20",
oa = "Green Published",
ORCID-numbers = "Silla, Federico/0000-0002-6435-1200 Hernandez,
Carles/0000-0001-5393-3195",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Hernandez:2011:FTV",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Choi:2011:EID,
author = "Inseok Choi and Minshu Zhao and Xu Yang and Donald
Yeung",
title = "Experience with Improving Distributed Shared Cache
Performance on {Tilera}'s {Tile} Processor",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "45--48",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.18",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper describes our experience with profiling and
optimizing physical locality for the distributed shared
cache (DSC) in Tilera's Tile multicore processor. Our
approach uses the Tile Processor's hardware performance
measurement counters (PMCs) to acquire page-level
access pattern profiles. A key problem we address is
imprecise PMC interrupts. Our profiling tools use
binary analysis to correct for interrupt ``skid'', thus
pinpointing individual memory operations that incur
remote DSC slice references and permitting us to sample
their access patterns. We use our access pattern
profiles to drive page homing optimizations for both
heap and static data objects. Our experiments show we
can improve physical locality for 5 out of 11 SPLASH2
benchmarks running on 32 cores, enabling 32.9\%-77.9\%
of DSC references to target the local DSC slice. To our
knowledge, this is the first work to demonstrate page
homing optimizations on a real system.",
acknowledgement = ack-nhfb,
affiliation = "Choi, I (Reprint Author), Univ Maryland, Dept Elect \&
Comp Engn, College Pk, MD 20742 USA. Choi, Inseok;
Zhao, Minshu; Yang, Xu; Yeung, Donald, Univ Maryland,
Dept Elect \& Comp Engn, College Pk, MD 20742 USA.",
author-email = "inseok@umd.edu mszhao@umd.edu yangxu@umd.edu
yeung@umd.edu",
da = "2019-06-20",
doc-delivery-number = "855NW",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; binary analysis; cache storage;
Computer architecture; Data streams; Design
methodology; Design studies; distributed shared cache
performance; hardware performance measurement counters;
microprocessor chips; Multi-core/single-chip
multiprocessors; Multicore processing; Multiple Data
Stream Architectures (Multiprocessors); multiprocessing
systems; Multiprocessing systems; page homing
optimization; page-level access pattern profile; PMC
interrupt; profiling tool; Tilera tile multicore
processor",
number-of-cited-references = "19",
research-areas = "Computer Science",
times-cited = "5",
unique-id = "Choi:2011:EID",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Prieto:2011:MCM,
author = "Pablo Prieto and Valentin Puente and Jose-Angel
Gregorio",
title = "Multilevel Cache Modeling for Chip-Multiprocessor
Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "49--52",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.20",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper presents a simple analytical model for
predicting on-chip cache hierarchy effectiveness in
chip multiprocessors (CMP) for a state-of-the-art
architecture. Given the complexity of this type of
systems, we use rough approximations, such as the
empirical observation that the re-reference timing
pattern follows a power law and the assumption of a
simplistic delay model for the cache, in order to
provide a useful model for the memory hierarchy
responsiveness. This model enables the analytical
determination of average access time, which makes
design space pruning useful before sweeping the vast
design space of this class of systems. The model is
also useful for predicting cache hierarchy behavior in
future systems. The fidelity of the model has been
validated using a state-of-the-art, full-system
simulation environment, on a system with up to sixteen
out-of-order processors with cache-coherent caches and
using a broad spectrum of applications, including
complex multithread workloads. This simple model can
predict a near-to-optimal, on-chip cache distribution
while also estimating how future systems running future
applications might behave.",
acknowledgement = ack-nhfb,
affiliation = "Prieto, P (Reprint Author), Univ Cantabria, Cantabria,
Spain. Prieto, Pablo; Puente, Valentin; Gregorio,
Jose-Angel, Univ Cantabria, Cantabria, Spain.",
author-email = "prietop@unican.es vpuente@unican.es
monaster@unican.es",
da = "2019-06-20",
doc-delivery-number = "855NW",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Spanish Ministry of Science and Innovation
[TIN2010-18159]; HiPEAC2 European Network of
Excellence",
funding-text = "This work has been supported by the Spanish Ministry
of Science and Innovation, under contracts
TIN2010-18159, and by the HiPEAC2 European Network of
Excellence. The authors would like to thank the
reviewers for their valuable comments.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "approximation theory; cache hierarchy behavior
prediction; cache storage; Cache storage;
cache-coherent caches; chip-multiprocessor systems;
complex multithread workloads; Complexity theory;
Computational modeling; design space; integrated
circuit design; Memory hierarchy; memory hierarchy
responsiveness; microprocessor chips;
Multi-core/single-chip multiprocessors; multilevel
cache modeling; multiprocessing systems;
Multiprocessing systems; near-to-optimal on-chip cache
distribution; on-chip cache hierarchy effectiveness
prediction; power law; re-reference timing pattern;
rough approximations; simplistic delay model
assumption; Software tools; Thermal analysis; Thermal
sensors",
number-of-cited-references = "13",
ORCID-numbers = "Prieto, Pablo/0000-0002-5818-1188 Puente,
Valentin/0000-0002-6904-3282 Gregorio, Jose
Angel/0000-0003-2214-303X",
research-areas = "Computer Science",
times-cited = "3",
unique-id = "Prieto:2011:MCM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Siozios:2011:SRT,
author = "Kostas Siozios and Dimitrios Rodopoulos and Dimitrios
Soudris",
title = "On Supporting Rapid Thermal Analysis",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "53--56",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.19",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Detailed thermal analysis is usually performed
exclusively at design time since it is a
computationally intensive task. In this paper, we
introduce a novel methodology for fast, yet accurate,
thermal analysis. The introduced methodology is
software supported by a new open source tool that
enables hierarchical thermal analysis with adaptive
levels of granularity. Experimental results prove the
efficiency of our approach, since it leads to an average
reduction of the execution overhead of up to 70\% with a
penalty in accuracy ranging between 2\% and 8\%.",
acknowledgement = ack-nhfb,
affiliation = "Siozios, K (Reprint Author), Natl Tech Univ Athens,
Sch ECE, GR-10682 Athens, Greece. Siozios, Kostas;
Rodopoulos, Dimitrios; Soudris, Dimitrios, Natl Tech
Univ Athens, Sch ECE, GR-10682 Athens, Greece.",
da = "2019-06-20",
doc-delivery-number = "855NW",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Complexity theory; Computational modeling; Computer
Systems Organization; Design Methodologies; General;
Hardware; hierarchical thermal analysis; Modeling
techniques; Monitoring; open source tool; Performance
of Systems; Power Management; public domain software;
rapid thermal analysis; Reconfigurable Hardware;
Reconfigurable hardware; Reliability; software
engineering; software supported; Software tools;
thermal analysis; Thermal analysis; Thermal Monitoring;
Thermal sensors",
number-of-cited-references = "8",
ORCID-numbers = "Siozios, Kostas/0000-0002-0285-2202 Soudris,
Dimitrios/0000-0002-6930-6847",
research-areas = "Computer Science",
researcherid-numbers = "Soudris, Dimitrios/I-5252-2014 Siozios,
Kostas/F-9726-2011 Soudris, Dimitrios/O-8843-2019",
times-cited = "3",
unique-id = "Siozios:2011:SRT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2011:Cd,
author = "Anonymous",
title = "Cover 3",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "c3--c3",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.30",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2011:FCb,
author = "Anonymous",
title = "[{Front} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "c1--c1",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.28",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2011:ICS,
author = "Anonymous",
title = "{IEEE Computer Society} [society information]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "c4--c4",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.31",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2011:PI,
author = "Anonymous",
title = "Publication information",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "c2--c2",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.29",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Sethumadhavan:2012:CHD,
author = "Simha Sethumadhavan and Ryan Roberts and Yannis
Tsividis",
title = "A Case for Hybrid Discrete-Continuous Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.22",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Current technology trends indicate that power- and
energy-efficiency will limit chip throughput in the
future. Current solutions to these problems, whether
programmable or fixed-function digital accelerators,
will soon reach their limits as
microarchitectural overheads are successively trimmed.
A significant departure from current computing methods
is required to carry forward computing advances beyond
digital accelerators. In this paper we describe how the
energy-efficiency of a large class of problems can be
improved by employing a hybrid of the discrete and
continuous models of computation instead of the
ubiquitous, traditional discrete model of computation.
We present preliminary analysis of domains and
benchmarks that can be accelerated with the new model.
Analysis shows that machine learning, physics, and up to
one-third of the SPEC, RMS, and Berkeley application
suites can be accelerated with the new hybrid
model.",
acknowledgement = ack-nhfb,
affiliation = "Sethumadhavan, S (Reprint Author), Columbia Univ, New
York, NY 10027 USA. Sethumadhavan, Simha; Roberts,
Ryan; Tsividis, Yannis, Columbia Univ, New York, NY
10027 USA.",
author-email = "simha@cs.columbia.edu",
da = "2019-06-20",
doc-delivery-number = "953VM",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "DARPA; AFRL [FA8750-10-2-0253,
FA9950-09-1-0389]; NSF",
funding-text = "Sethumadhavan's research is funded by grants from
DARPA, AFRL (FA8750-10-2-0253, FA9950-09-1-0389), the
NSF CAREER program, gifts from Microsoft Research and
Columbia University, and software donations from
Synopsys and Wind River. Roberts conducted this
research as a GRA in Sethumadhavan's Lab.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Approximation algorithms; Benchmark testing; Berkeley
suite; Computational modeling; Computer architecture;
computer architecture; computing methods; continuous
models;
cryptography; Design studies; Differential equations;
discrete model; discrete models; domains analysis;
energy conservation; energy-efficiency; fixed-function
digital accelerators; forward computing advances;
hybrid discrete-continuous architectures; Hybrid
systems; machine learning; Mathematical model;
microarchitectural overheads; microprocessor chips;
power-efficiency; Processor architectures; RMS; SPEC;
Very large scale integration",
number-of-cited-references = "16",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Sethumadhavan:2012:CHD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kong:2012:ASF,
author = "Ji Kong and Peilin Liu and Yu Zhang",
title = "Atomic Streaming: a Framework of On-Chip Data Supply
System for Task-Parallel {MPSoCs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.21",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "State of the art fabrication technology for
integrating numerous hardware resources such as
Processors/DSPs and memory arrays into a single chip
enables the emergence of Multiprocessor System-on-Chip
(MPSoC). The stream programming paradigm on MPSoCs is
highly efficient for single-functionality scenarios due
to its dedicated and predictable data supply system.
However, when memory traffic is heavily shared among
parallel tasks in applications with multiple
interrelated functionalities, performance suffers from
task interference and shared memory congestion, which
lead to poor parallel speedups and memory bandwidth
utilization. This paper proposes a stream-processing
based framework for the on-chip data supply system of
task-parallel MPSoCs. In this framework, stream address
generation and data computation are decoupled and
parallelized to allow full utilization of on-chip
resources. Task
granularities are dynamically tuned to jointly optimize
the overall application performance. Experiments show
that the proposed framework, together with the tuning
scheme, is effective for joint optimization in task-parallel
MPSoCs.",
acknowledgement = ack-nhfb,
affiliation = "Kong, J (Reprint Author), Shanghai Jiao Tong Univ, Sch
Elect Informat \& Elect Engn, Shanghai 200030, Peoples
R China. Kong, Ji; Liu, Peilin, Shanghai Jiao Tong
Univ, Sch Elect Informat \& Elect Engn, Shanghai
200030, Peoples R China.",
author-email = "johnhophen@sjtu.edu.cn liupeilin@sjtu.edu.cn
zhyu@cn.ibm.com",
da = "2019-06-20",
doc-delivery-number = "953VM",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "IBM Research-China under the IBM",
funding-text = "This work has been partially supported by IBM
Research-China under the IBM Ph.D. Fellowship program
for the 2010-2011 academic year.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application studies resulting in better
multiple-processor systems; atomic streaming;
Bandwidth; data computations; Memory hierarchy;
Multi-core/single-chip multiprocessors; Multicore
processing; Multiple Data Stream Architectures
(Multiprocessors); Multiprocessing systems;
multiprocessor system-on-chip; on-chip data supply
system; Prefetching; shared memory congestions; shared
memory systems; stream address generations; stream
programming paradigm; Streaming media;
System-on-a-chip; system-on-chip; task interferences;
task-parallel MPSoC; Throughput",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Kong:2012:ASF",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Deb:2012:HSC,
author = "Abhishek Deb and Josep Maria Codina and Antonio
Gonzalez",
title = "A {HW\slash SW} Co-designed Programmable Functional
Unit",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.23",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "In this paper, we propose a novel programmable
functional unit (PFU) to accelerate general purpose
application execution on a modern out-of-order x86
processor. Code is transformed and instructions are
generated that run on the PFU using a co-designed
virtual machine (Cd-VM). Results presented in this
paper show that this HW/SW co-designed approach
produces average performance speedups of 29\% in
SPECFP and 19\% in SPECINT, and up to 55\%, over a
modern out-of-order processor.",
acknowledgement = ack-nhfb,
affiliation = "Deb, A (Reprint Author), Univ Politecn Cataluna, C
Jordi Girona 1-3, Barcelona, Spain. Deb, Abhishek;
Gonzalez, Antonio, Univ Politecn Cataluna, Barcelona,
Spain. Maria Codina, Josep; Gonzalez, Antonio, Intel
Res Labs Barcelona, Barcelona, Spain.",
author-email = "abhishek@ac.upc.edu josep.m.codina@intel.com
antonio@intel.com",
da = "2019-06-20",
doc-delivery-number = "953VM",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; hardware-software codesign;
Hardware/software interfaces; hw/sw co-designed;
Interface states; Load modeling; Micro-architecture
implementation considerations; Microarchitecture;
Processor Architectures; programmable functional unit;
Programmable functional units; Registers; virtual
machine",
number-of-cited-references = "13",
ORCID-numbers = "Gonzalez, Antonio/0000-0002-0009-0996",
research-areas = "Computer Science",
researcherid-numbers = "Gonzalez, Antonio/I-2961-2014",
times-cited = "0",
unique-id = "Deb:2012:HSC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Piscitelli:2012:HLP,
author = "Roberta Piscitelli and Andy D. Pimentel",
title = "A High-Level Power Model for {MPSoC} on {FPGA}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "13--16",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.24",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper presents a framework for high-level power
estimation of multiprocessor systems-on-chip (MPSoC)
architectures on FPGA. The technique is based on
abstract execution profiles, called event signatures.
As a result, it is capable of achieving good evaluation
performance, thereby making the technique highly useful
in the context of early system-level design space
exploration. We have integrated the power estimation
technique in a system-level MPSoC synthesis framework.
Using this framework, we have designed a range of
different candidate MPSoC architectures and compared
our power estimation results to those from real
measurements on a Virtex-6 FPGA board.",
acknowledgement = ack-nhfb,
affiliation = "Piscitelli, R (Reprint Author), Univ Amsterdam, Inst
Informat, NL-1012 WX Amsterdam, Netherlands.
Piscitelli, Roberta; Pimentel, Andy D., Univ Amsterdam,
Inst Informat, NL-1012 WX Amsterdam, Netherlands.",
author-email = "r.piscitelli@uva.nl a.d.pimentel@uva.nl",
da = "2019-06-20",
doc-delivery-number = "953VM",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "MADNESS STREP",
funding-text = "This work has been partially supported by the MADNESS
STREP-FP7 European Project.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "abstract execution profiles; Computational modeling;
Computer architecture; Estimation; event signatures;
Field programmable gate arrays; field programmable gate
arrays; Formal models;
High-level power estimation; high-level power
estimation framework; high-level power model;
integrated circuit design; Mathematical model;
Microprocessors; MPSoC on FPGA; multiprocessing
systems; multiprocessor systems-on-chip architectures;
Performance Analysis and Design Aids; performance
evaluation; power aware computing; Power demand; power
estimation technique; Simulation; system-level design
space exploration; system-level MPSoC design space
exploration; system-level MPSoC synthesis framework;
system-on-chip; Virtex-6 FPGA board",
number-of-cited-references = "15",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Piscitelli:2012:HLP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Finlayson:2012:OSP,
author = "Ian Finlayson and Gang-Ryung Uh and David Whalley and
Gary Tyson",
title = "An Overview of Static Pipelining",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.26",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "A new generation of mobile applications requires
reduced energy consumption without sacrificing
execution performance. In this paper, we propose to
respond to these conflicting demands with an innovative
statically pipelined processor supported by an
optimizing compiler. The central idea of the approach
is that the control during each cycle for each portion
of the processor is explicitly represented in each
instruction. Thus the pipelining is in effect
statically determined by the compiler. The benefits of
this approach include simpler hardware and the ability
of the compiler to perform optimizations that are
not possible on traditional architectures. The initial
results indicate that static pipelining can
significantly reduce power consumption without
adversely affecting performance.",
acknowledgement = ack-nhfb,
affiliation = "Finlayson, I (Reprint Author), Florida State Univ,
Dept Comp Sci, Tallahassee, FL 32306 USA. Finlayson,
Ian; Whalley, David; Tyson, Gary, Florida State Univ,
Dept Comp Sci, Tallahassee, FL 32306 USA. Uh,
Gang-Ryung, Boise State Univ, Dept Comp Sci, Boise, ID
83725 USA.",
author-email = "finlayso@cs.fsu.edu uh@cs.boisestate.edu
whalley@cs.fsu.edu tyson@cs.fsu.edu",
da = "2019-06-20",
doc-delivery-number = "953VM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CNS-0964413, CNS-0915926]",
funding-text = "We thank the anonymous reviewers for their
constructive comments and suggestions. This research
was supported in part by NSF grants CNS-0964413 and
CNS-0915926.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Computer architecture; Energy
consumption; energy consumption reduction; execution
performance; General; mobile applications; optimising
compilers; Optimization; optimizing compiler; Pipeline
processing; pipeline processing; Pipeline processors;
power aware computing; Radio frequency; Registers;
statically pipelined processor",
number-of-cited-references = "14",
oa = "Green Published",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Finlayson:2012:OSP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Wu:2012:CID,
author = "Lisa Wu and Martha A. Kim and Stephen A. Edwards",
title = "Cache Impacts of Datatype Acceleration",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "21--24",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.25",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Hardware acceleration is a widely accepted solution
for performance and energy efficient computation
because it removes unnecessary hardware for general
computation while delivering exceptional performance
via specialized control paths and execution units. The
spectrum of accelerators available today ranges from
coarse-grain off-load engines such as GPUs to
fine-grain instruction set extensions such as SSE. This
research explores the benefits and challenges of
managing memory at the data-structure level and
exposing those operations directly to the ISA. We call
these instructions Abstract Datatype Instructions
(ADIs). This paper quantifies the performance and
energy impact of ADIs on the instruction and data cache
hierarchies. For instruction fetch, our measurements
indicate that ADIs can result in 21-48\% and 16-27\%
reductions in instruction fetch time and energy
respectively. For data delivery, we observe a 22-40\%
reduction in total data read/write time and 9-30\% in
total data read/write energy.",
acknowledgement = ack-nhfb,
affiliation = "Wu, L (Reprint Author), Columbia Univ, Dept Comp Sci,
New York, NY 10027 USA. Wu, Lisa; Kim, Martha A.;
Edwards, Stephen A., Columbia Univ, Dept Comp Sci, New
York, NY 10027 USA.",
author-email = "lisa@cs.columbia.edu martha@cs.columbia.edu
sedwards@cs.columbia.edu",
da = "2019-06-20",
doc-delivery-number = "953VM",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "abstract data types; abstract datatype instruction;
Accelerators; ADI; cache hierarchy; Cache Hierarchy;
cache hierarchy; Cache memories; cache storage; coarse
grain off-load engine; data read-write energy; data
structure level; Data Structures; energy conservation;
energy efficient computation; energy impact; execution
unit; fine grain instruction set extension; hardware
acceleration; Hardware acceleration; hardware
acceleration; Hardware/software interfaces; Instruction
fetch; instruction fetch energy; instruction fetch
time; Instruction Set Extensions; instruction sets;
ISA; Memory hierarchy; memory management; Memory
Structures; Multicore processing; power aware
computing; Program processors; Support vector machines;
Vectors",
number-of-cited-references = "15",
ORCID-numbers = "Edwards, Stephen/0000-0003-2609-4861",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Wu:2012:CID",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2012:RL,
author = "Anonymous",
title = "2011 Reviewers List",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "25--26",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.12",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Lists the reviewers who contributed to IEEE Computer
Architecture Letters in 2011.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "IEEE publishing",
}
@Article{Anonymous:2012:TNQ,
author = "Anonymous",
title = "There now is a quick and easy way to find out about
our collection of {{\booktitle{Transactions}}}
[Advertisement]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "26--26",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.19",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Advertisement: Visit http://www.computer.org/whats-new
today!",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2012:ACP,
author = "Anonymous",
title = "Advertisement --- {Conference Publishing Services
(CPS)}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "28--28",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.13",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "IEEE Conference Publishing Services (CPS)
advertisement.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2012:AI,
author = "Anonymous",
title = "2011 Annual Index",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "??--??",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.11",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This index covers all technical items --- papers,
correspondence, reviews, etc. --- that appeared in this
periodical during the year, and items from previous
years that were commented upon or corrected in this
year. Departments and other items may also be covered
if they have been judged to have archival value. The
Author Index contains the primary entry for each item,
listed under the first author's name. The primary entry
includes the co-authors' names, the title of the paper
or other item, and its location, specified by the
publication abbreviation, year, month, and inclusive
pagination. The Subject Index contains entries
describing the item under all appropriate subject
headings, plus the first author's name, the publication
abbreviation, month, and year, and inclusive pages.
Note that the item title is found only under the primary
entry in the Author Index.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Indexes",
}
@Article{Anonymous:2012:Ca,
author = "Anonymous",
title = "{[Cover2]}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "c2--c2",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.15",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Provides a listing of current society officers.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2012:Cb,
author = "Anonymous",
title = "{[Cover3]}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "c3--c3",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.16",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Provides a listing of current society officers.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2012:FCT,
author = "Anonymous",
title = "[{Front} cover and table of contents]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "c1--c1",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.14",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Presents the table of contents for this issue of the
periodical.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2012:ICS,
author = "Anonymous",
title = "{IEEE Computer Society} [Back cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "c4--c4",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.17",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Provides a listing of current committee members and
society officers.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Davis:2012:IVL,
author = "John D. Davis and Suzanne Rivoire and Moises
Goldszmidt and Ehsan K. Ardestani",
title = "Including Variability in Large-Scale Cluster Power
Models",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "29--32",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.27",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Studying the energy efficiency of large-scale computer
systems requires models of the relationship between
resource utilization and power consumption. Prior work
on power modeling assumes that models built for a
single node will scale to larger groups of machines.
However, we find that inter-node variability in
homogeneous clusters leads to substantially different
models for different nodes. Moreover, ignoring this
variability will result in significant prediction
errors when scaled to the cluster level. We report on
inter-node variation for four homogeneous five-node
clusters using embedded, laptop, desktop, and server
processors. The variation is manifested quantitatively
in the prediction error and qualitatively on the
resource utilization variables (features) that are
deemed relevant for the models. These results
demonstrate the need to sample multiple machines in
order to produce accurate cluster models.",
acknowledgement = ack-nhfb,
affiliation = "Rivoire, Suzanne, Sonoma State Univ, Rohnert Pk, CA
94928 USA. Ardestani, Ehsan K., Univ CA, Santa Cruz, CA
USA.",
author-email = "john.d@microsoft.com suzanne.rivoire@sonoma.edu
moises@microsoft.com eka@soe.ucsc.edu",
da = "2019-06-20",
doc-delivery-number = "057JO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computational modeling; Data models; evaluation;
Measurement; modeling; Power demand; Power Management;
Power measurement; Predictive models; Radiation
detectors; Servers; simulation of multiple-processor
systems",
number-of-cited-references = "26",
research-areas = "Computer Science",
times-cited = "3",
unique-id = "Davis:2012:IVL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
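The letter above observes that per-node power models in a nominally homogeneous cluster can differ enough to skew cluster-level predictions. As a rough illustration of that observation (not the authors' methodology), the following Python sketch fits a simple linear utilization-to-power model per node and compares cluster-level predictions from per-node models against reusing a single sampled node's model; the node names, measurements, and coefficients are synthetic.

# Illustrative sketch only: per-node linear power models versus reusing
# one sampled node's model for the whole cluster. All data is synthetic.

def fit_linear(xs, ys):
    """Ordinary least squares for y = a + b*x; returns (a, b)."""
    n = len(xs)
    mx, my = sum(xs) / n, sum(ys) / n
    b = (sum((x - mx) * (y - my) for x, y in zip(xs, ys))
         / sum((x - mx) ** 2 for x in xs))
    return my - b * mx, b

# Hypothetical per-node measurements: (utilization fraction, watts).
# Inter-node variability appears as different idle power and slope.
nodes = {
    "node0": ([0.1, 0.4, 0.7, 0.9], [62, 78, 93, 104]),
    "node1": ([0.1, 0.4, 0.7, 0.9], [58, 71, 83, 90]),
    "node2": ([0.1, 0.4, 0.7, 0.9], [65, 84, 102, 115]),
}
models = {name: fit_linear(xs, ys) for name, (xs, ys) in nodes.items()}
a0, b0 = models["node0"]          # pretend only node0 was sampled

for util in (0.25, 0.8):
    per_node = sum(a + b * util for a, b in models.values())
    single = len(nodes) * (a0 + b0 * util)
    print("util=%.2f: per-node models %.1f W, single-node model %.1f W"
          % (util, per_node, single))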
@Article{Lakshminarayana:2012:DSP,
author = "Nagesh B. Lakshminarayana and Jaekyu Lee and Hyesoon
Kim and Jinwoo Shin",
title = "{DRAM} Scheduling Policy for {GPGPU} Architectures
Based on a Potential Function",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "33--36",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.32",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "GPGPU architectures and applications have several
characteristics that differ from those of traditional
CPU architectures and applications: highly
multithreaded execution and SIMD-execution behavior are
the two most important characteristics of GPGPU
computing. In this paper, we propose a potential
function that models DRAM behavior in GPGPU
architectures, together with a DRAM scheduling policy,
alpha-SJF, that minimizes this potential function. The
scheduling policy essentially
chooses between SJF and FR-FCFS at run-time based on
the number of requests from each thread and whether the
thread has a row buffer hit.",
acknowledgement = ack-nhfb,
affiliation = "Lakshminarayana, NB (Reprint Author), Georgia Inst
Technol, Sch Comp Sci, Atlanta, GA 30332 USA.
Lakshminarayana, Nagesh B.; Lee, Jaekyu; Kim, Hyesoon;
Shin, Jinwoo, Georgia Inst Technol, Sch Comp Sci,
Atlanta, GA 30332 USA.",
author-email = "nageshbl@cc.gatech.edu jaekyu.lee@cc.gatech.edu
hyesoon.kim@cc.gatech.edu jshin72@cc.gatech.edu",
da = "2019-06-20",
doc-delivery-number = "057JO",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Computer architecture; DRAM chips;
DRAM scheduling; DRAM scheduling policy; dynamic random
access memory; Equations; general-purpose graphics
processing unit; GPGPU; GPGPU architecture; graphics
processing units; Instruction sets; Mathematical model;
multi-threading; multithreaded architecture; Potential
function; potential function; Potential function;
Processor scheduling; Random access memory; row buffer
hit; scheduling; SIMD-execution behavior",
number-of-cited-references = "5",
research-areas = "Computer Science",
researcherid-numbers = "Shin, Jinwoo/M-5389-2013",
times-cited = "7",
unique-id = "Lakshminarayana:2012:DSP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
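The abstract above describes a run-time choice between SJF and FR-FCFS driven by per-thread request counts and row-buffer hit status. The toy Python sketch below illustrates only that selection step; it is not the authors' alpha-SJF policy or potential function, and the request fields and the threshold are invented for illustration.

# Toy request selection inspired by the description above: prefer a
# row-buffer-hit request (FR-FCFS-like) unless some thread has very few
# outstanding requests, in which case serve that "short job" first
# (SJF-like). Purely illustrative; thresholds and fields are made up.

from collections import namedtuple

Request = namedtuple("Request", "thread row row_buffer_hit arrival")

def pick_request(queue, outstanding_per_thread, sjf_threshold=2):
    """Return the request to service next from `queue`."""
    if not queue:
        return None
    # SJF-like branch: a thread that is nearly done gets priority.
    short_threads = [t for t, n in outstanding_per_thread.items()
                     if n <= sjf_threshold]
    if short_threads:
        candidates = [r for r in queue if r.thread in short_threads]
        if candidates:
            return min(candidates, key=lambda r: r.arrival)
    # FR-FCFS-like branch: oldest row-buffer hit, else oldest request.
    hits = [r for r in queue if r.row_buffer_hit]
    pool = hits if hits else queue
    return min(pool, key=lambda r: r.arrival)

queue = [Request("t0", 5, False, 0), Request("t1", 7, True, 1),
         Request("t0", 5, True, 2)]
print(pick_request(queue, {"t0": 8, "t1": 1}))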
@Article{Wang:2012:ISA,
author = "Yaohua Wang and Shuming Chen and Kai Zhang and
Jianghua Wan and Xiaowen Chen and Hu Chen and Haibo
Wang",
title = "Instruction Shuffle: Achieving {MIMD}-like Performance
on {SIMD} Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "37--40",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.34",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "SIMD architectures are less efficient for applications
with diverse control-flow behavior, which can be
attributed mainly to the requirement of identical
control flow. In this paper, we propose a novel
instruction shuffle scheme that features an efficient
control-flow handling mechanism. Its cornerstones are a
shuffle source instruction buffer array and an
instruction shuffle unit. The shuffle unit can
concurrently deliver instructions from multiple
distinct control flows in the instruction buffer array
to eligible SIMD lanes. Our instruction shuffle scheme
combines the best attributes of the SIMD and MIMD
execution paradigms. Experimental results show that an
average performance improvement of 86\% can be achieved
at a cost of only 5.8\% area overhead.",
acknowledgement = ack-nhfb,
affiliation = "Wang, YH (Reprint Author), Natl Univ Def Technol, Sch
Comp Sci, Changsha, Hunan, Peoples R China. Wang,
Yaohua; Chen, Shuming; Zhang, Kai; Wan, Jianghua; Chen,
Xiaowen; Chen, Hu; Wang, Haibo, Natl Univ Def Technol,
Sch Comp Sci, Changsha, Hunan, Peoples R China.",
author-email = "nudtyh@gmail.com",
da = "2019-06-20",
doc-delivery-number = "057JO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Natural Science Foundation of
China [61070036, 61133007]; National 863 Program of
China [2009AA011704]",
funding-text = "The work is partially supported by the National
Natural Science Foundation of China (No. 61070036), the
National Natural Science Foundation of China (No.
61133007), the National 863 Program of China (No.
2009AA011704).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Arrays; data dependent control-flow; diverse
control-flow behavior; identical control-flow behavior;
instruction buffer array; Instruction sets; instruction
shuffle; instruction shuffle unit; Kernel; MIMD
execution paradigm; MIMD-like performance; multiple
instruction multiple data; parallel processing; Process
control; Resource management; Scalability; shuffle
source instruction buffer array; SIMD; SIMD
architecture; SIMD execution paradigm; single
instruction multiple data; Vectors",
number-of-cited-references = "9",
research-areas = "Computer Science",
researcherid-numbers = "Chen, Shuming/Q-1147-2018",
times-cited = "6",
unique-id = "Wang:2012:ISA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Panda:2012:BFB,
author = "Reena Panda and Paul V. Gratz and Daniel A.
Jim{\'e}nez",
title = "{B-Fetch}: Branch Prediction Directed Prefetching for
In-Order Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "41--44",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.33",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Computer architecture is beset by two opposing trends.
Technology scaling and deep pipelining have led to high
memory access latencies; meanwhile, power and energy
considerations have revived interest in traditional
in-order processors. In-order processors, unlike their
superscalar counterparts, do not allow execution to
continue around data cache misses. In-order processors,
therefore, suffer a greater performance penalty in the
light of the current high memory access latencies.
Memory prefetching is an established technique to
reduce the incidence of cache misses and improve
performance. In this paper, we introduce B-Fetch, a new
technique for data prefetching which combines branch
prediction based lookahead deep path speculation with
effective address speculation, to efficiently improve
performance in in-order processors. Our results show
that B-Fetch improves performance by 38.8\% on SPEC
CPU2006 benchmarks, beating a current state-of-the-art
prefetcher design at roughly 1/3 of the hardware
overhead.",
acknowledgement = ack-nhfb,
affiliation = "Panda, R (Reprint Author), Texas A\&M Univ, Dept Elect
\& Comp Engn, CESG, College Stn, TX 77843 USA. Panda,
Reena; Gratz, Paul V., Texas A\&M Univ, Dept Elect \&
Comp Engn, CESG, College Stn, TX 77843 USA. Jimenez,
Daniel A., Univ Texas San Antonio, Dept Comp Sci, San
Antonio, TX USA.",
author-email = "reena.panda@tamu.edu pgratz@tamu.edu dj@cs.utsa.edu",
da = "2019-06-20",
doc-delivery-number = "057JO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "address speculation; B-fetch; Benchmark testing;
Branch Prediction; branch prediction based lookahead
deep path speculation; branch prediction directed
prefetching; Cache memory; computer architecture;
Computer architecture; data cache; Data Cache
Prefetching; deep pipelining; energy consideration;
Hardware; in-order processor; In-order Processors;
memory access latency; memory prefetching; Memory
Systems; Pipelines; power aware computing; power
consideration; Prefetching; Process control; Registers;
storage management; superscalar processor; technology
scaling; Value Prediction",
number-of-cited-references = "17",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Panda:2012:BFB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Miller:2012:MEP,
author = "Timothy N. Miller and Renji Thomas and Radu
Teodorescu",
title = "Mitigating the Effects of Process Variation in
Ultra-low Voltage Chip Multiprocessors using Dual
Supply Voltages and Half-Speed Units",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "45--48",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.36",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Energy efficiency is a primary concern for
microprocessor designers. One very effective approach
to improving processor energy efficiency is to lower
its supply voltage to very near the transistor
threshold voltage. This reduces power consumption
dramatically, improving energy efficiency by an order
of magnitude. Low voltage operation, however, increases
the effects of parameter variation resulting in
significant frequency heterogeneity between (and
within) otherwise identical cores. This heterogeneity
severely limits the maximum frequency of the entire
CMP. We present a combination of techniques aimed at
reducing the effects of variation on the performance
and energy efficiency of near-threshold, many-core
CMPs. Dual Voltage Rail (DVR) mitigates core-to-core
variation with a dual-rail power delivery system that
allows post-manufacturing assignment of different
supply voltages to individual cores. This speeds up
slow cores by assigning them to a higher voltage and
saves power on fast cores by assigning them to a lower
voltage. Half-Speed Unit (HSU) mitigates within-core
variation by halving the frequency of select functional
blocks with the goal of boosting the frequency of
individual cores, thus raising the frequency ceiling
for the entire CMP. Together, these variation-reduction
techniques result in almost 50\% improvement in CMP
performance for the same power consumption over a mix
of workloads.",
acknowledgement = ack-nhfb,
affiliation = "Miller, TN (Reprint Author), Ohio State Univ, Dept
Comp Sci \& Engn, Columbus, OH 43210 USA. Miller,
Timothy N.; Thomas, Renji; Teodorescu, Radu, Ohio State
Univ, Dept Comp Sci \& Engn, Columbus, OH 43210 USA.",
author-email = "millerti@cse.ohio-state.edu thomasr@cse.ohio-state.edu
teodores@cse.ohio-state.edu",
da = "2019-06-20",
doc-delivery-number = "057JO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [CCF-1117799]",
funding-text = "This work was supported in part by the National
Science Foundation under grant CCF-1117799 and an
allocation of computing time from the Ohio
Supercomputer Center. The authors would like to thank
the anonymous reviewers for their suggestions and
feedback.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; chip multiprocessors; Clocks; CMP
frequency ceiling; CMP performance; Computer
architecture; core-to-core variation; Delay; dual
supply voltage; dual voltage rail; dual-rail power
delivery system; energy conservation; Energy
efficiency; energy efficiency; Energy efficiency;
frequency heterogeneity; half-speed unit; low voltage
operation; microprocessor chips; microprocessor design;
Multiprocessing systems; near-threshold voltage;
parameter variation; power aware computing; power
consumption; Power demand; process variation; process
variation effect; Rails; supply voltage assignment;
Threshold voltage; transistor threshold voltage;
ultra-low voltage chip multiprocessors; within-core
variation",
number-of-cited-references = "15",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Miller:2012:MEP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Li:2012:LSS,
author = "Yong Li and Rami Melhem and Alex K. Jones",
title = "Leveraging Sharing in Second Level
Translation-Lookaside Buffers for Chip
Multiprocessors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "49--52",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.35",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Traversing the page table during virtual-to-physical
address translation causes significant pipeline stalls
when misses occur in the translation-lookaside buffer
(TLB). To mitigate this penalty, we propose a fast,
scalable, multi-level TLB organization that leverages
page sharing behaviors and performs efficient TLB entry
placement. Our proposed partial sharing TLB (PSTLB)
reduces TLB misses by around 60\%. PSTLB also improves
TLB performance by nearly 40\% compared to traditional
private TLBs and 17\% over the state of the art
scalable TLB proposal.",
acknowledgement = ack-nhfb,
affiliation = "Li, Y (Reprint Author), Univ Pittsburgh, Dept Elect \&
Comp Engn, Pittsburgh, PA 15261 USA. Li, Yong, Univ
Pittsburgh, Dept Elect \& Comp Engn, Pittsburgh, PA
15261 USA.",
author-email = "yol26@pitt.edu",
da = "2019-06-20",
doc-delivery-number = "057JO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CCF-0702452]",
funding-text = "This work is supported by NSF award CCF-0702452",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; buffer storage; chip
multiprocessor; CMPs; Fluids; microprocessor chips;
multilevel TLB organization; multiprocessing systems;
Oceans; page sharing behavior; Partial Sharing; partial
sharing TLB; Prefetching; private TLB; program
interpreters; Runtime; second level
translation-lookaside buffers; Tiles; TLB entry
placement; TLBs; Virtual private networks;
virtual-to-physical address translation",
number-of-cited-references = "12",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Li:2012:LSS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Delimitrou:2012:DDS,
author = "Christina Delimitrou and Sriram Sankar and Kushagra
Vaid and Christos Kozyrakis",
title = "Decoupling Datacenter Storage Studies from Access to
Large-Scale Applications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "53--56",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.37",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Suboptimal storage design has significant cost and
power impact in large-scale datacenters (DCs).
Performance, power and cost-optimized systems require
deep understanding of target workloads, and mechanisms
to effectively model different storage design choices.
Traditional benchmarking is invalid in cloud
data-stores, representative storage profiles are hard
to obtain, and replaying applications in different
storage configurations is impractical in both cost and
time. Compounding these issues, current workload
generators are unable to reproduce key aspects of real
application patterns (e.g., spatial/temporal locality,
I/O intensity). In this paper, we propose a modeling
and generation framework for large-scale storage
applications. As part of this framework we use a state
diagram-based storage model, extend it to a
hierarchical representation, and implement a tool that
consistently recreates DC application I/O loads. We
present the principal features of the framework that
allow accurate modeling and generation of storage
workloads, and the validation process performed against
ten original DC application traces. Finally, we explore
two practical applications of this methodology: SSD
caching and defragmentation benefits on enterprise
storage. Since knowledge of the workload's spatial and
temporal locality is necessary to model these use
cases, our framework was instrumental in quantifying
their performance benefits. The proposed methodology
provides detailed understanding of the storage activity
of large-scale applications, and enables a wide
spectrum of storage studies, without the requirement to
access application code and full application
deployment.",
acknowledgement = ack-nhfb,
affiliation = "Delimitrou, C (Reprint Author), Stanford Univ,
Stanford, CA 94305 USA. Delimitrou, Christina;
Kozyrakis, Christos, Stanford Univ, Stanford, CA 94305
USA. Sankar, Sriram; Vaid, Kushagra, Microsoft Corp,
Seattle, WA USA.",
author-email = "cdel@stanford.edu srsankar@microsoft.com
kvaid@microsoft.com kozyraki@stanford.edu",
da = "2019-06-20",
doc-delivery-number = "057JO",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cloud data-store; Computational modeling; computer
centres; cost impact; datacenter storage; Electronic
mail; enterprise storage defragmentation; Generators;
large-scale datacenter; Load modeling; Mass storage;
Modeling of computer architecture; Modeling techniques;
power impact; SSD caching; state diagram-based storage
model; Storage area networks; storage design choice;
storage management; storage profile; storage workload;
suboptimal storage design; Super (very large)
computers; Throughput; Very large scale integration;
workload spatial locality; workload temporal locality",
number-of-cited-references = "7",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Delimitrou:2012:DDS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
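To make the phrase ``state diagram-based storage model'' concrete, here is a minimal sketch of a state-machine I/O load generator: each state emits requests with its own size and locality characteristics and then transitions probabilistically. The states, probabilities, and request fields are invented for illustration and do not reproduce the framework described above.

# Minimal state-machine I/O generator sketch. Each state emits requests
# with its own size and locality characteristics, then transitions
# according to fixed probabilities. All parameters are made up.

import random

STATES = {
    # state: (read_fraction, block_size_bytes, sequential_fraction)
    "seq_read":   (0.95, 256 * 1024, 0.9),
    "rand_read":  (0.90, 4 * 1024,   0.1),
    "rand_write": (0.10, 8 * 1024,   0.2),
}
TRANSITIONS = {
    "seq_read":   [("seq_read", 0.7), ("rand_read", 0.2), ("rand_write", 0.1)],
    "rand_read":  [("rand_read", 0.6), ("seq_read", 0.1), ("rand_write", 0.3)],
    "rand_write": [("rand_write", 0.5), ("rand_read", 0.5)],
}

def generate(n_requests, state="seq_read", volume_blocks=1 << 20):
    offset = 0
    for _ in range(n_requests):
        read_frac, size, seq_frac = STATES[state]
        if random.random() < seq_frac:
            offset = (offset + size) % (volume_blocks * 512)
        else:
            offset = random.randrange(volume_blocks) * 512
        yield {"op": "read" if random.random() < read_frac else "write",
               "offset": offset, "size": size, "state": state}
        names, probs = zip(*TRANSITIONS[state])
        state = random.choices(names, weights=probs)[0]

for req in generate(5):
    print(req)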
@Article{Chen:2012:NPD,
author = "Jie Chen and Guru Venkataramani and Gabriel Parmer",
title = "The Need for Power Debugging in the Multi-Core
Environment",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "57--60",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Debugging an application for power has a wide array of
benefits ranging from minimizing the thermal hotspots
to reducing the likelihood of CPU malfunction. In this
work, we justify the need for power debugging, and show
that performance debugging of a parallel application
does not automatically guarantee power balance across
multiple cores. We perform experiments and show our
results using two case study benchmarks, Volrend from
Splash-2 and Bodytrack from Parsec-1.0.",
acknowledgement = ack-nhfb,
affiliation = "Chen, J (Reprint Author), George Washington Univ,
Washington, DC 20052 USA. Chen, Jie; Venkataramani,
Guru; Parmer, Gabriel, George Washington Univ,
Washington, DC 20052 USA.",
da = "2019-06-20",
doc-delivery-number = "057JO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [CCF-1117243]",
funding-text = "This material is based upon work supported in part by
the National Science Foundation under Grant No.
CCF-1117243.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Bodytrack; Debugging; Instruction
sets; Multi-cores; multicore environment; Multicore
processing; multiprocessing systems; parallel
application; parallel programming; Parsec-1.0;
performance debugging; power aware computing; power
balance; Power Debugging; power debugging; Power
Debugging; Power demand; Power Imbalance; program
debugging; Splash-2; Volrend",
number-of-cited-references = "18",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Chen:2012:NPD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Meza:2012:EES,
author = "Justin Meza and Jichuan Chang and HanBin Yoon and Onur
Mutlu and Parthasarathy Ranganathan",
title = "Enabling Efficient and Scalable Hybrid Memories Using
Fine-Granularity {DRAM} Cache Management",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "61--64",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Hybrid main memories composed of DRAM as a cache to
scalable non-volatile memories such as phase-change
memory (PCM) can provide much larger storage capacity
than traditional main memories. A key challenge for
enabling high-performance and scalable hybrid memories,
though, is efficiently managing the metadata (e.g.,
tags) for data cached in DRAM at a fine granularity.
Based on the observation that storing metadata off-chip
in the same row as their data exploits DRAM row buffer
locality, this paper reduces the overhead of
fine-granularity DRAM caches by only caching the
metadata for recently accessed rows on-chip using a
small buffer. Leveraging the flexibility and efficiency
of such a fine-granularity DRAM cache, we also develop
an adaptive policy to choose the best granularity when
migrating data into DRAM. On a hybrid memory with a
512MB DRAM cache, our proposal using an 8KB on-chip
buffer can achieve within 6\% of the performance of,
and 18\% better energy efficiency than, a conventional
8MB SRAM metadata store, even when the energy overhead
due to large SRAM metadata storage is not considered.",
acknowledgement = ack-nhfb,
affiliation = "Meza, J (Reprint Author), Carnegie Mellon Univ,
Pittsburgh, PA 15213 USA. Meza, Justin; Yoon, HanBin;
Mutlu, Onur, Carnegie Mellon Univ, Pittsburgh, PA 15213
USA. Chang, Jichuan; Ranganathan, Parthasarathy,
Hewlett Packard Labs, Palo Alto, CA USA.",
author-email = "meza@cmu.edu jichuan.chang@hp.com hanbinyoon@cmu.edu
onur@cmu.edu partha.ranganathan@hp.com",
da = "2019-06-20",
doc-delivery-number = "057JO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF CAREER [CCF-0953246]; NSF EAGER
[CCF-1147397]; Gigascale Systems Research Center",
funding-text = "We thank the members of the SAFARI research group and
the anonymous reviewers for their comments and
suggestions. We gratefully acknowledge the support of
an NSF CAREER Award CCF-0953246, NSF EAGER Grant
CCF-1147397, and the Gigascale Systems Research Center.
Part of this work was done while Justin Meza and HanBin
Yoon were interns at Hewlett-Packard Labs.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Buffer storage; Cache memories; Cache
memory; cache storage; data migration; DRAM chips; DRAM
row buffer locality; dynamic random access memory;
fine-granularity DRAM cache management; hybrid main
memories; hybrid main memory; Indexes; Memory
management; meta data; metadata caching; metadata
management; metadata storage; non-volatile memories;
Phase change materials; phase-change memory; Random
access memory; scalable hybrid memory;
System-on-a-chip; tag storage",
number-of-cited-references = "16",
research-areas = "Computer Science",
times-cited = "35",
unique-id = "Meza:2012:EES",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
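The key idea above is to keep full metadata off-chip in the same DRAM row as its data and to cache on-chip only the metadata of recently accessed rows. The Python sketch below mimics that bookkeeping with a small LRU buffer of per-row metadata; the buffer size, metadata layout, and write-back behavior are assumptions for illustration, not the paper's design.

# Minimal sketch of an on-chip buffer that caches per-row metadata for
# recently touched DRAM cache rows (LRU replacement). The "off-chip"
# metadata store is modeled as a plain dict; sizes are arbitrary.

from collections import OrderedDict

class RowMetadataBuffer:
    def __init__(self, capacity=8):
        self.capacity = capacity
        self.buf = OrderedDict()          # row id -> metadata (on-chip)
        self.offchip = {}                 # row id -> metadata (in DRAM row)
        self.hits = self.misses = 0

    def lookup(self, row, default_metadata=None):
        if row in self.buf:
            self.hits += 1
            self.buf.move_to_end(row)     # refresh LRU position
            return self.buf[row]
        # Miss: fetch metadata from the same DRAM row as the data
        # (exploiting row-buffer locality), then install it on-chip.
        self.misses += 1
        meta = self.offchip.setdefault(row, default_metadata or {})
        self.buf[row] = meta
        if len(self.buf) > self.capacity:
            evicted_row, evicted_meta = self.buf.popitem(last=False)
            self.offchip[evicted_row] = evicted_meta   # write back
        return meta

buf = RowMetadataBuffer(capacity=2)
for row in [1, 2, 1, 3, 2]:
    buf.lookup(row, default_metadata={"tags": []})
print(buf.hits, buf.misses)   # 1 hit (second access to row 1), 4 misses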
@Article{Zidenberg:2012:MHS,
author = "Tsahee Zidenberg and Isaac Keslassy and Uri Weiser",
title = "{MultiAmdahl}: How Should {I} Divide My Heterogeneous
Chip?",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "65--68",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Future multiprocessor chips will integrate many
different units, each tailored to a specific
computation. When designing such a system, a chip
architect must decide how to distribute the available
limited system resources, such as area and power, among
all the computational units. In this paper, we
introduce MultiAmdahl, an analytical optimization
technique for resource sharing among heterogeneous
units. MultiAmdahl takes into account the workload, the
performance of each computational unit, and the total
available resource. The results obtained by MultiAmdahl
allow us, for example, to provide a closed-form
solution for an optimal asymmetric-offload chip, and to
analyze the impact of different design constraints on
an optimal chip architecture.",
acknowledgement = ack-nhfb,
affiliation = "Zidenberg, T (Reprint Author), Technion Israel Inst
Technol, EE Dept, Haifa, Israel. Zidenberg, Tsahee;
Keslassy, Isaac; Weiser, Uri, Technion Israel Inst
Technol, EE Dept, Haifa, Israel.",
author-email = "tsahee@tx.technion.ac.il isaac@ee.technion.ac.il
weiser@ee.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "057JO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "European Research Council [210389]; Intel
Heterogeneous Computing research grant",
funding-text = "This work was partly supported by the European
Research Council Starting Grant No. 210389 and by the
Intel Heterogeneous Computing research grant.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "area resource; asymmetric-offload chip; Central
Processing Unit; Chip Multiprocessors; Computational
modeling; computational unit; Computer architecture;
design constraint; heterogeneous chip; heterogeneous
unit; Mathematical model; microprocessor chips;
Modeling of computer architecture; MultiAmdahl
analytical optimization technique; multiprocessing
systems; multiprocessor chip; optimal chip
architecture; Optimization; power resource; Program
processors; resource allocation; Resource management;
resource sharing",
keywords-plus = "AMDAHLS LAW",
number-of-cited-references = "7",
research-areas = "Computer Science",
times-cited = "12",
unique-id = "Zidenberg:2012:MHS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
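As a hedged illustration of the kind of closed-form resource split mentioned in the abstract (not the authors' model), the sketch below divides a fixed area budget among computational units to minimize weighted execution time, assuming each unit's runtime scales as c_i / sqrt(a_i) (a Pollack-rule-style assumption). Under that assumption, a Lagrange-multiplier argument gives a_i proportional to (f_i * c_i)^(2/3).

# Hypothetical closed-form area split under the assumptions stated above:
# minimize sum_i f_i * c_i / sqrt(a_i)  subject to  sum_i a_i = A.
# Equating derivatives via a Lagrange multiplier yields
#   a_i = A * (f_i * c_i)**(2/3) / sum_j (f_j * c_j)**(2/3).

def allocate_area(fractions, costs, total_area):
    weights = [(f * c) ** (2.0 / 3.0) for f, c in zip(fractions, costs)]
    s = sum(weights)
    return [total_area * w / s for w in weights]

# Made-up workload: three units with time fractions f and unit costs c.
f = [0.5, 0.3, 0.2]
c = [1.0, 2.0, 4.0]
areas = allocate_area(f, c, total_area=100.0)
runtime = sum(fi * ci / ai ** 0.5 for fi, ci, ai in zip(f, c, areas))
print([round(a, 1) for a in areas], round(runtime, 3))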
@Article{Anonymous:2012:BC,
author = "Anonymous",
title = "[{Back} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "c4--c4",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.38",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2012:BIC,
author = "Anonymous",
title = "[{Back} inside cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "c3--c3",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.37",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2012:FIC,
author = "Anonymous",
title = "[{Front} inside cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "c2--c2",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.36",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Skadron:2013:INE,
author = "Kevin Skadron",
title = "Introducing the New {Editor-in-Chief} of the
{{\booktitle{IEEE Computer Architecture Letters}}}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "1--1",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.15",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The outgoing Editor-in-Chief introduces Jose F.
Mart{\'\i}nez as the new Editor-in-Chief (EIC) of the
IEEE Computer Architecture Letters (CAL). A brief
professional biography is included. In addition, it is
noted that CAL aims to provide fast-turnaround for
early work with outstanding promise. The majority of
decisions are returned within one month, nearly all
within six weeks, and all decisions are rendered within
two months. The overall acceptance rate has
consistently run at about 25\%. Many papers first
published in CAL go on to become full papers in premier
conferences and journals, and CAL's impact factor
continues to increase. CAL has been a valuable addition
to the publishing landscape in computer architecture
and under Prof. Martinez's leadership, we can look
forward to even greater impact in the future. I would
like to take this opportunity to thank all of the CAL
Associate Editors, authors, readers, and reviewers for
their great help and support.",
acknowledgement = ack-nhfb,
da = "2019-06-20",
doc-delivery-number = "172HT",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
number-of-cited-references = "0",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Skadron:2013:INE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2013:AI,
author = "Anonymous",
title = "2012 Annual Index",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.10",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This index covers all technical items - papers,
correspondence, reviews, etc. - that appeared in this
periodical during the year, and items from previous
years that were commented upon or corrected in this
year. Departments and other items may also be covered
if they have been judged to have archival value. The
Author Index contains the primary entry for each item,
listed under the first author's name. The primary entry
includes the co-authors' names, the title of the paper
or other item, and its location, specified by the
publication abbreviation, year, month, and inclusive
pagination. The Subject Index contains entries
describing the item under all appropriate subject
headings, plus the first author's name, the publication
abbreviation, month, and year, and inclusive pages.
Note that the item title is found only under the
primary entry in the Author Index.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Indexes",
}
@Article{Eeckhout:2013:MNE,
author = "Lieven Eeckhout",
title = "A Message from the New {Editor-in-Chief} and
Introduction of New {Associate Editors}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "2--2",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
affiliation = "Eeckhout, L (Reprint Author), Univ Ghent, B-9000
Ghent, Belgium.",
da = "2019-06-20",
doc-delivery-number = "172HT",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
number-of-cited-references = "0",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Eeckhout:2013:MNE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Martinez:2013:MNE,
author = "J. Martinez",
title = "A Message from the New {Editor-in-Chief} and
Introduction of New {Associate Editors}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "2--4",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.12",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The incoming Editor-in-Chief states that his goal
during his tenure with IEEE Computer Architecture
Letters (CAL) will be to further increase its
visibility in our research community, and to attract
more submissions from computer architecture leaders.
The {"Best} of {CAL"} session at HPCA, which has taken
place for the last couple of years, is a good step in
this direction. He is also committed to continue
improving the coordination with authors and conference
program chairs, and to consolidate CAL's unique place
in the publication pipeline as the prime venue for
quick dissemination of high-quality novel ideas and
early results.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Tavakkol:2013:NSS,
author = "Arash Tavakkol and Mohammad Arjomand and Hamid
Sarbazi-Azad",
title = "{Network-on-SSD}: a Scalable and High-Performance
Communication Design Paradigm for {SSDs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In recent years, flash memory solid state disks (SSDs)
have shown a great potential to change storage
infrastructure because of their advantages of high
speed and high-throughput random access. This promising
storage, however, suffers greatly from performance loss
because of frequent ``erase-before-write'' and
``garbage collection'' operations. Thus, novel
circuit-level, architectural, and algorithmic
techniques are currently being explored to address
these limitations. In parallel with other efforts, the
current study investigates replacing the shared buses
in the multi-channel architecture of SSDs with an
interconnection network to achieve scalable,
high-throughput, and reliable SSD storage systems.
Roughly speaking, such a communication scheme provides
superior parallelism that allows us to compensate for
the main part of the performance loss related to the
aforementioned limitations by increasing data storage
and retrieval processing
throughput.",
acknowledgement = ack-nhfb,
affiliation = "Tavakkol, A (Reprint Author), Sharif Univ Technol,
Dept Comp Engn, HPCAN Lab, Tehran, Iran. Tavakkol,
Arash; Arjomand, Mohammad; Sarbazi-Azad, Hamid, Sharif
Univ Technol, Dept Comp Engn, HPCAN Lab, Tehran, Iran.
Sarbazi-Azad, Hamid, Inst Res Fundamental Sci IPM, Sch
Comp Sci, Tehran, Iran.",
author-email = "tavakkol@ce.sharif.edu arjomand@ce.sharif.edu
azad@sharif.edu",
da = "2019-06-20",
doc-delivery-number = "172HT",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "algorithmic technique; architectural technique;
Bandwidth; Buffer storage; circuit-level technique;
Complexity theory; Data storage systems; data storage
throughput; flash memories; Flash memory; flash memory
solid state disks; frequent erase-before-write
operations; garbage collection operations; high speed
random access; high throughput random access;
high-performance communication design paradigm;
integrated circuit design; integrated circuit
reliability; Inter-package parallelism; interconnection
network; Interconnection network; interconnection
network; Interconnections (Subsystems); Mass storage;
memory architecture; multichannel architecture;
multiprocessor interconnection networks;
network-on-chip; network-on-SSD; parallel memories;
Parallel processing; parallel storage; performance
evaluation; performance loss; retrieval processing
throughput; scalable communication design paradigm;
Solid state disk; SSD storage system reliability;
storage infrastructure; storage management; system
buses; Throughput",
keywords-plus = "MEMORY",
number-of-cited-references = "6",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Tavakkol:2013:NSS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Sun:2013:NWC,
author = "Guang Sun and Chia-Wei Chang and Bill Lin",
title = "A New Worst-Case Throughput Bound for Oblivious
Routing in Odd Radix Mesh Network",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Half of the network capacity is often believed to be the
limit of worst-case throughput for mesh networks. However,
this letter provides a new worst-case throughput bound,
which is higher than 1/2 network capacity, for odd
radix two-dimensional mesh networks. In addition, we
propose a routing algorithm called U2TURN that can
achieve this worst-case throughput bound. U2TURN
considers all routing paths with at most 2 turns and
distributes the traffic loads uniformly in both X and Y
dimensions. Theoretical analysis and simulation results
show that U2TURN outperforms existing routing
algorithms in worst-case throughput. Moreover, U2TURN
achieves good average-case throughput at the expense of
approximately 1.5x the minimal average hop count.",
acknowledgement = ack-nhfb,
affiliation = "Sun, G (Reprint Author), Tsinghua Univ, Beijing,
Peoples R China. Sun, Guang, Tsinghua Univ, Beijing,
Peoples R China. Chang, Chia-Wei; Lin, Bill, Univ Calif
San Diego, San Diego, CA 92103 USA.",
da = "2019-06-20",
doc-delivery-number = "172HT",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Algorithm design and analysis; average-case
throughput; Computer architecture; Interconnection
architectures; mesh; Mesh networks; network capacity;
network-on-chip; Networks-on-Chip (NoC); oblivious
routing; odd radix mesh network; odd radix
two-dimensional mesh network; On-chip interconnection
networks; Parallel algorithms; Routing; routing;
Routing; Routing protocols; Throughput; traffic load;
U2TURN; Worst-case analysis; worst-case throughput;
worst-case throughput bound",
number-of-cited-references = "10",
ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X",
research-areas = "Computer Science",
researcherid-numbers = "Lin, Binshan/A-9772-2009",
times-cited = "1",
unique-id = "Sun:2013:NWC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
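For intuition about routing with ``at most 2 turns,'' the toy Python sketch below enumerates minimal X-first, Y-first, and X-Y-X two-turn paths between two mesh nodes and picks one uniformly at random. It is an illustration of the idea only, not the U2TURN algorithm itself; symmetric Y-X-Y paths and the worst-case load analysis are omitted.

# Toy enumeration of minimal routing paths with at most two turns in a
# 2D mesh, choosing among them uniformly. Illustrative only.

import random

def _segment(a, b):
    step = 1 if b >= a else -1
    return list(range(a, b + step, step))

def xy_path(src, dst):
    (sx, sy), (dx, dy) = src, dst
    horiz = [(x, sy) for x in _segment(sx, dx)]
    vert = [(dx, y) for y in _segment(sy, dy)][1:]
    return horiz + vert

def yx_path(src, dst):
    (sx, sy), (dx, dy) = src, dst
    vert = [(sx, y) for y in _segment(sy, dy)]
    horiz = [(x, dy) for x in _segment(sx, dx)][1:]
    return vert + horiz

def two_turn_paths(src, dst):
    """X-Y-X paths that turn at an intermediate column between src and dst."""
    (sx, sy), (dx, dy) = src, dst
    paths = []
    for xi in _segment(sx, dx)[1:-1]:      # columns strictly between endpoints
        first = [(x, sy) for x in _segment(sx, xi)]
        middle = [(xi, y) for y in _segment(sy, dy)][1:]
        last = [(x, dy) for x in _segment(xi, dx)][1:]
        paths.append(first + middle + last)
    return paths

def route(src, dst):
    candidates = [xy_path(src, dst), yx_path(src, dst)]
    candidates += two_turn_paths(src, dst)
    return random.choice(candidates)

print(route((0, 0), (3, 2)))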
@Article{Karsli:2013:EDT,
author = "I. Burak Karsli and Pedro Reviriego and M. Fatih Balli
and O{\u{g}}uz Ergin and J. A. Maestro",
title = "Enhanced Duplication: a Technique to Correct Soft
Errors in Narrow Values",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "13--16",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Soft errors are transient errors that can alter the
logic value of a register bit causing data corruption.
They can be caused by radiation particles such as
neutrons or alpha particles. Narrow values are commonly
found in the data consumed or produced by processors.
Several techniques have recently been proposed to
exploit the unused bits in narrow values to protect
them against soft errors. These techniques replicate
the narrow value over the unused register bits such
that errors can be detected when the value is
duplicated and corrected when the value is tripled. In
this letter, a technique that can correct errors when
the narrow value is only duplicated is presented. The
proposed approach stores a modified duplicate of the
narrow value such that errors on the original value and
the duplicate can be distinguished and therefore
corrected. The scheme has been implemented at the
circuit level to evaluate its speed and also at the
architectural level to assess the benefits in
correcting soft errors. The results show that the
scheme is significantly faster than a parity check and
can substantially increase the number of soft errors
that are corrected compared to existing techniques.",
acknowledgement = ack-nhfb,
affiliation = "Karsli, IB (Reprint Author), TOBB Univ Econ \&
Technol, Ankara, Turkey. Karsli, I. Burak; Balli, M.
Fatih; Ergin, O{\u{g}}uz, TOBB Univ Econ \& Technol,
Ankara, Turkey. Reviriego, Pedro; Maestro, J. A., Univ
Antonio de Nebrija, Madrid, Spain.",
da = "2019-06-20",
doc-delivery-number = "172HT",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Spanish Ministry of Science and Education
[AYA2009-13300-C03]; Scientific and Technological
Research Council of Turkey (TUBITAK) [112E004]",
funding-text = "This work was supported in part by the Spanish
Ministry of Science and Education under Grant
AYA2009-13300-C03 and by the Scientific and
Technological Research Council of Turkey (TUBITAK)
under Grant 112E004. The work is a collaboration in the
framework of COST ICT Action 1103 ``Manufacturable and
Dependable Multicore Architectures at Nanoscale''.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "alpha particles; architectural level; Benchmark
testing; computer architecture; Data Cache; data
corruption; Data processing; enhanced duplication;
Error correction; Error Correction; Error correction;
Error-checking; Logic gates; logic value;
microprocessor chips; narrow values; Narrow Values;
narrow values; neutrons; Parity check codes;
processors; Program processors; radiation hardening
(electronics); radiation particles; Redundant design;
register bit; Registers; soft errors; Soft Errors; soft
errors",
number-of-cited-references = "11",
ORCID-numbers = "Sousa, Leonel/0000-0002-8066-221X Ergin,
O{\u{g}}uz/0000-0003-2701-3787 Maestro, Juan
Antonio/0000-0001-7133-9026 Reviriego,
Pedro/0000-0001-6805-6519",
research-areas = "Computer Science",
researcherid-numbers = "Sousa, Leonel/B-2749-2009 Ergin,
O{\u{g}}uz/E-5717-2010 Maestro, Juan
Antonio/L-6091-2014 Reviriego, Pedro/B-8353-2009",
times-cited = "2",
unique-id = "Karsli:2013:EDT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
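For context on the baseline the letter improves upon (duplication detects, triplication corrects), here is a small Python sketch of bitwise majority-vote correction over replicated narrow values. It illustrates that baseline only, not the letter's enhanced-duplication scheme, and the 8-bit narrow width is an arbitrary choice.

# Baseline replication of a narrow value inside a wider register, as
# described in the abstract: two copies detect a single-bit soft error,
# three copies correct it by bitwise majority vote. This is the baseline
# the letter improves on, not its enhanced-duplication scheme.

NARROW_BITS = 8
MASK = (1 << NARROW_BITS) - 1

def pack_triplicate(value):
    v = value & MASK
    return v | (v << NARROW_BITS) | (v << (2 * NARROW_BITS))

def unpack_copies(reg):
    return [(reg >> (i * NARROW_BITS)) & MASK for i in range(3)]

def majority_correct(reg):
    a, b, c = unpack_copies(reg)
    # Bitwise majority: a bit is 1 if it is 1 in at least two copies.
    return (a & b) | (a & c) | (b & c)

reg = pack_triplicate(0b1011_0010)
reg ^= 1 << 13            # flip one bit in the middle copy (soft error)
assert majority_correct(reg) == 0b1011_0010
print(bin(majority_correct(reg)))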
@Article{Lyons:2013:SFF,
author = "Michael Lyons and Gu-Yeon Wei and David Brooks",
title = "{Shrink-Fit}: a Framework for Flexible Accelerator
Sizing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "RTL design complexity discouraged adoption of
reconfigurable logic in general purpose systems,
impeding opportunities for performance and energy
improvements. Recent improvements to HLS compilers
simplify RTL design and are easing this barrier. A new
challenge will emerge: managing reconfigurable
resources between multiple applications with custom
hardware designs. In this paper, we propose a method to
``shrink-fit'' accelerators within widely varying fabric
budgets. Shrink-fit automatically shrinks existing
accelerator designs within small fabric budgets and
grows designs to increase performance when larger
budgets are available. Our method takes advantage of
current accelerator design techniques and introduces a
novel architectural approach based on fine-grained
virtualization. We evaluate shrink-fit using a
synthesized implementation of an IDCT for decoding
JPEGs and show the IDCT accelerator can shrink by a
factor of 16x with minimal performance and area
overheads. Using shrink-fit, application designers can
achieve the benefits of hardware acceleration with
single RTL designs on FPGAs large and small.",
acknowledgement = ack-nhfb,
affiliation = "Lyons, M (Reprint Author), Harvard Univ, Sch Engn \&
Appl Sci, Cambridge, MA 02138 USA. Lyons, Michael; Wei,
Gu-Yeon; Brooks, David, Harvard Univ, Sch Engn \& Appl
Sci, Cambridge, MA 02138 USA.",
da = "2019-06-20",
doc-delivery-number = "172HT",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accelerators; computational complexity; Computer
applications; custom hardware design; Decoding;
discrete cosine transforms; fabric budget; field
programmable gate arrays; Field programmable gate
arrays; fine grained virtualization; flexible
accelerator sizing; FPGA; general purpose computers;
general purpose system; hardware acceleration;
Heterogeneous (hybrid) systems; HLS compiler; IDCT
accelerator; inverse transforms; JPEG decoding; program
compilers; Program processors; reconfigurable
architectural approach; reconfigurable architectures;
Reconfigurable hardware; reconfigurable logic;
reconfigurable resource management; RTL design
complexity; Runtime; shrink fit accelerator;
Special-Purpose and Application-Based Systems; temporal
logic; virtual machines; virtualisation",
number-of-cited-references = "12",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Lyons:2013:SFF",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Duong:2013:CAS,
author = "Nam Duong and Alexander V. Veidenbaum",
title = "Compiler-Assisted, Selective Out-Of-Order Commit",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "21--24",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper proposes an out-of-order instruction commit
mechanism using a novel compiler/architecture
interface. The compiler creates instruction ``blocks''
guaranteeing some commit conditions and the processor
uses the block information to commit certain
instructions out of order. Micro-architectural support
for the new commit mode is made on top of the standard,
ROB-based processor and includes out-of-order
instruction commit with register and load queue entry
release. The commit mode may be switched multiple times
during execution. Initial results for a 4-wide
processor show that, on average, 52\% of instructions are
committed out of order, resulting in 10\% to 26\%
speedups over in-order commit, with minimal hardware
overhead. The performance improvement is a result of an
effectively larger instruction window that allows more
cache misses to be overlapped for both L1 and L2
caches.",
acknowledgement = ack-nhfb,
affiliation = "Duong, N (Reprint Author), Univ Calif Irvine, Dept
Comp Sci, Irvine, CA 92717 USA. Duong, Nam; Veidenbaum,
Alexander V., Univ Calif Irvine, Dept Comp Sci, Irvine,
CA 92717 USA.",
author-email = "nlduong@ics.uci.edu alexv@ics.uci.edu",
da = "2019-06-20",
doc-delivery-number = "172HT",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architecture/compiler co-design; Benchmark testing;
block information; cache misses; cache storage; Cache
storage; commit conditions; compiler-architecture
interface; compiler-assisted selective out-of-order
commit; computer architecture; Computer architecture;
dynamically-scheduled and
statically-scheduled implementation; Hardware/software
interfaces; instruction blocks; instruction sets; L1
cache; L2 cache; load queue entry release;
microarchitectural support; minimal hardware overhead;
Out of order instruction; Out-of-order commit;
out-of-order instruction commit mechanism; overlapping
cache misses; performance evaluation; performance
improvement; Pipeline implementation; Pipeline
processors; program compilers; Program processors;
register; resource release; RISC/CISC; ROB-based
processor; Superscalar; VLIW architectures; Von Neumann
architectures",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Duong:2013:CAS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Nilakantan:2013:MES,
author = "Siddharth Nilakantan and Steven Battle and Mark
Hempstead",
title = "Metrics for Early-Stage Modeling of Many-Accelerator
Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "25--28",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The term `Dark Silicon'' has been coined to describe
the threat to microprocessor performance caused by
increasing transistor power density. Improving energy
efficiency is now the primary design goal for all
market segments of microprocessors from mobile to
server. Specialized hardware accelerators, designed to
run only a subset of workloads with orders of magnitude
energy efficiency improvement, are seen as a potential
solution. Selecting an ensemble of accelerators to best
cover the workloads run on a platform remains a
challenge. We propose metrics for accelerator selection
derived from a detailed communication-aware performance
model and present an automated methodology to populate
this model. Employing a combination of characterized
RTL and our selection metrics, we evaluate a set of
accelerators for a sample application and compare
performance to selections based on execution time and
Pollack's rule. We find that the architecture selected
by our communication-aware metric shows improved
performance over architectures selected based on
execution time and Pollack's rule, as they do not
account for speedup being limited by communication.",
acknowledgement = ack-nhfb,
affiliation = "Nilakantan, S (Reprint Author), Drexel Univ, Dept
Elect \& Comp Engn, Philadelphia, PA 19104 USA.
Nilakantan, Siddharth; Battle, Steven; Hempstead, Mark,
Drexel Univ, Dept Elect \& Comp Engn, Philadelphia, PA
19104 USA.",
author-email = "sn446@drexel.edu sjb328@drexel.edu mdh77@drexel.edu",
da = "2019-06-20",
doc-delivery-number = "172HT",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accelerators; Code Profiling; communication-aware
performance model; Computer architecture; computer
architecture; Computer Systems Organization; dark
silicon; General; hardware accelerators; Heterogeneous
(hybrid) systems; Heterogeneous Architectures;
magnitude energy efficiency improvement;
many-accelerator architectures; microprocessor;
microprocessor chips; Modeling; Modeling of computer
architecture; modelling; Multiprocessing systems; Other
Architecture Styles; performance evaluation; Pollack
rule; Processor Architectures; Program processors; RTL;
transistor power density; transistors",
number-of-cited-references = "16",
ORCID-numbers = "Nilakantan, Siddharth/0000-0003-1067-700X",
research-areas = "Computer Science",
times-cited = "3",
unique-id = "Nilakantan:2013:MES",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
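The abstract above contrasts accelerator selection by execution time or Pollack's rule (core performance growing roughly with the square root of invested area) against a communication-aware metric in which speedup is capped by data-transfer time. A toy comparison under made-up numbers and a simplified effective-speedup formula that is not the paper's model:

import math

# Hypothetical candidate accelerators: (name, area_mm2, kernel_time_s, comm_time_s).
# kernel_time_s is time spent in the offloaded kernel on the accelerator;
# comm_time_s is time spent moving data to and from it. Software kernel time: 1.0 s.
SW_KERNEL_TIME = 1.0
candidates = [
    ("accA", 4.0, 0.05, 0.40),
    ("accB", 1.0, 0.20, 0.05),
]

def pollack_score(area):
    # Pollack's rule: performance of a general-purpose core scales ~ sqrt(area).
    return math.sqrt(area)

def comm_aware_speedup(kernel_time, comm_time):
    # Speedup over software is limited by communication, not just compute.
    return SW_KERNEL_TIME / (kernel_time + comm_time)

for name, area, kt, ct in candidates:
    print(name, "pollack:", round(pollack_score(area), 2),
          "comm-aware speedup:", round(comm_aware_speedup(kt, ct), 2))
# Pollack/area favors accA, but the communication-aware view favors accB.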
@Article{Delimitrou:2013:NCD,
author = "Christina Delimitrou and Christos Kozyrakis",
title = "The {Netflix} Challenge: Datacenter Edition",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "29--32",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.10",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The hundreds of thousands of servers in modern
warehouse scale systems make performance and efficiency
optimizations pressing design challenges. These systems
are traditionally considered homogeneous. However, that
is not typically the case. Multiple server generations
compose a heterogeneous environment, whose performance
opportunities have not been fully explored since
techniques that account for platform heterogeneity
typically do not scale to the tens of thousands of
applications hosted in large-scale cloud providers. We
present ADSM, a scalable and efficient recommendation
system for application-to-server mapping in large-scale
datacenters (DCs) that is QoS-aware. ADSM overcomes the
drawbacks of previous techniques, by leveraging robust
and computationally efficient analytical methods to
scale to tens of thousands of applications with minimal
overheads. It is also QoS-aware, mapping applications
to platforms while enforcing strict QoS guarantees.
ADSM is derived from validated analytical models, has
low and bounded prediction errors, is simple to
implement and scales to thousands of applications
without significant changes to the system. Over 390
real DC workloads, ADSM improves performance by 16\% on
average and up to 2.5x and efficiency by 22\% in a DC
with 10 different server configurations.",
acknowledgement = ack-nhfb,
affiliation = "Delimitrou, C (Reprint Author), Stanford Univ,
Stanford, CA 94305 USA. Delimitrou, Christina;
Kozyrakis, Christos, Stanford Univ, Stanford, CA 94305
USA.",
author-email = "cdel@stanford.edu kozyraki@stanford.edu",
da = "2019-06-20",
doc-delivery-number = "172HT",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ADSM; application mapping; Application studies
resulting in better multiple-processor systems;
application-to-server mapping; Computer architecture;
computer centres; Computer System Implementation;
Computer Systems Organization; Data centers;
datacenter; design challenge; Design studies;
evaluation; Heterogeneous (hybrid) systems; Large and
Medium ( Mainframe ) Computers; Large-scale systems;
Measurement; modeling; Multiprocessing systems; Netflix
challenge; Other Architecture Styles; Parallel
Architectures; Performance of Systems; Processor
Architectures; QoS-aware; quality of service;
Scheduling; Scheduling and task partitioning; server
generation; simulation of multiple-processor systems;
Special-Purpose and Application-Based Systems; Super
(very large) computers; warehouse-scale system",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Delimitrou:2013:NCD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
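ADSM treats application-to-server mapping like the Netflix problem: a sparse matrix of applications versus server configurations, with performance observed for only a few configurations per application and the remaining entries predicted by a low-rank model so each application can be steered to a well-suited platform. A toy sketch of that idea using a rank-1 truncated SVD over a mean-filled matrix (the letter's actual analytical method, validation, and QoS enforcement are not shown); assumes numpy is available:

import numpy as np

# Rows: applications, columns: server configurations. NaN = not yet profiled.
scores = np.array([
    [0.9, 0.4, np.nan],
    [0.8, np.nan, 0.3],
    [np.nan, 0.5, 0.2],
])

def predict_missing(m, rank=1):
    filled = np.where(np.isnan(m), np.nanmean(m), m)        # crude initial fill
    u, s, vt = np.linalg.svd(filled, full_matrices=False)
    approx = (u[:, :rank] * s[:rank]).dot(vt[:rank, :])     # low-rank reconstruction
    return np.where(np.isnan(m), approx, m)                 # keep observed scores

if __name__ == "__main__":
    completed = predict_missing(scores)
    best_config = completed.argmax(axis=1)   # recommended server type per application
    print(completed.round(2))
    print(best_config)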
@Article{Anonymous:2013:RL,
author = "Anonymous",
title = "2012 reviewers list",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "33--34",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.11",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The publication offers a note of thanks and lists its
reviewers.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "IEEE publishing",
}
@Article{Anonymous:2013:IOAa,
author = "Anonymous",
title = "{IEEE} Open Access Publishing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "35--35",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.13",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Advertisement: This publication offers open access
options for authors. IEEE open access publishing.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2013:ITN,
author = "Anonymous",
title = "{{\booktitle{IEEE Transactions}}} Newsletter",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "36--36",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.14",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Advertisement: Stay connected with the IEEE Computer
Society Transactions by signing up for our new
Transactions Connection newsletter. It is free and
contains valuable information.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Martinez:2013:E,
author = "J. F. Martinez",
title = "Editorial",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "37--38",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.32",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Jian:2013:HPE,
author = "Xun Jian and John Sartori and Henry Duwe and Rakesh
Kumar",
title = "High Performance, Energy Efficient Chipkill Correct
Memory with Multidimensional Parity",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "39--42",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.21",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "It is well-known that a significant fraction of server
power is consumed in memory; this is especially the
case for servers with chipkill correct memories. We
propose a new chipkill correct memory organization that
decouples correction of errors due to local faults that
affect a single symbol in a word from correction of
errors due to device-level faults that affect an entire
column, sub-bank, or device. By using a combination of
two codes that separately target these two fault modes,
the proposed chipkill correct organization reduces code
overhead by half as compared to conventional chipkill
correct memories for the same rank size. Alternatively,
this allows the rank size to be reduced by half while
maintaining roughly the same total code overhead.
Simulations using PARSEC and SPEC benchmarks show that,
compared to a conventional double chipkill correct
baseline, the proposed memory organization, by
providing double chipkill correct at half the rank
size, reduces power by up to 41\% (32\% on average) over
a conventional baseline with the same chipkill correct
strength and access granularity that relies on linear
block codes alone, at only 1\% additional code
overhead.",
acknowledgement = ack-nhfb,
affiliation = "Jian, X (Reprint Author), Univ Illinois, Urbana, IL
USA. Jian, Xun; Sartori, John; Duwe, Henry; Kumar,
Rakesh, Univ Illinois, Urbana, IL USA.",
da = "2019-06-20",
doc-delivery-number = "279CD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "block codes; chipkill correct; chipkill correct memory
organization; code overhead reduction; Computer
architecture; device level fault; DRAM; DRAM chips;
error correction; error correction codes; fault mode;
fault tolerant computing; granular computing;
granularity access; linear block code; linear codes;
low power; Low power electronics; PARSEC; Random access
memory; rank size; reliable memory; server power
consumption; Servers; SPEC; storage management",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Jian:2013:HPE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
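The title's multidimensional parity can be illustrated with its simplest two-dimensional case: data symbols arranged in a grid with one parity symbol per row and per column, so a single corrupted symbol is located where the failing row and column intersect and can be repaired. This is a generic illustration only; the letter's actual code construction and its pairing with a symbol-correcting code are not reproduced here.

from functools import reduce

def xor_all(xs):
    return reduce(lambda a, b: a ^ b, xs, 0)

def encode_2d(grid):
    """grid: list of rows of byte values. Returns (row_parities, col_parities)."""
    rows = [xor_all(r) for r in grid]
    cols = [xor_all(c) for c in zip(*grid)]
    return rows, cols

def correct_single(grid, rows, cols):
    """Locate and fix one corrupted symbol using the stored parities."""
    bad_r = [i for i, r in enumerate(grid) if xor_all(r) != rows[i]]
    bad_c = [j for j, c in enumerate(zip(*grid)) if xor_all(c) != cols[j]]
    if len(bad_r) == 1 and len(bad_c) == 1:
        i, j = bad_r[0], bad_c[0]
        grid[i][j] ^= xor_all(grid[i]) ^ rows[i]   # XOR in the error pattern to restore
    return grid

if __name__ == "__main__":
    data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    rp, cp = encode_2d(data)
    data[1][2] ^= 0x5A                             # corrupt one symbol
    assert correct_single(data, rp, cp)[1][2] == 6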
@Article{Maddah:2013:DDS,
author = "Rakan Maddah and Sangyeun Cho and Rami Melhem",
title = "Data Dependent Sparing to Manage Better-Than-Bad
Blocks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "43--46",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.20",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We forecast that proper handling of unreliable storage
blocks (e.g., ``bad block management'' in solid-state
drives) will remain critical for future systems built
with advanced and emerging memory technologies. This
paper argues that the conventional block retirement and
sparing approach --- a block is retired as soon as it
shows faulty behavior --- is overly conservative and
inefficient. We observe that it is highly unlikely that
all faulty bits in a storage block manifest errors.
Consequently, we propose data dependent sparing, a
relaxed block retirement and sparing approach that
recycles faulty storage blocks. At small management
cost and with less than 1\% sparing, data dependent
sparing achieves the same lifetime as the conventional
approach with 20\% sparing.",
acknowledgement = ack-nhfb,
affiliation = "Maddah, R (Reprint Author), Univ Pittsburgh, Dept Comp
Sci, Pittsburgh, PA 15260 USA. Maddah, Rakan; Cho,
Sangyeun; Melhem, Rami, Univ Pittsburgh, Dept Comp Sci,
Pittsburgh, PA 15260 USA.",
author-email = "rmaddah@cs.pitt.edu cho@cs.pitt.edu
melhem@cs.pitt.edu",
da = "2019-06-20",
doc-delivery-number = "279CD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CCF-1064976, CCF-1059283,
CNS-1012070]",
funding-text = "This work is supported in part by NSF grants
CCF-1064976, CCF-1059283, and CNS-1012070.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "better-than-bad block management; data dependent
sparing; data dependent sparing approach; Data storage
systems; fault tolerant computing; faulty bits; faulty
storage blocks; flash memory; Flash memory; flash
memory; management cost; memory technologies; phase
change memories; phase-change memory; phase-change
memory (PCM); relaxed block retirement approach;
solid-state drive; solid-state drive (SSD); Solid-state
drives; solid-state drives; Sparing; sparing; storage
block; storage management; stuck-at faults; unreliable
storage block handling",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Maddah:2013:DDS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
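The abstract's observation that faulty bits rarely all manifest errors has a concrete reading for stuck-at faults: a block with stuck cells can still hold a value whenever the value's bits happen to agree with the stuck positions, so retirement can be deferred until a write actually conflicts. A minimal sketch under that stuck-at assumption (the letter's sparing policy and bookkeeping are simplified away):

# A block with stuck-at faults, described as {bit_position: stuck_value}.
class Block:
    def __init__(self, stuck):
        self.stuck = stuck  # e.g. {3: 1, 17: 0}

    def can_store(self, word: int) -> bool:
        """Data-dependent check: the write succeeds iff every stuck bit
        already equals the bit the data wants at that position."""
        return all(((word >> pos) & 1) == val for pos, val in self.stuck.items())

def place(word, blocks, spares):
    """Prefer a 'better-than-bad' block; fall back to a spare only on conflict."""
    for b in blocks:
        if b.can_store(word):
            return b
    return spares.pop()

if __name__ == "__main__":
    faulty = Block({3: 1})              # bit 3 is stuck at 1
    assert faulty.can_store(0b1000)     # data already has a 1 in bit 3
    assert not faulty.can_store(0b0000)
    spare = Block({})
    assert place(0b0000, [faulty], [spare]) is spare    # conflict: fall back to a spare
    assert place(0b1000, [faulty], [spare]) is faulty   # match: keep using the faulty block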
@Article{Kim:2013:CFC,
author = "Hanjoon Kim and Yonggon Kim and John Kim",
title = "Clumsy Flow Control for High-Throughput Bufferless
On-Chip Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "47--50",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.22",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Bufferless on-chip networks are an alternative type of
on-chip network organization that can improve the
cost-efficiency of an on-chip network by removing
router input buffers. However, bufferless on-chip
network performance degrades at high load because of
the increased network contention and large number of
deflected packets. The energy benefit of a bufferless
network is also reduced because of the increased
deflection. In this work, we propose a novel flow
control for bufferless on-chip networks in
high-throughput manycore accelerator architectures to
reduce the impact of deflection routing. By using a
clumsy flow control (CFC), instead of the per-hop flow
control that is commonly used in buffered on-chip
networks, we are able to reduce the amount of
deflection by up to 92\% on high-throughput workloads.
As a result, on average, CFC can approximately match
the performance of a baseline buffered router while
reducing the energy consumption by approximately
39\%.",
acknowledgement = ack-nhfb,
affiliation = "Kim, H (Reprint Author), Korea Adv Inst Sci \&
Technol, Dept Comp Sci, Taejon, South Korea. Kim,
Hanjoon; Kim, Yonggon; Kim, John, Korea Adv Inst Sci \&
Technol, Dept Comp Sci, Taejon, South Korea.",
author-email = "hanj@kaist.ac.kr ilios@kaist.ac.kr jjk12@kaist.ac.kr",
da = "2019-06-20",
doc-delivery-number = "279CD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "MKE, Korea, under the ITRC
[NIPA-2012-H0301-12-1011]; BST program through the NRF
of Korea; MEST [2012-0003579]",
funding-text = "This research was supported in part by the MKE, Korea,
under the ITRC support program supervised by the NIPA
(NIPA-2012-H0301-12-1011) and in part by BST program
through the NRF of Korea funded by the
MEST(2012-0003579).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "bufferless NoC; bufferless router; CFC; clumsy flow
control; computer architecture; Computer architecture;
Computer Systems Organization; cost-efficiency
improvement; Data processing; deflection routing;
deflection routing impact reduction; energy benefit;
energy consumption reduction; flow control;
high-throughput bufferless on-chip networks;
high-throughput manycore accelerator architectures;
high-throughput workloads; Interconnection
architectures; microprocessor chips; Multiple Data
Stream Architectures (Multiprocessors); Multiprocessing
systems; network contention; network routing;
network-on-chip; On-chip interconnection networks;
on-chip network organization; on-chip networks;
Parallel architectures; Parallel Architectures;
performance evaluation; Processor Architectures; router
input buffer removal; System-on-chip",
number-of-cited-references = "14",
research-areas = "Computer Science",
researcherid-numbers = "Kim, John/C-1792-2011",
times-cited = "7",
unique-id = "Kim:2013:CFC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kai:2013:GRP,
author = "Yi Kai and Yi Wang and Bin Liu",
title = "{GreenRouter}: Reducing Power by Innovating Router's
Architecture",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "51--54",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.23",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "High speed routers in Internet are becoming more
powerful, as well as more energy hungry. In this paper,
we present a new architecture of router named
GreenRouter which separates a line-card into two parts:
network interface card (DB) and packet processing card
(MB), connected by a two-stage switch fabric in traffic
flows' ingress and egress direction respectively.
Traffic from all DBs shares all the MBs in GreenRouter,
thus can be aggregated to a few active MBs on demand
and other MBs can be shut down to save power. Several
key issues to this new architecture are addressed. We
evaluate the power saving efficiency and give
preliminary simulation results. GreenRouter can well
adapt the traffic fluctuation and real trace
evaluations over one week shows that up to 63.7\% power
saving can be achieved while QoS constraints are
guaranteed.",
acknowledgement = ack-nhfb,
affiliation = "Liu, B (Reprint Author), Tsinghua Univ, Dept Comp Sci
\& Technol, Beijing 100084, Peoples R China. Kai, Yi;
Wang, Yi; Liu, Bin, Tsinghua Univ, Dept Comp Sci \&
Technol, Beijing 100084, Peoples R China.",
author-email = "kaiyi02@gmail.com pig020623@gmail.com
lmyujie@gmail.com",
da = "2019-06-20",
doc-delivery-number = "279CD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSFC [61073171]; Tsinghua University
Initiative Scientific Research Program [20121080068];
Specialized Research Fund for the Doctoral Program of
Higher Education of China [20100002110051]",
funding-text = "This work is supported by NSFC (61073171), Tsinghua
University Initiative Scientific Research Program
(20121080068), Specialized Research Fund for the
Doctoral Program of Higher Education of China
(20100002110051).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; DB; Energy efficiency;
energy-aware system; green computing; Green design;
GreenRouter; High-speed networks; Internet; line-card;
low power design; MB; network interface card; packet
processing card; power reduction; power saving
efficiency; QoS constraints; router; router
architecture innovation; Routers; telecommunication
network routing; Telecommunication traffic;
telecommunication traffic; traffic flow egress
direction; traffic flow ingress direction; traffic
fluctuation; two-stage switch fabric",
number-of-cited-references = "6",
ORCID-numbers = "Wang, Yi/0000-0002-9095-6879",
research-areas = "Computer Science",
researcherid-numbers = "Wang, Yi/A-8884-2015",
times-cited = "1",
unique-id = "Kai:2013:GRP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Joo:2013:HPS,
author = "Yongsoo Joo and Sangsoo Park",
title = "A Hybrid {PRAM} and {STT--RAM} Cache Architecture for
Extending the Lifetime of {PRAM} Caches",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "55--58",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.24",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "To extend the lifetime of phase change RAM (PRAM)
caches, we propose a hybrid cache architecture that
integrates a relatively small capacity of spin transfer
torque RAM (STT--RAM) write buffer with a PRAM cache.
Our hybrid cache improves the endurance limitation of
the PRAM cache by judiciously redirecting the write
traffic from an upper memory layer to the STT--RAM
write buffer. We have demonstrated through simulation
that the proposed hybrid cache outperforms existing
write-traffic reduction schemes with the same area
overhead. Moreover, our approach is orthogonal to the
existing schemes, providing an effective way of
investing die area for cache lifetime extension by
being used in combination with them.",
acknowledgement = ack-nhfb,
affiliation = "Joo, Y (Reprint Author), Ewha Womans Univ, Dept Comp
Sci \& Engn, Seoul 120750, South Korea. Joo, Yongsoo;
Park, Sangsoo, Ewha Womans Univ, Dept Comp Sci \& Engn,
Seoul 120750, South Korea.",
author-email = "ysjoo@ewha.ac.kr sangsoo.park@ewha.ac.kr",
da = "2019-06-20",
doc-delivery-number = "279CD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Ewha Womans University",
funding-text = "We thank Guangyu Sun and Cong Xu for their helpful
comments on NVRAM characteristics. This research was
supported by RP-Grant 2010 of Ewha Womans University.
Sangsoo Park is the corresponding author.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache; cache lifetime extension; Cache memories; Cache
storage; cache storage; Computer architecture;
concurrency theory; Design Styles; endurance; Fault
tolerance; Hardware; hybrid cache architecture; hybrid
PRAM caches; investing die area; lifetime; memory
layer; Memory Structures; phase change memories; phase
change RAM; PRAM; Random access memory; Redundancy;
Redundant design; Reliability; spin transfer torque
RAM; STT RAM cache architecture; STT RAM write buffer;
STT--RAM; Testing and Fault-Tolerance; write traffic
reduction schemes",
number-of-cited-references = "14",
research-areas = "Computer Science",
times-cited = "10",
unique-id = "Joo:2013:HPS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
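The hybrid design above places a small STT--RAM write buffer in front of the PRAM cache so that repeated writes to hot lines are absorbed before they wear the PRAM. A minimal sketch of that filtering effect, assuming a simple LRU write buffer and counting how many writes reach PRAM; the buffer management in the letter is more involved:

from collections import OrderedDict

def pram_writes(trace, buffer_lines):
    """Count writes that reach PRAM when an LRU write buffer of
    `buffer_lines` entries absorbs re-writes to recently written lines."""
    buf = OrderedDict()   # line address -> None, kept in LRU order
    writes_to_pram = 0
    for addr in trace:
        if addr in buf:
            buf.move_to_end(addr)          # coalesced in the STT-RAM buffer
            continue
        if len(buf) == buffer_lines:
            buf.popitem(last=False)        # evict LRU entry (written to PRAM once)
            writes_to_pram += 1
        buf[addr] = None
    return writes_to_pram + len(buf)       # remaining entries drain eventually

if __name__ == "__main__":
    trace = [0, 1, 0, 0, 2, 1, 0, 3, 0, 1]  # hot line 0 is re-written often
    print("no buffer:", len(trace), "writes;",
          "4-entry buffer:", pram_writes(trace, 4), "writes")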
@Article{Blem:2013:MMA,
author = "Emily Blem and Hadi Esmaeilzadeh and Renee St Amant
and Karthikeyan Sankaralingam and Doug Burger",
title = "Multicore Model from Abstract Single Core Inputs",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "59--62",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.27",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper describes a first order multicore model to
project a tighter upper bound on performance than
previous Amdahl's Law based approaches. The speedup
over a known baseline is a function of the core
performance, microarchitectural features, application
parameters, chip organization, and multicore topology.
The model is flexible enough to consider both CPU and
GPU like organizations as well as modern topologies
from symmetric to aggressive heterogeneous (asymmetric,
dynamic, and fused) designs. This extended model
incorporates first order effects, exposing more
bottlenecks than previous applications of Amdahl's
Law, while remaining simple and flexible enough to be
adapted for many applications.",
acknowledgement = ack-nhfb,
affiliation = "Blem, E (Reprint Author), Univ Wisconsin, Madison, WI
53706 USA. Blem, Emily; Sankaralingam, Karthikeyan,
Univ Wisconsin, Madison, WI 53706 USA. Esmaeilzadeh,
Hadi, Univ Washington, Seattle, WA 98195 USA. St Amant,
Renee, Univ Texas Austin, Austin, TX 78712 USA.",
author-email = "blem@cs.wisc.edu hadianeh@cs.washington.edu
stamant@cs.utexas.edu karu@cs.wisc.edu
dburger@microsoft.com",
da = "2019-06-20",
doc-delivery-number = "279CD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "abstract single core inputs; aggressive heterogeneous
designs; Amdahl law based approach; application
parameters; chip organization; Computer Systems
Organization; CPU like organizations; first order
multicore model; General; GPU like organizations;
graphics processing units; microarchitectural features;
Modeling of computer architecture; multicore topology;
multicores; Multiple Data Stream Architectures
(Multiprocessors); multiprocessing systems; network
topology; parallelism; performance evaluation;
Performance modeling; Processor Architectures",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Blem:2013:MMA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
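The letter above extends Amdahl's-Law-style multicore models with microarchitectural, application, and topology terms. As background only, the classic symmetric form of Amdahl's Law that such models start from (a fraction f of the work parallelizes over n cores):

def amdahl_speedup(f: float, n: int) -> float:
    """Classic Amdahl's Law: a fraction f of the work parallelizes over n cores."""
    return 1.0 / ((1.0 - f) + f / n)

if __name__ == "__main__":
    # Even with 64 cores, a 10% serial fraction caps speedup below 10x.
    print(round(amdahl_speedup(0.9, 64), 2))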
@Article{Michaud:2013:DMT,
author = "Pierre Michaud",
title = "Demystifying Multicore Throughput Metrics",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "63--66",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.25",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Several different metrics have been proposed for
quantifying the throughput of multicore processors.
There is no clear consensus about which metric should
be used. Some studies even use several throughput
metrics. We show that there exists a relation between
single-thread average performance metrics and
throughput metrics, and that throughput metrics inherit
the meaning or lack of meaning of the corresponding
single-thread metric. We show that two popular
throughput metrics, the weighted speedup and the
harmonic mean of speedups, are inconsistent: they do
not give equal importance to all benchmarks. Moreover
we demonstrate that the weighted speedup favors
unfairness. We show that the harmonic mean of IPCs, a
seldom used throughput metric, is actually consistent
and has a physical meaning. We explain under which
conditions the arithmetic mean or the harmonic mean of
IPCs can be used as a strong indicator of throughput
increase.",
acknowledgement = ack-nhfb,
affiliation = "Michaud, P (Reprint Author), INRIA Rennes, Rennes,
France. INRIA Rennes, Rennes, France.",
author-email = "Pierre.Michaud@inria.fr",
da = "2019-06-20",
doc-delivery-number = "279CD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; Computer Systems Organization;
evaluation; Measurement; Modeling; modeling;
Multi-core/single-chip multiprocessors; Multicore
processing; multicore processors; multicore throughput;
multicore throughput metrics; multiprocessing systems;
Parallel Architectures; Parallel architectures;
Performance evaluation; performance metric; Performance
of Systems; Processor Architectures; Program
processors; simulation of multiple-processor systems;
single thread metric; software metrics",
number-of-cited-references = "12",
research-areas = "Computer Science",
times-cited = "7",
unique-id = "Michaud:2013:DMT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
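The metrics discussed above have simple closed forms. Given each benchmark's IPC when running alone and its IPC in the multiprogram mix, the weighted speedup, the harmonic mean of speedups, and the arithmetic and harmonic means of IPCs can be computed as below; the definitions follow common usage in the literature, and the letter's exact notation may differ:

def weighted_speedup(ipc_multi, ipc_single):
    # Sum over benchmarks of per-benchmark speedup (or slowdown) vs. running alone.
    return sum(m / s for m, s in zip(ipc_multi, ipc_single))

def hmean_of_speedups(ipc_multi, ipc_single):
    n = len(ipc_multi)
    return n / sum(s / m for m, s in zip(ipc_multi, ipc_single))

def amean_ipc(ipc_multi):
    return sum(ipc_multi) / len(ipc_multi)

def hmean_ipc(ipc_multi):
    return len(ipc_multi) / sum(1.0 / x for x in ipc_multi)

if __name__ == "__main__":
    single = [2.0, 1.0]      # IPC of each benchmark running alone
    multi = [1.0, 0.9]       # IPC of each benchmark in the multiprogram mix
    print(weighted_speedup(multi, single), hmean_of_speedups(multi, single),
          amean_ipc(multi), hmean_ipc(multi))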
@Article{Tembey:2013:SSS,
author = "Priyanka Tembey and Augusto Vega and Alper
Buyuktosunoglu and Dilma {Da Silva} and Pradip Bose",
title = "{SMT} Switch: Software Mechanisms for Power Shifting",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "67--70",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.26",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Simultaneous multithreading (SMT) as a processor
design to achieve higher levels of system and
application throughput is a well-accepted and deployed
technique in most desktop and server processors. We
study the power implications of varying SMT levels,
i.e., thread counts per core, for various multi-threaded
applications on a real SMT multicore platform, and
introduce a novel software mechanism of changing the
SMT level of a core to tune platform power.
Power-shifting policies that vary per-core SMT levels
for performance
benefits within a power cap are introduced. Projected
power savings (of 15\%) for a streaming parallel
benchmark can be attained using SMT-level power
shifting mechanisms.",
acknowledgement = ack-nhfb,
affiliation = "Tembey, P (Reprint Author), Georgia Tech, Atlanta, GA
30332 USA. Tembey, Priyanka, Georgia Tech, Atlanta, GA
30332 USA.",
da = "2019-06-20",
doc-delivery-number = "279CD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "application throughput; Computer architecture;
Computer Systems Organization; Hardware;
multi-threading; Multicore platforms; multiprocessing
systems; Multithreaded processors; Multithreading;
Operating Systems; Other Architecture Styles; Parallel
processing; power aware computing; Power Management;
Power shifting; Power system management; Process
Management; Processor Architectures; processor design;
Program processors; Scheduling; simultaneous
multithreading; SMT; SMT multicore platform; SMT
switch; SMT-level power shifting mechanism; Software
engineering; software mechanisms; Software/Software
Engineering; streaming parallel benchmark; tune
platform power",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Tembey:2013:SSS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2013:IOAb,
author = "Anonymous",
title = "{IEEE} Open Access Publishing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "71--71",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.33",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2013:SCI,
author = "Anonymous",
title = "Stay Connected to the {IEEE Computer Society}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "72--72",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.34",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2013:BC,
author = "Anonymous",
title = "[{Back} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "c4--c4",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.31",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2013:BIC,
author = "Anonymous",
title = "[{Back} inside cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "c3--c3",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.30",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2013:FC,
author = "Anonymous",
title = "[{Front} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "c1--c1",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.28",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2013:FIC,
author = "Anonymous",
title = "[{Front} inside cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "c2--c2",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.29",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Arelakis:2014:CVA,
author = "Angelos Arelakis and Per Stenstr{\"o}m",
title = "A Case for a Value-Aware Cache",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.31",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Replication of values causes poor utilization of
on-chip cache memory resources. This paper addresses
the question: How much cache resources can be
theoretically and practically saved if value
replication is eliminated? We introduce the concept of
value-aware caches and show that a sixteen times
smaller value-aware cache can yield the same miss rate
as a conventional cache. We then make a case for a
value-aware cache design using Huffman-based
compression. Since the value set is rather stable
across the execution of an application, one can afford
to reconstruct the coding tree in software. The
decompression latency is kept short by our proposed
novel pipelined Huffman decoder that uses canonical
codewords. While the (loose) upper-bound compression
factor is 5.2X, we show that, by eliminating
cache-block alignment restrictions, it is possible to
achieve a compression factor of 3.4X for practical
designs.",
acknowledgement = ack-nhfb,
affiliation = "Arelakis, A (Reprint Author), Chalmers, Gothenburg,
Sweden. Arelakis, Angelos; Stenstrom, Per, Chalmers,
Gothenburg, Sweden.",
author-email = "angelos@chalmers.se per.stenstrom@chalmers.se",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Swedish Research Council",
funding-text = "This research is supported by the Swedish Research
Council. The simulations ran on the resources provided
by the Swedish National Infrastructure for Computing
(SNIC) at C3SE.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "B Hardware; B.3 Memory Structures; B.3.2 Design
Styles; B.3.2.b Cache memories; cache storage;
cache-block alignment restriction elimination; Clocks;
coding tree reconstruction; data compression; data
handling; Decoding; decompression latency; E Data; E.4
Coding and Information Theory; E.4.a Data compaction
and compression; Engines; Huffman codes; Huffman
coding; Huffman-based compression; Indexes; on-chip
cache memory resources; System-on-a-chip; tree codes;
value replication; value-aware cache design",
number-of-cited-references = "17",
research-areas = "Computer Science",
times-cited = "3",
unique-id = "Arelakis:2014:CVA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
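The value-aware cache above relies on Huffman coding of frequently occurring cache values, with the coding tree rebuilt in software because the value set is stable across execution. As a background illustration, the sketch below builds a Huffman code from a value histogram with Python's heapq; the letter's canonical codewords and pipelined hardware decoder are not modeled:

import heapq
from itertools import count

def huffman_code(freqs):
    """freqs: dict value -> frequency. Returns dict value -> codeword bitstring."""
    tick = count()  # tie-breaker so heapq never compares the code dictionaries
    heap = [(f, next(tick), {v: ""}) for v, f in freqs.items()]
    heapq.heapify(heap)
    while len(heap) > 1:
        f1, _, c1 = heapq.heappop(heap)
        f2, _, c2 = heapq.heappop(heap)
        merged = {v: "0" + code for v, code in c1.items()}
        merged.update({v: "1" + code for v, code in c2.items()})
        heapq.heappush(heap, (f1 + f2, next(tick), merged))
    return heap[0][2]

if __name__ == "__main__":
    # Toy value histogram: a few 32-bit values dominate the cache contents.
    freqs = {0x00000000: 60, 0xFFFFFFFF: 25, 0x00000001: 10, 0xDEADBEEF: 5}
    code = huffman_code(freqs)
    for v, bits in sorted(code.items(), key=lambda kv: len(kv[1])):
        print(hex(v), bits)   # the most frequent values get the shortest codewords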
@Article{Chen:2014:PEC,
author = "Zheng Chen and Huaxi Gu and Yintang Yang and Luying
Bai and Hui Li",
title = "A Power Efficient and Compact Optical Interconnect for
Network-on-Chip",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Optical interconnect is a promising alternative to
substitute the electrical interconnect for intra-chip
communications. The topology of optical Network-on-Chip
(ONoC) has a great impact on the network performance.
However, the size of ONoC is limited by the power
consumption and crosstalk noise, which mainly result
from the waveguide crossings in the topology.
In this paper, a diagonal Mesh topology (DMesh) is
proposed to relieve the scalability limitation by
reducing the number of waveguide crossings to only
20\% of that in Mesh. In addition, the number of
optical routers in DMesh is less than half of that in
Mesh-based ONoC. Due to its compact architecture and
favorable scalability, DMesh topology is suitable for
large-scale ONoC design.",
acknowledgement = ack-nhfb,
affiliation = "Chen, Z (Reprint Author), Xidian Univ Xian, State Key
Lab Integrated Serv Networks, Xian, Peoples R China.
Chen, Zheng; Gu, Huaxi; Bai, Luying; Li, Hui, Xidian
Univ Xian, State Key Lab Integrated Serv Networks,
Xian, Peoples R China. Yang, Yintang, Xidian Univ Xian,
Inst Microelect, Xian, Peoples R China.",
author-email = "chenzheng8331@stu.xidian.edu.cn hxgu@xidian.edu.cn
ytyang@xidian.edu.cn",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation of China
[61070046, 60803038]; State Key Lab [ISN1104001];
Fundamental Research Funds for the Central Universities
[K5051301003]; 111 Project [B08038]",
funding-text = "This work is supported by the National Science
Foundation of China Grant No. 61070046 and 60803038,
the special fund from State Key Lab Grant No.
ISN1104001, the Fundamental Research Funds for the
Central Universities Grant No. K5051301003, the 111
Project Grant No. B08038.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "compact optical interconnect; crosstalk noise;
diagonal mesh topology; DMesh topology; integrated
optoelectronics; intra-chip communications; large-scale
ONoC design; mesh-based ONoC; multiprocessors; network
performance; Network topology; network-on-chip; optical
interconnections; Optical interconnects; optical
network-on-chip; optical router; Optical routers;
optical routers; power consumption; power efficient
interconnect; Topology; topology; Topology; waveguide
crossings; wavelength division multiplexing; Wavelength
division multiplexing; wavelength division
multiplexing",
number-of-cited-references = "9",
ORCID-numbers = "Gu, Huaxi/0000-0002-6409-2229",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Chen:2014:PEC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Cota:2014:AMR,
author = "Emilio G. Cota and Paolo Mantovani and Michele
Petracca and Mario R. Casu and Luca P. Carloni",
title = "Accelerator Memory Reuse in the Dark Silicon Era",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.29",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Accelerators integrated on-die with General-Purpose
CPUs (GP-CPUs) can yield significant performance and
power improvements. Their extensive use, however, is
ultimately limited by their area overhead; due to their
high degree of specialization, the opportunity cost of
investing die real estate on accelerators can become
prohibitive, especially for general-purpose
architectures. In this paper we present a novel
technique aimed at mitigating this opportunity cost by
allowing GP-CPU cores to reuse accelerator memory as a
non-uniform cache architecture (NUCA) substrate. On a
system with a last level-2 cache of 128kB, our
technique achieves on average a 25\% performance
improvement when reusing four 512 kB accelerator memory
blocks to form a level-3 cache. Making these blocks
reusable as NUCA slices incurs on average a 1.89\%
area overhead with respect to equally-sized ad hoc
cache slices.",
acknowledgement = ack-nhfb,
affiliation = "Cota, EG (Reprint Author), Columbia Univ, New York, NY
10027 USA. Cota, Emilio G.; Mantovani, Paolo; Carloni,
Luca P., Columbia Univ, New York, NY 10027 USA.
Petracca, Michele, Cadence Design Syst Inc, San Jose,
CA USA. Casu, Mario R., Politecn Torino, Turin,
Italy.",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [1018236,
1219001]; ONR Young Investigator Award; Gigascale
Systems Research Center; Focus Center Research Program
(FCRP), a Semiconductor Research Corporation entity",
funding-text = "This research is partially supported by the National
Science Foundation under Awards \#: 1018236 and
1219001, an ONR Young Investigator Award, and the
Gigascale Systems Research Center, one of six research
centers funded under the Focus Center Research Program
(FCRP), a Semiconductor Research Corporation entity.
The authors thank John Demme and the anonymous
reviewers for their insightful comments.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Acceleration; accelerator architectures; Accelerator
architectures; accelerator architectures; accelerator
memory reuse; cache formation; Cache memory; cache
slice; cache storage; dark silicon era; general purpose
CPU; general-purpose architecture; GP-CPU; Memory
management; nonuniform cache architecture; NUCA
substrate; Power demand; Silicon; Transform coding",
keywords-plus = "CACHES",
number-of-cited-references = "18",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Cota:2014:AMR",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Chou:2014:EPE,
author = "Yu-Liang Chou and Shaoshan Liu and Eui-Young Chung and
Jean-Luc Gaudiot",
title = "An Energy and Performance Efficient {DVFS} Scheme for
Irregular Parallel Divide-and-Conquer Algorithms on the
{Intel SCC}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "13--16",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The divide-and-conquer paradigm can be used to express
many computationally significant problems, but an
important subset of these applications is inherently
load-imbalanced. Load balancing is a challenge for
irregular parallel divide-and-conquer algorithms and
efficiently solving these applications will be a key
requirement for future many-core systems. To address
the load imbalance issue, instead of attempting to
dynamically balance the workloads, this paper
proposes an energy and performance efficient Dynamic
Voltage and Frequency Scaling (DVFS) scheduling scheme,
which takes into account the load imbalance behavior
exhibited by these applications. More specifically, we
examine the core of the divide-and-conquer paradigm and
determine that the base-case-reached point where
recursion stops is a suitable place in a
divide-and-conquer paradigm to apply the proposed DVFS
scheme. To evaluate the proposed scheme, we implement
four representative irregular parallel
divide-and-conquer algorithms, tree traversal,
quicksort, finding primes, and n-queens puzzle, on the
Intel Single-chip Cloud Computer (SCC) many-core
machine. We demonstrate that, on average, the proposed
scheme can improve performance by 41\% while reducing
energy consumption by 36\% compared to the baseline
running the whole computation with the default
frequency configuration (400MHz).",
acknowledgement = ack-nhfb,
affiliation = "Chou, YL (Reprint Author), Univ Calif Irvine, Irvine,
CA 92697 USA. Chou, Yu-Liang; Gaudiot, Jean-Luc, Univ
Calif Irvine, Irvine, CA 92697 USA. Liu, Shaoshan,
Microsoft Corp, Redmond, WA 98052 USA. Chung,
Eui-Young, Yonsei Univ, Seoul 120749, South Korea.",
author-email = "d943010010@gmail.com shaoliu@microsoft.com
eychung@yonsei.ac.kr gaudiot@uci.edu",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "US National Science Foundation
[CCF-1065448]; National Research Foundation of Korea
(NRF) [2012S1A2A1A01031420]; Ministry of Education,
Science and Technology [2012-047670]; National Science
Council [NSC 101-2917-I-564-079]",
funding-text = "This work is partly supported by the US National
Science Foundation under Grant No. CCF-1065448, by the
National Research Foundation of Korea (NRF) under Grant
No. 2012S1A2A1A01031420, by the Ministry of Education,
Science and Technology under Grant No. 2012-047670, and
by the National Science Council under Grant No. NSC
101-2917-I-564-079. Any opinions, findings, and
conclusions expressed in this material are those of the
authors and do not necessarily reflect the views of
these sponsors.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "base-case-reached point; D Software/Software
Engineering; D.4 Operating Systems; D.4 Operating
Systems < D.4.7 Organization and Design; D.4.7.b
Distributed systems; D.4.7.f Parallel systems; D.4.8
Performance < D.4.8.a Measurements < Distributed
processing; divide and conquer methods;
Divide-and-conquer; DVFS; dynamic voltage and frequency
scaling; energy conservation; energy consumption
reduction; energy efficient DVFS scheme; finding
primes; frequency 400 MHz; Intel SCC; Intel single-chip
cloud computer; irregular parallel divide-and-conquer
algorithms; Load Imbalance; load imbalance behavior;
many-core machine; microprocessor chips;
multiprocessing systems; n-queens puzzle; Operating
systems; parallel algorithms; Parallel processing;
performance efficient DVFS scheme; Performance
evaluation; power aware computing; processor
scheduling; quicksort; recursion stops; resource
allocation; Software engineering; tree traversal",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Chou:2014:EPE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
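The entry above (Chou:2014:EPE) lowers the operating frequency once a divide-and-conquer worker reaches its base case, i.e. the point where recursion stops. The following minimal Python sketch illustrates only that placement of the DVFS decision on a recursive quicksort; the frequency values, the threshold, and the set_frequency stub are invented for illustration and are not the paper's SCC implementation.

HIGH_FREQ_MHZ = 800    # hypothetical "divide phase" frequency
LOW_FREQ_MHZ = 400     # hypothetical "base case reached" frequency
BASE_CASE_SIZE = 32    # recursion stops below this size

requests = []

def set_frequency(mhz):
    # Placeholder for a platform-specific DVFS call (e.g., a cpufreq or
    # SCC voltage/frequency controller write); here we only log the request.
    requests.append(mhz)

def quicksort(a):
    if len(a) <= BASE_CASE_SIZE:
        # The base-case-reached point: the divide phase is over for this
        # branch, so the sketch drops to the lower frequency before the
        # cheap leaf-level work.
        set_frequency(LOW_FREQ_MHZ)
        return sorted(a)
    pivot = a[len(a) // 2]
    left = [x for x in a if x < pivot]
    mid = [x for x in a if x == pivot]
    right = [x for x in a if x > pivot]
    return quicksort(left) + mid + quicksort(right)

if __name__ == "__main__":
    import random
    data = [random.randint(0, 1000) for _ in range(500)]
    set_frequency(HIGH_FREQ_MHZ)
    assert quicksort(data) == sorted(data)
    print("frequency requests issued:", len(requests))

The point of the sketch is only where the scaling decision sits (at the recursion leaves, where load imbalance shows up), not how a real scheduler would batch or rate-limit the frequency changes.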
@Article{Rotem:2014:BUI,
author = "Nadav Rotem and Yosi {Ben Asher}",
title = "Block Unification {IF}-conversion for High Performance
Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.28",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Graphics Processing Units accelerate data-parallel
graphic calculations using wide SIMD vector units.
Compiling programs to use the GPU's SIMD architectures
requires converting multiple control flow paths into a
single stream of instructions. IF-conversion is a
compiler transformation, which converts control
dependencies into data dependencies, and it is used by
vectorizing compilers to eliminate control flow and
enable efficient code generation. In this work we
enhance the IF-conversion transformation by using a
block unification method to improve the currently used
block flattening method. Our experimental results
demonstrate that our IF-conversion method is effective
in reducing the number of predicated instructions and
in boosting kernel execution speed.",
acknowledgement = ack-nhfb,
affiliation = "Rotem, N (Reprint Author), Univ Haifa, Dept Comp Sci,
IL-31999 Haifa, Israel. Rotem, Nadav; Ben Asher, Yosi,
Univ Haifa, Dept Comp Sci, IL-31999 Haifa, Israel.",
author-email = "rotemn@cs.haifa.ac.il yosi@cs.haifa.ac.il",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "block flattening method; block unification
IF-conversion; block unification method; code
generation; Code generation; compiler transformation;
Compilers; Computer architecture; data-parallel graphic
calculations; GPU SIMD architectures; Graphics
processing unit; graphics processing units; high
performance architectures; Kernel; Merging; multiple
control flow paths; parallel processing; Processors;
program compilers; Programming Languages; Registers;
Software/Software Engineering; vectorizing compilers;
Vectors; wide SIMD vector units",
number-of-cited-references = "15",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Rotem:2014:BUI",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
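Rotem:2014:BUI above concerns IF-conversion, which turns control dependencies into data dependencies so a SIMD machine can run one instruction stream under predicates. The Python sketch below shows the flattening idea in scalar and per-lane form: both sides of a branch are computed and a predicate selects the result. It is a generic illustration of IF-conversion, not the block-unification algorithm of the paper.

def branchy(p, a, b):
    # Original control flow: two paths, only one executes.
    if p:
        return a * 2 + 1
    else:
        return b - 3

def if_converted(p, a, b):
    # IF-converted form: both paths are computed as data and the predicate
    # selects between them (a select / conditional move, no divergent branch).
    t_then = a * 2 + 1
    t_else = b - 3
    return t_then if p else t_else

def if_converted_vector(preds, xs, ys):
    # Per-lane predication over a "warp" of values: every lane evaluates
    # both expressions; the mask picks the live result for each lane.
    return [(x * 2 + 1) if p else (y - 3) for p, x, y in zip(preds, xs, ys)]

if __name__ == "__main__":
    for p in (True, False):
        for a, b in ((4, 10), (-1, 7)):
            assert branchy(p, a, b) == if_converted(p, a, b)
    print(if_converted_vector([True, False, True], [1, 2, 3], [10, 20, 30]))

On a real GPU both sides of the select always execute, which is exactly why reducing the number of predicated instructions, the stated goal of block unification, matters for kernel speed.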
@Article{Ilic:2014:CAR,
author = "Aleksandar Ilic and Frederico Pratas and Leonel
Sousa",
title = "Cache-aware Roofline model: Upgrading the loft",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "21--24",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The Roofline model graphically represents the
attainable upper bound performance of a computer
architecture. This paper analyzes the original Roofline
model and proposes a novel approach to provide a more
insightful performance modeling of modern architectures
by introducing cache-awareness, thus significantly
improving the guidelines for application optimization.
The proposed model was experimentally verified for
different architectures by taking advantage of built-in
hardware counters with a curve fitness above 90\%.",
acknowledgement = ack-nhfb,
affiliation = "Ilic, A (Reprint Author), Univ Tecn Lisboa, INESC ID
IST, Lisbon, Portugal. Ilic, Aleksandar; Pratas,
Frederico; Sousa, Leonel, Univ Tecn Lisboa, INESC ID
IST, Lisbon, Portugal.",
author-email = "ilic@inesc-id.pt fcpp@inesc-id.pt las@inesc-id.pt",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "national funds through FCT (Fundacao para a
Ciencia e a Tecnologia) [PTDC/EEI-ELC/3152/2012,
PEst-OE/EEI/LA0021/2011, PTDC/EEA-ELC/117329/2010]; FCT
[SFRH/BPD/87734/2012]",
funding-text = "This work was supported by national funds through FCT
(Fundacao para a Ciencia e a Tecnologia), under
projects PTDC/EEI-ELC/3152/2012,
PEst-OE/EEI/LA0021/2011, and PTDC/EEA-ELC/117329/2010.
F. Pratas also acknowledges the FCT scholarship
SFRH/BPD/87734/2012.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application optimization; application optimization;
Application optimization; built-in hardware counters;
C.0.d Modeling of computer architecture < C.0 General <
C Computer Systems Organization; C.0.e System
architectures; C.4.d Modeling techniques < C.4
Performance of Systems < C Computer Systems
Organization; C.4.g Measurement; cache storage;
cache-aware Roofline model; cache-awareness; computer
architecture; computer architecture upper bound
performance; curve fitness; evaluation; integration and
modeling < C.0 General < C Computer Systems
Organization; Modeling; modeling; Multicore computer
architectures; Multiprocessing systems; multiprocessing
systems; Performance evaluation; Performance modeling;
Simulation; simulation of multiple-processor systems <
C.4 Performance of Systems < C Computer Syst",
number-of-cited-references = "10",
ORCID-numbers = "Ilic, Aleksandar/0000-0002-8594-3539 Sousa,
Leonel/0000-0002-8066-221X",
research-areas = "Computer Science",
researcherid-numbers = "Ilic, Aleksandar/L-1943-2014 Sousa,
Leonel/B-2749-2009",
times-cited = "24",
unique-id = "Ilic:2014:CAR",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
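Ilic:2014:CAR above extends the Roofline model with cache-awareness. For reference, the classic roof bounds attainable performance by the lesser of peak compute throughput and arithmetic intensity times memory bandwidth; a cache-aware variant, written here in our own notation as a sketch rather than the paper's exact formulation, draws one such roof per memory level using that level's bandwidth:

% Classic Roofline: attainable performance F_a as a function of arithmetic
% intensity I (flops per byte of DRAM traffic):
F_a(I) = \min\!\left( F_{\mathrm{peak}},\; I \cdot B_{\mathrm{DRAM}} \right)

% Cache-aware sketch: one roof per memory level L, with bandwidth B_L and
% intensity measured against the traffic the core issues toward that level:
F_{a,L}(I) = \min\!\left( F_{\mathrm{peak}},\; I \cdot B_L \right),
\qquad L \in \{\mathrm{L1}, \mathrm{L2}, \mathrm{L3}, \mathrm{DRAM}\}

The practical consequence, consistent with the abstract, is that an application can be bandwidth-bound at one level of the hierarchy while compute-bound at another, which sharpens the optimization guidance the model gives.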
@Article{Efraim:2014:EAR,
author = "Rotem Efraim and Ran Ginosar and C. Weiser and Avi
Mendelson",
title = "Energy Aware Race to Halt: a Down to {EARtH} Approach
for Platform Energy Management",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "25--28",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.32",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The EARtH algorithm finds the optimal voltage and
frequency operational point of the processor in order
to achieve minimum energy of the computing platform.
The algorithm is based on a theoretical model employing
a small number of parameters, which are extracted from
real systems using off-line and run-time methods. The
model and algorithm have been validated on real systems
using 45 nm, 32 nm, and 22 nm Intel Core processors.
The algorithm can save up to 44\% energy compared with
the commonly used fixed frequency policies.",
acknowledgement = ack-nhfb,
affiliation = "Efraim, R (Reprint Author), Intel Corp, Santa Clara,
CA 95051 USA. Efraim, Rotem, Intel Corp, Santa Clara,
CA 95051 USA. Ginosar, Ran; Weiser, C.; Mendelson, Avi,
Technion Israeli Inst Technol, Haifa, Israel.",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Algorithm design and analysis; B Hardware; B.9 Power
Management; B.9.2 Energy-aware systems; C Computer
Systems Organization; C.4 Performance of Systems; C.5
Computer System Implementation; C.5.4 VLSI Systems;
C.5.5 Servers; Computational modeling; Earth; EARtH
algorithm; energy aware race to halt; Energy
management; Energy measurement; fixed frequency
policies; Frequency measurement; frequency operational
point; Heterogeneous cores; Intel core processors;
microprocessor chips; off-line methods; optimal
voltage; platform energy management; power aware
computing; Power Management; run-time methods; size 22
nm; size 32 nm; size 45 nm; Voltage measurement",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "9",
unique-id = "Efraim:2014:EAR",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
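Efraim:2014:EAR above searches for the voltage/frequency point that minimizes total platform energy rather than core energy alone. The sketch below evaluates a toy model of that trade-off: running faster spends more dynamic energy per cycle but leaves less time for static and platform power to accumulate. The model form, the V(f) relation, and every constant are illustrative assumptions, not the parameters of the EARtH algorithm.

# Toy "race to halt" energy model (all constants invented for illustration).
# E(f) = [ P_dynamic(f) + P_static(V) + P_platform ] * T(f),
# with T(f) = cycles / f for a compute-bound task.

def voltage(f_ghz):
    # Assumed roughly linear V-f relation within the usable range.
    return 0.6 + 0.25 * f_ghz

def energy_joules(f_ghz, cycles=2e9, c_eff=1.0e-9, p_platform=1.5):
    v = voltage(f_ghz)
    p_dynamic = c_eff * v * v * (f_ghz * 1e9)   # ~ C * V^2 * f
    p_static = 0.4 * v                          # crude leakage proxy
    t = cycles / (f_ghz * 1e9)
    return (p_dynamic + p_static + p_platform) * t

if __name__ == "__main__":
    freqs = [0.8 + 0.1 * i for i in range(25)]   # 0.8 .. 3.2 GHz
    best = min(freqs, key=energy_joules)
    print(f"energy-optimal frequency (toy model): {best:.1f} GHz")

Even this toy model produces an interior optimum: neither the lowest nor the highest frequency minimizes energy, which is the basic observation behind choosing an operating point rather than always racing to halt.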
@Article{Cakmakci:2014:EVA,
author = "Yaman {\c{C}}akmak{\c{c}}i and O{\u{g}}uz Ergin",
title = "Exploiting Virtual Addressing for Increasing
Reliability",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "29--32",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "A novel method to protect a system against errors
resulting from soft errors occurring in the virtual
address (VA) storing structures such as translation
lookaside buffers (TLB), physical register file (PRF)
and the program counter (PC) is proposed in this paper.
The work is motivated by showing how soft errors impact
the structures that store virtual page numbers (VPN). A
solution is proposed by employing linear block encoding
methods to be used as a virtual addressing scheme at
link time. Using the encoding scheme to assign VPNs for
VAs, it is shown that the system can tolerate soft
errors using software with the help of the discussed
decoding techniques applied to the page fault handler.
The proposed solution can be used on all of the
architectures using virtually indexed addressing. The
main contribution of this paper is a reduction of the
AVF for the data TLB by 42.5\%, the instruction TLB by 40.3\%,
the PC by 69.2\%, and the PRF by 33.3\%.",
acknowledgement = ack-nhfb,
affiliation = "{\c{C}}akmak{\c{c}}i, Y (Reprint Author), TOBB Univ
Econ \& Technol, Dept Comp Engn, Ankara, Turkey.
{\c{C}}akmak{\c{c}}i, Yaman; Ergin, O{\u{g}}uz, TOBB
Univ Econ \& Technol, Dept Comp Engn, Ankara, Turkey.",
author-email = "ycakmakci@etu.edu.tr oergin@etu.edu.tr",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Scientific and Technological Research
Council of Turkey (TUBITAK) [112E004]",
funding-text = "This work was supported in part by the Scientific and
Technological Research Council of Turkey (TUBITAK)
under Grant 112E004. The work is in the framework of
COST ICT Action 1103 Manufacturable and Dependable
Multicore Architectures at Nanoscale.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "AVF; B Hardware; B.3 Memory Structures; B.3.2 Design
Styles; B.3.2.h Virtual memory; B.3.4 Reliability,
Testing and Fault-Tolerance; buffer storage; decoding
techniques; encoding; Fault tolerance; Hardware; linear
block encoding methods; Memory management; page fault
handler; PC; physical register file; PRF; program
counter; soft errors; TLB; translation lookaside
buffers; virtual address storing structures; virtual
addressing; virtual addressing scheme; Virtual memory;
virtual page numbers; virtually indexed addressing;
VPN",
keywords-plus = "SOFT ERRORS",
number-of-cited-references = "10",
ORCID-numbers = "Ergin, O{\u{g}}uz/0000-0003-2701-3787",
research-areas = "Computer Science",
researcherid-numbers = "Ergin, O{\u{g}}uz/E-5717-2010",
times-cited = "1",
unique-id = "Cakmakci:2014:EVA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
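Cakmakci:2014:EVA above assigns virtual page numbers from a linear block code at link time so the page fault handler can recognize VPNs corrupted by soft errors. As a minimal illustration of the principle only, the sketch below uses the simplest linear code, a single even-parity bit: valid VPNs are exactly the even-parity words, so any single bit flip yields an invalid VPN that software can detect. The paper's actual encoding and decoding are more capable than this toy.

# Single-parity-check code over VPN bits: a toy stand-in for the
# link-time linear block encoding described in the paper.

def parity(x):
    p = 0
    while x:
        p ^= x & 1
        x >>= 1
    return p

def encode_vpn(index, data_bits=19):
    # Map the index-th page to a codeword VPN: data bits plus one parity bit,
    # chosen so every valid VPN has even parity.
    assert 0 <= index < (1 << data_bits)
    return (index << 1) | parity(index)

def vpn_is_valid(vpn):
    # The page fault handler's check: a single bit flip breaks even parity.
    return parity(vpn) == 0

if __name__ == "__main__":
    vpn = encode_vpn(0b1011001)
    assert vpn_is_valid(vpn)
    flipped = vpn ^ (1 << 7)           # soft error: one bit flip
    assert not vpn_is_valid(flipped)   # detected in software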
@Article{Zhu:2014:EWC,
author = "Yuhao Zhu and Aditya Srikanth and Jingwen Leng and
Vijay Janapa Reddi",
title = "Exploiting Webpage Characteristics for
Energy-Efficient Mobile {Web} Browsing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "33--36",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.33",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Web browsing on mobile devices is undoubtedly the
future. However, with the increasing complexity of
webpages, the mobile device's computation capability
and energy consumption become major pitfalls for a
satisfactory user experience. In this paper, we propose
a mechanism to effectively leverage processor frequency
scaling in order to balance the performance and energy
consumption of mobile web browsing. This mechanism
explores the performance and energy tradeoff in webpage
loading, and schedules webpage loading according to the
webpages' characteristics, using the different
frequencies. The proposed solution achieves 20.3\%
energy saving compared to the performance mode, and
improves webpage loading performance by 37.1\% compared
to the battery saving mode.",
acknowledgement = ack-nhfb,
affiliation = "Zhu, YH (Reprint Author), Univ Texas Austin, Dept
Elect \& Comp Engn, Austin, TX 78712 USA. Zhu, Yuhao;
Srikanth, Aditya; Leng, Jingwen; Reddi, Vijay Janapa,
Univ Texas Austin, Dept Elect \& Comp Engn, Austin, TX
78712 USA.",
author-email = "yzhu@utexas.edu aditya.srik@utexas.edu
jingwen@utexas.edu vj@ece.utexas.edu",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "C Computer Systems Organization; C.2
Communication/Networking and Information Technology;
C.2.8 Mobile Computing; Cascading style sheets; Cutoff;
EDP; Energy; energy conservation; energy consumption;
Energy consumption; energy-efficient mobile Web
browsing; HTML; Internet; Load modeling; Loading;
Market research; Mobile communication; mobile
computing; mobile device computation capability;
Performance; power aware computing; processor frequency
scaling; user experience; Web page characteristics; Web
page loading performance; Webpages",
number-of-cited-references = "6",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Zhu:2014:EWC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Morad:2014:GMO,
author = "Amir Morad and Tomer Y. Morad and Leonid Yavits and
Ran Ginosar and Uri Weiser",
title = "Generalized {MultiAmdahl}: Optimization of
Heterogeneous Multi-Accelerator {SoC}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "37--40",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.34",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Consider a workload comprising a consecutive sequence
of program execution segments, where each segment can
either be executed on general purpose processor or
offloaded to a hardware accelerator. An analytical
optimization framework based on MultiAmdahl framework
and Lagrange multipliers, for selecting the optimal set
of accelerators and for allocating resources among them
under constrained area is proposed. Due to the
practical implementation of accelerators, the optimal
architecture under area constraints may exclude some of
the accelerators. As the fraction of the workload that
can be accelerated decreases, resources (e.g. area) may
shift from accelerators into the general purpose
processor. The framework can be extended in a number of
ways, spanning from SoC partitioning to bandwidth,
power distribution, energy, and other constrained
resources.",
acknowledgement = ack-nhfb,
affiliation = "Morad, A (Reprint Author), Technion Israel Inst
Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
Morad, Amir; Morad, Tomer Y.; Yavits, Leonid; Ginosar,
Ran; Weiser, Uri, Technion Israel Inst Technol, Dept
Elect Engn, IL-32000 Haifa, Israel.",
author-email = "amirm@tx.technion.ac.il tomerm@tx.technion.ac.il
yavits@tx.technion.ac.il ran@ee.technion.ac.il
uri.weiser@ee.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Acceleration; analytical optimization framework; Chip
Multiprocessors; general purpose processor; generalized
MultiAmdahl framework; Hardware; hardware accelerator;
heterogeneous multiaccelerator SoC partitioning;
Lagrange multiplier; Mathematical model; Modeling of
computer architecture; MultiAmdahl; Multicore
processing; optimisation; Optimization; power
distribution bandwidth; program execution segment;
resource allocation; Resource management;
System-on-a-chip; system-on-chip",
number-of-cited-references = "8",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Morad:2014:GMO",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
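Morad:2014:GMO above allocates die area among accelerators with Lagrange multipliers. A generic statement of that kind of constrained allocation, in our notation and not necessarily the paper's: let f_i be the workload fraction handled by unit i and t_i(a_i) the time that fraction takes when the unit is given area a_i; minimizing total time under a fixed area budget A gives the usual equal-marginal-benefit condition.

\min_{a_0,\dots,a_n} \; T(a_0,\dots,a_n) \;=\; \sum_{i=0}^{n} f_i\, t_i(a_i)
\quad \text{subject to} \quad \sum_{i=0}^{n} a_i \;=\; A .

\text{Lagrangian: } \; \mathcal{L} \;=\; \sum_i f_i\, t_i(a_i) \;+\; \lambda \Bigl( \sum_i a_i - A \Bigr),
\qquad
\frac{\partial \mathcal{L}}{\partial a_i} = 0
\;\Longrightarrow\;
f_i\, t_i'(a_i) \;=\; -\lambda \;\; \text{for every unit kept in the design.}

Units whose marginal benefit cannot reach the common multiplier at any positive area are dropped, which matches the abstract's observation that the area-constrained optimum may exclude some accelerators and shift area back to the general purpose processor.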
@Article{Kvatinsky:2014:MBM,
author = "Shahar Kvatinsky and Yuval H. Nacson and Yoav Etsion
and Eby G. Friedman and Avinoam Kolodny and Uri C.
Weiser",
title = "Memristor-Based Multithreading",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "41--44",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Switch on Event Multithreading (SoE MT, also known as
coarse-grained MT and block MT) processors run multiple
threads on a pipeline machine, while the pipeline
switches threads on stall events (e.g., cache miss).
The thread switch penalty is determined by the number
of stages in the pipeline that are flushed of in-flight
instructions. In this paper, Continuous Flow
Multithreading (CFMT), a new architecture of SoE MT, is
introduced. In CFMT, a multistate pipeline register
(MPR) holds the microarchitectural state of multiple
different threads within the execution pipeline stages,
where only one thread is active at a time. The MPRs
eliminate the need to flush in-flight instructions and
therefore significantly improve performance. In recent
years, novel memory technologies such as Resistive RAM
(RRAM) and Spin Torque Transfer Magnetoresistive RAM
(STT-MRAM) have been developed. All of these
technologies are nonvolatile, store data as resistance,
and can be described as ``memristors''. Memristors are
power efficient, dense, and fast as compared to
standard memory technologies such as SRAM, DRAM, and
Flash. Memristors therefore provide the opportunity to
place the MPRs physically within the pipeline stages. A
performance analysis compares CFMT to conventional SoE
MT processors, demonstrating up to a 2X performance
improvement, while the memristor-based operational
mechanism has lower power and complexity than that of
conventional SoE MT processors.",
acknowledgement = ack-nhfb,
affiliation = "Kvatinsky, S (Reprint Author), Technion Israel Inst
Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
Kvatinsky, Shahar; Etsion, Yoav; Kolodny, Avinoam;
Weiser, Uri C., Technion Israel Inst Technol, Dept
Elect Engn, IL-32000 Haifa, Israel. Etsion, Yoav,
Technion Israel Inst Technol, Dept Comp Sci, IL-32000
Haifa, Israel. Friedman, Eby G., Univ Rochester, Dept
Elect \& Comp Engn, Rochester, NY 14627 USA.",
author-email = "skva@tx.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Hasso Plattner Institute",
funding-text = "This work was supported by the Hasso Plattner
Institute. The authors thank Ravi Patel for his
comments and area overhead estimation and to Nimrod
Wald and Guy Satat for their help in evaluating the
architecture.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "B Hardware; B.3 Memory Structures; B.7 Integrated
Circuits; B.7.1 Types and Design Styles; B.7.1.e Memory
technologies; C Computer Systems Organization; C.0
General; C.0.a Emerging technologies; C.0.d Modeling of
computer architecture; CFMT; Computer architecture;
continuous flow multithreading; in-flight instructions;
Integrated circuits; Memory management; memristor;
memristor-based multithreading; memristors; MPR;
multi-threading; multistate pipeline register;
multithreaded processors; Multithreading; novel memory
technologies; phase change memory; random-access
storage; resistive RAM; RRAM; SoE MT
processors; spin torque transfer magnetoresistive RAM;
STT-MRAM; switch on event multithreading
processors; Systems design and analysis",
keywords-plus = "RESISTIVE SWITCHING MEMORIES",
number-of-cited-references = "21",
research-areas = "Computer Science",
times-cited = "10",
unique-id = "Kvatinsky:2014:MBM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Wingbermuehle:2014:OAS,
author = "Joseph G. Wingbermuehle and Ron K. Cytron and Roger D.
Chamberlain",
title = "Optimization of Application-Specific Memories",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "45--48",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Memory access times are the primary bottleneck for
many applications today. This ``memory wall'' is due to
the performance disparity between processor cores and
main memory. To address the performance gap, we propose
the use of custom memory subsystems tailored to the
application rather than attempting to optimize the
application for a fixed memory subsystem. Custom
subsystems can take advantage of application-specific
properties as well as memory-specific properties to
improve access times or write-backs given constraints
on size or power.",
acknowledgement = ack-nhfb,
affiliation = "Wingbermuehle, JG (Reprint Author), Washington Univ,
Dept Comp Sci \& Engn, St Louis, MO 63130 USA.
Wingbermuehle, Joseph G.; Cytron, Ron K.; Chamberlain,
Roger D., Washington Univ, Dept Comp Sci \& Engn, St
Louis, MO 63130 USA.",
author-email = "wingbej@wustl.edu cytron@wustl.edu roger@wustl.edu",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [CNS-09095368,
CNS-0931693]",
funding-text = "This work is supported by the National Science
Foundation under grants CNS-09095368 and CNS-0931693.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "access time improvement; application-specific memory
optimization; B Hardware; B.3 Memory Structures; B.3.2
Design Styles; B.3.3 Performance Analysis and Design
Aids; B.3.3.b Simulation; C Computer Systems
Organization; C.1 Processor Architectures; C.1.5
Micro-architecture implementation considerations;
C.1.5.e Memory hierarchy; cache; cache storage;
Computer architecture; custom memory subsystems; fixed
memory subsystem; Hardware; memory access times; Memory
management; memory wall; memory-specific properties;
Multiprocessing systems; performance disparity;
Performance evaluation; performance gap; processor
cores; write-backs given constraints",
number-of-cited-references = "21",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Wingbermuehle:2014:OAS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Xu:2014:STM,
author = "Yunlong Xu and Rui Wang and Nilanjan Goswami and Tao
Li and Depei Qian",
title = "Software Transactional Memory for {GPU}
Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "49--52",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "To make applications with dynamic data sharing among
threads benefit from GPU acceleration, we propose a
novel software transactional memory system for GPU
architectures (GPU-STM). The major challenges include
ensuring good scalability with respect to the massively
multithreading of GPUs, and preventing livelocks caused
by the SIMT execution paradigm of GPUs. To this end, we
propose (1) a hierarchical validation technique and (2)
an encounter-time lock-sorting mechanism to deal with
the two challenges, respectively. Evaluation shows that
GPU-STM outperforms coarse-grain locks on GPUs by up to
20x.",
acknowledgement = ack-nhfb,
affiliation = "Xu, YL (Reprint Author), Xi An Jiao Tong Univ, Sch
Elect \& Informat Engn, Xian 710049, Peoples R China.
Xu, Yunlong; Qian, Depei, Xi An Jiao Tong Univ, Sch
Elect \& Informat Engn, Xian 710049, Peoples R China.
Wang, Rui; Qian, Depei, Beihang Univ, Sch Engn \& Comp
Sci, Beijing, Peoples R China. Goswami, Nilanjan; Li,
Tao, Univ Florida, ECE Dept, Gainesville, FL USA.",
author-email = "xjtu.ylxu@stu.xjtu.edu.cn rui.wang@jsi.buaa.edu.cn
nil@ufl.edu taoli@ece.ufl.edu depeiq@xjtu.edu.cn",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF of China [61133004, 61128004,
61073011]; 863 Program of China [2012AA010902]",
funding-text = "This work is supported by NSF of China under grant
61133004, 61128004 and 61073011, and 863 Program of
China under grant 2012AA010902.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "dynamic data sharing; encounter-time lock-sorting
mechanism; GPU acceleration; GPU architectures;
GPU-STM; graphics processing units; hierarchical
validation technique; multi-threading; Multicore
processing; multicore processor; Multicore Processors;
multiprocessing systems; Multiprocessing systems;
multithreading; parallel architectures; Parallel
processing; Parallel Programming; Run-time Environments; Runtime
environment; SIMD processor; SIMD Processors; SIMT
execution paradigm; software transactional memory
system; sorting",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Xu:2014:STM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
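Xu:2014:STM above avoids livelock under SIMT execution partly by sorting the locks a transaction has encountered before acquiring them, so every thread acquires in one global order. The Python sketch below shows only that ordering discipline, using lock ids as the canonical order on ordinary CPU threads; it illustrates sorted acquisition, not the GPU-STM design itself.

import threading

# One lock per shared word; acquiring in ascending id order means no two
# transactions can ever wait on each other in a cycle.
LOCKS = {i: threading.Lock() for i in range(16)}
ACCOUNTS = {i: 100 for i in range(16)}

def transfer(src, dst, amount):
    write_set = {src, dst}            # "encounter-time" write set of this toy transaction
    ordered = sorted(write_set)       # canonical global order
    for i in ordered:
        LOCKS[i].acquire()
    try:
        ACCOUNTS[src] -= amount
        ACCOUNTS[dst] += amount
    finally:
        for i in reversed(ordered):
            LOCKS[i].release()

if __name__ == "__main__":
    threads = [threading.Thread(target=transfer, args=(i % 16, (i + 3) % 16, 1))
               for i in range(64)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    assert sum(ACCOUNTS.values()) == 16 * 100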
@Article{Shim:2014:TMP,
author = "Keun Sup Shim and Mieszko Lis and Omer Khan and
Srinivas Devadas",
title = "Thread Migration Prediction for Distributed Shared
Caches",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "53--56",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.30",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Chip-multiprocessors (CMPs) have become the mainstream
parallel architecture in recent years; for scalability
reasons, designs with high core counts tend towards
tiled CMPs with physically distributed shared caches.
This naturally leads to a Non-Uniform Cache Access
(NUCA) design, where on-chip access latencies depend on
the physical distances between requesting cores and
home cores where the data is cached. Improving data
locality is thus key to performance, and several
studies have addressed this problem using data
replication and data migration. In this paper, we
consider another mechanism, hardware-level thread
migration. This approach, we argue, can better exploit
shared data locality for NUCA designs by effectively
replacing multiple round-trip remote cache accesses
with a smaller number of migrations. High migration
costs, however, make it crucial to use thread
migrations judiciously; we therefore propose a novel,
on-line prediction scheme which decides whether to
perform a remote access (as in traditional NUCA
designs) or to perform a thread migration at the
instruction level. For a set of parallel benchmarks,
our thread migration predictor improves the performance
by 24\% on average over the shared-NUCA design that
only uses remote accesses.",
acknowledgement = ack-nhfb,
affiliation = "Shim, KS (Reprint Author), MIT, 77 Massachusetts Ave,
Cambridge, MA 02139 USA. Shim, Keun Sup; Lis, Mieszko;
Devadas, Srinivas, MIT, Cambridge, MA 02139 USA. Khan,
Omer, Univ Connecticut, Storrs, CT USA.",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "B Hardware; B.3 Memory Structures; B.3.2 Design
Styles; B.3.2.g Shared memory; Benchmark testing; C
Computer Systems Organization; C.1 Processor
Architectures; C.1.4 Parallel Architectures; Cache
Coherence; cache storage; chip-multiprocessors; CMPs;
Coherence; Computer architecture; Context; core counts;
Data Locality; data locality improvement; data
migration; data replication; Distributed Caches;
hardware-level thread migration prediction; home cores;
Instruction sets; integrated circuit design; mainstream
parallel architecture; microprocessor chips;
multiprocessing systems; nonuniform cache access
design; on-chip access latencies; online prediction
scheme; Parallel Architecture; parallel architectures;
physical distributed shared caches; Protocols;
Registers; requesting cores; shared-NUCA design",
number-of-cited-references = "13",
oa = "Green Published",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Shim:2014:TMP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
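Shim:2014:TMP above decides, per memory instruction, between a remote cache access and a hardware thread migration to the data's home core, guided by an on-line predictor of how long a run of accesses will stay at one remote home. The sketch below is a deliberately simplified per-PC predictor with an invented threshold and no cost model; it shows only the shape of such a decision, not the paper's mechanism.

from collections import defaultdict

MIGRATE_THRESHOLD = 3   # invented: migrate if we expect a run of >= 3 remote accesses

class MigrationPredictor:
    def __init__(self):
        # For each instruction address, remember the length of the last
        # observed run of consecutive accesses to one remote home core.
        self.last_run = defaultdict(int)

    def decide(self, pc):
        return "migrate" if self.last_run[pc] >= MIGRATE_THRESHOLD else "remote_access"

    def update(self, pc, observed_run):
        self.last_run[pc] = observed_run

if __name__ == "__main__":
    pred = MigrationPredictor()
    pc = 0x400123
    print(pred.decide(pc))            # cold: fall back to a remote access
    pred.update(pc, observed_run=5)   # the access began a long run at one home core
    print(pred.decide(pc))            # now predicted worth migrating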
@Article{Anonymous:2014:TCa,
author = "Anonymous",
title = "Table of Contents",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "C1--C4",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2360655",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2014:ITPa,
author = "Anonymous",
title = "{{\booktitle{IEEE Transactions on Pattern Analysis and
Machine Intelligence}} Editorial Board}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "C2--C2",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2360656",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2014:ITPb,
author = "Anonymous",
title = "{{\booktitle{IEEE Transactions on Pattern Analysis and
Machine Intelligence}}} Information for Authors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "C3--C3",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2360657",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2014:ICSa,
author = "Anonymous",
title = "{IEEE Computer Society}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "C4--C4",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2360658",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Lavasani:2014:FBL,
author = "Maysam Lavasani and Hari Angepat and Derek Chiou",
title = "An {FPGA}-based In-Line Accelerator for {Memcached}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "57--60",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.17",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We present a method for accelerating server
applications using a hybrid CPU+FPGA architecture and
demonstrate its advantages by accelerating Memcached, a
distributed key-value system. The accelerator,
implemented on the FPGA fabric, processes request
packets directly from the network, avoiding the CPU in
most cases. The accelerator is created by profiling the
application to determine the most commonly executed
trace of basic blocks which are then extracted. Traces
are executed speculatively within the FPGA. If the
control flow exits the trace prematurely, the side
effects of the computation are rolled back and the
request packet is passed to the CPU. When compared to
the best reported software numbers, the Memcached
accelerator is 9.15x more energy efficient for common
case requests.",
acknowledgement = ack-nhfb,
affiliation = "Lavasani, M (Reprint Author), Univ Texas Austin, Dept
Elect \& Comp Engn, Austin, TX 78712 USA. Lavasani,
Maysam; Angepat, Hari; Chiou, Derek, Univ Texas Austin,
Dept Elect \& Comp Engn, Austin, TX 78712 USA.",
author-email = "maysamlavasani@utexas.edu hangepat@utexas.edu
derek@utexas.edu",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "accelerating server; C.1.3.f Heterogeneous (hybrid)
systems; C.2.4.a Client/server; cache storage;
Client-server systems; Computer architecture; control
flow; distributed key-value system; distributed
processing; field programmable gate arrays; Field
programmable gate arrays; FPGA-based in-line
accelerator; hybrid CPU+FPGA architecture; Hybrid
systems; Memcached accelerator; Program processors;
reconfigurable architectures; request packet; rolled
back; software numbers",
number-of-cited-references = "17",
research-areas = "Computer Science",
times-cited = "24",
unique-id = "Lavasani:2014:FBL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
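Lavasani:2014:FBL above runs the hot trace of Memcached speculatively on the FPGA and rolls back to the CPU path when control flow leaves the trace. The Python sketch below mimics only that structure: a fast-path handler that buffers its side effects and commits them only if the request stays on the common-case trace, otherwise discarding them and handing the request to a slow path. The request format and handler names are invented for illustration.

STORE = {}

class TraceExit(Exception):
    """Control flow left the common-case trace; fall back to the CPU path."""

def fast_path(request):
    buffered = {}                      # speculative side effects
    op, key, *rest = request
    if op == "get":
        return STORE.get(key)
    elif op == "set" and len(rest) == 1:
        buffered[key] = rest[0]
    else:
        raise TraceExit()              # uncommon request shape: roll back
    STORE.update(buffered)             # commit speculative side effects
    return "STORED"

def slow_path(request):
    # Stand-in for the full software Memcached handler on the CPU.
    op, key, *rest = request
    if op == "delete":
        return "DELETED" if STORE.pop(key, None) is not None else "NOT_FOUND"
    return "ERROR"

def handle(request):
    try:
        return fast_path(request)
    except TraceExit:
        return slow_path(request)

if __name__ == "__main__":
    print(handle(("set", "k", "v")))   # common case, stays on the fast path
    print(handle(("get", "k")))        # common case
    print(handle(("delete", "k")))     # trace exit, handled by the slow path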
@Article{Song:2014:AFB,
author = "Xiang Song and Jian Yang and Haibo Chen",
title = "Architecting Flash-based Solid-State Drive for
High-performance {I/O} Virtualization",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "61--64",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.22",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "Flash-based solid-state drive (SSD) is now being
widely deployed in cloud computing platforms due to the
potential advantages of better performance and less
energy consumption. However, current virtualization
architecture lacks support for high-performance I/O
virtualization over persistent storage, which results
in sub-optimal I/O performance for guest virtual
machines (VMs) on SSD. Further, current software-based
I/O virtualization violates the ``don't hide power''
principle due to inefficient support for some advanced
SSD commands (e.g., TRIM) and constrained parallelism,
leading to sub-optimal performance and life cycle. This
paper observes that the massive internal parallelism
and the block emulation in the flash translation layer
(FTL) make flash-based SSD an ideal candidate to
support high-performance I/O virtualization for
persistent storage. Based on this observation, we
propose VFlash, the first storage I/O virtualization
architecture that extends existing SSDs with trivial
hardware changes to directly expose multiple virtual
SSDs to guest VMs. Performance evaluation using a
modified FlashSim with two FTL schemes (i.e., DFTL and
FAST) shows that VFlash incurs only small performance
overhead over native SSDs and can efficiently exploit
parallelism.",
acknowledgement = ack-nhfb,
affiliation = "Chen, HB (Reprint Author), Shanghai Jiao Tong Univ,
Sch Software, Inst Parallel \& Distributed Syst,
Shanghai 200030, Peoples R China. Song, Xiang; Yang,
Jian; Chen, Haibo, Shanghai Jiao Tong Univ, Sch
Software, Inst Parallel \& Distributed Syst, Shanghai
200030, Peoples R China.",
author-email = "haibochen@sjtu.edu.cn",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "China National Natural Science Foundation
[61003002]; Intel",
funding-text = "This work was supported by China National Natural
Science Foundation under grant numbered 61003002 and a
grant from Intel.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "B.4.4 Performance Analysis and Design Aids; C.4.g
Measurement; cloud computing; Cloud computing; cloud
computing platforms; Computer architecture; energy
consumption; evaluation; flash memories; flash-based
solid-state drive; high performance I/O virtualization
architecture; I/O virtualization; modeling;
Multiprocessing systems; Parallel processing;
Performance evaluation; performance evaluation; Random
access memory; simulation of multiple-processor
systems; software-based I/O virtualization; Solid state
circuits; Solid State Drive; SSD commands; virtual
machines; virtualisation; VM",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Song:2014:AFB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Wu:2014:ATE,
author = "Carole-Jean Wu",
title = "Architectural Thermal Energy Harvesting Opportunities
for Sustainable Computing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "65--68",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.16",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Increased power dissipation in computing devices has
led to a sharp rise in thermal hotspots, creating
thermal runaway. To reduce the additional power
requirement caused by increased temperature, current
approaches apply cooling mechanisms to remove heat or
apply management techniques to avoid thermal
emergencies by slowing down heat generation. This paper
proposes to tackle the heat management problem of
computing platforms with a fundamentally new approach:
instead of heat removal using cooling mechanisms and
heat avoidance using dynamic thermal/power management
techniques, this work investigates the mechanisms to
recover wasted heat into reusable energy for
sustainable computing. Through recent advancements in
thermoelectric materials, we allow wasted heat energy
generated by computing devices to be recovered,
transformed, and harvested as electricity that can be
directly used within the system. We demonstrate a
real-system setup where we recover 0.3 to 1 watt of
power with the CPU running at 70 to 105 degrees C,
using a COTS thermoelectric device on top of the CPU.
Through this research, we hope to motivate more
in-depth efforts to explore heat energy harvesting
opportunities on computing devices and inspire
plausible solutions to overcome the technical
challenges discussed in this paper.",
acknowledgement = ack-nhfb,
affiliation = "Wu, CJ (Reprint Author), Arizona State Univ, Sch Comp,
Dept Comp Sci Engn, Tempe, AZ 85281 USA. Arizona State
Univ, Sch Comp, Dept Comp Sci Engn, Tempe, AZ 85281
USA.",
author-email = "carole-jean.wu@asu.edu",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architectural thermal energy harvesting; cooling;
Cooling; cooling mechanisms; dynamic thermal-power
management technique; Energy conservation; energy
harvesting; Energy-aware systems; heat generation; heat
management problem; power dissipation; Power
distribution; power engineering computing; Resistance
heating; sustainable computing; Temperature
measurement; Temperature-aware design; thermal energy
storage; thermal runaway; Waste heat",
number-of-cited-references = "6",
research-areas = "Computer Science",
times-cited = "5",
unique-id = "Wu:2014:ATE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Yavits:2014:CHO,
author = "Leonid Yavits and Amir Morad and Ran Ginosar",
title = "Cache Hierarchy Optimization",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "69--72",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.18",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Power consumption, off-chip memory bandwidth, chip
area and Network on Chip (NoC) capacity are among the main
chip resources limiting the scalability of Chip
Multiprocessors (CMP). A closed form analytical
solution for optimizing the CMP cache hierarchy and
optimally allocating area among hierarchy levels under
such constrained resources is developed. The
optimization framework is extended by incorporating the
impact of data sharing on cache miss rate. An
analytical model for cache access time as a function of
cache size is proposed and verified using CACTI
simulation.",
acknowledgement = ack-nhfb,
affiliation = "Yavits, L (Reprint Author), Technion Israel Inst
Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
Yavits, Leonid; Morad, Amir; Ginosar, Ran, Technion
Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa,
Israel.",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "ICRI-CI; Hasso-Plattner-Institut",
funding-text = "We thank Prof. Uri Weiser and Yaniv Ben Itzhak for
their review and remarks. This research was partially
funded by the ICRI-CI and Hasso-Plattner-Institut.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Analytical models; Analytical Performance Models;
Bandwidth; Cache Hierarchy; cache hierarchy
optimization; cache storage; CACTI simulation; chip
area; Chip Multiprocessor; chip multiprocessors; CMP;
Computational modeling; data sharing; Integrated
circuit modeling; Multiprocessing systems; network on
chip; network-on-chip; NoC; off-chip memory bandwidth;
optimisation; Optimization; power consumption; Resource
Allocation Optimization; Resource Allocation
Optimizations; Resource management",
number-of-cited-references = "17",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Yavits:2014:CHO",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
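Yavits:2014:CHO above derives a closed-form area allocation across cache levels. A generic version of the kind of objective involved, under our own assumptions of a power-law miss rate in cache size and an access time growing roughly with the square root of size (common approximations, not taken from the paper), is the average memory access time of a two-level hierarchy under an area budget:

\text{AMAT}(S_1, S_2) \;=\; t_1(S_1) \;+\; m_1(S_1)\,\bigl[\, t_2(S_2) \;+\; m_2(S_2)\, t_{\mathrm{mem}} \,\bigr],
\qquad
m_i(S_i) = c_i\, S_i^{-\alpha_i}, \quad t_i(S_i) = d_i \sqrt{S_i},

\text{minimized subject to} \quad \beta_1 S_1 + \beta_2 S_2 \;\le\; A_{\mathrm{cache}} .

With differentiable models like these, the area split between levels follows from setting the marginal AMAT reduction per unit area equal across levels, which is the style of closed-form solution the abstract describes; the paper additionally folds data sharing into the miss-rate term.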
@Article{Yazdanshenas:2014:CLL,
author = "Sadegh Yazdanshenas and Marzieh Ranjbar Pirbasti and
Mahdi Fazeli and Ahmad Patooghy",
title = "Coding Last Level {STT-RAM} Cache For High Endurance
And Low Power",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "73--76",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "STT-RAM technology has recently emerged as one of the
most promising memory technologies. However, its major
problems, limited write endurance and high write
energy, are still preventing it from being used as a
drop-in replacement of SRAM cache. In this paper, we
propose a novel coding scheme for STT-RAM last level
cache based on the concept of value locality. We reduce
switching probability in cache by swapping common
patterns with limited weight codes (LWC) to make writes
less often as well as more uniform. We also define some
policies for swapping these patterns. Our evaluation
shows that bit write variance in memory cells can be
reduced by about 20\% on average resulting in a more
uniform wear-out directly enhancing lifetime and
improving cell reliability. In addition, writes in
cache lines can be reduced by about 12\% compared to
one of the most effective circuit level techniques
known as early write termination (EWT) [12]. Our method
increases memory hierarchy access time by about 0.08\%
on average, which is negligible. We have shown that our
method does not adversely affect the last level cache
energy-delay$^{2}$ product. The non-uniformity caused by the
coding scheme can be used for another coding scheme at
main memory or L1 cache depending on their
technologies.",
acknowledgement = ack-nhfb,
affiliation = "Yazdanshenas, S (Reprint Author), Iran Univ Sci \&
Technol, Sch Comp Engn, Tehran, Iran. Yazdanshenas,
Sadegh; Pirbasti, Marzieh Ranjbar; Fazeli, Mahdi;
Patooghy, Ahmad, Iran Univ Sci \& Technol, Sch Comp
Engn, Tehran, Iran.",
author-email = "sadegh\_yazdanshenas@comp.iust.ac.ir
m\_ranjbar@comp.iust.ac.ir m\_fazeli@iust.ac.ir
patooghy@iust.ac.ir",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "B Hardware; B.3 Memory Structures; bit write variance;
C Computer Systems Organization; C.1 Processor
Architectures; cache; cache storage; cell reliability;
circuit level technique; coding scheme; Computer
architecture; early write termination; Encoding;
limited weight code; limited weight codes; memory
endurance; memory technology; nonvolatile memory;
Nonvolatile memory; probability; Random access memory;
random-access storage; STT-RAM; STT-RAM cache;
switching probability; Three-dimensional displays;
write energy; write hotspot",
keywords-plus = "MEMORY; CIRCUIT; ENERGY; MRAM",
number-of-cited-references = "13",
ORCID-numbers = "Fazeli, Mahdi/0000-0002-2874-6256 Patooghy,
Ahmad/0000-0003-2647-2797",
research-areas = "Computer Science",
researcherid-numbers = "Fazeli/S-9574-2018",
times-cited = "14",
unique-id = "Yazdanshenas:2014:CLL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
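Yazdanshenas:2014:CLL above swaps frequently written data patterns for limited weight codes so STT-RAM cells switch less often and more uniformly. The sketch below illustrates only the substitution step: common patterns are replaced by low-Hamming-weight codewords plus a flag bit, and the bit flips relative to the previously stored word are counted. The pattern table and code width are invented; the paper's policies for selecting and managing patterns are not reproduced here.

# Toy limited-weight-code substitution for a 16-bit word (illustrative only).
COMMON_PATTERNS = {0x0000: 0b00, 0xFFFF: 0b01, 0x00FF: 0b10, 0xFF00: 0b11}

def encode(word):
    # Returns (coded_flag, payload): common patterns become 2-bit LWCs.
    if word in COMMON_PATTERNS:
        return (1, COMMON_PATTERNS[word])
    return (0, word)

def hamming(a, b):
    return bin(a ^ b).count("1")

def write_cost(old_stored, new_word):
    # Bit switches needed to overwrite the previously stored (flag, payload).
    old_flag, old_payload = old_stored
    new_flag, new_payload = encode(new_word)
    return hamming(old_flag, new_flag) + hamming(old_payload, new_payload)

if __name__ == "__main__":
    stored = encode(0x0000)                       # an all-zeros line was stored coded
    print("coded write cost:", write_cost(stored, 0xFFFF))   # flips 1 bit
    print("raw write cost:  ", hamming(0x0000, 0xFFFF))      # would flip 16 bits

Because writes to common patterns touch only the few bits of the codeword and the flag, the switching activity becomes both smaller and more evenly spread, which is the wear-leveling effect the abstract reports.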
@Article{Martinsen:2014:HTL,
author = "Jan Kasper Martinsen and Hakan Grahn and Anders
Isberg",
title = "Heuristics for Thread-Level Speculation in {Web}
Applications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "77--80",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.26",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/java2010.bib",
abstract = "JavaScript is a sequential programming language, and
Thread-Level Speculation has been proposed to
dynamically extract parallelism in order to take
advantage of parallel hardware. In previous work, we
have shown significant speed-ups with a simple on/off
speculation heuristic. In this paper, we propose and
evaluate three heuristics for dynamically adapting the
speculation: a 2-bit heuristic, an exponential
heuristic, and a combination of these two. Our results
show that the combined heuristic is able to both
increase the number of successful speculations and
decrease the execution time for 15 popular web
applications.",
acknowledgement = ack-nhfb,
affiliation = "Martinsen, JK (Reprint Author), Blekinge Inst Technol,
Sch Comp, SE-37179 Karlskrona, Sweden. Martinsen, Jan
Kasper; Grahn, Hakan, Blekinge Inst Technol, Sch Comp,
SE-37179 Karlskrona, Sweden. Isberg, Anders, Sony
Mobile Commun AB, SE-22188 Lund, Sweden.",
author-email = "Jan.Kasper.Martinsen@bth.se Hakan.Grahn@bth.se
Anders.Isberg@sonymobile.com",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Industrial Excellence Center EASE -
Embedded Applications Software Engineering; BESQ+
research project --- Knowledge Foundation in Sweden
[20100311]",
funding-text = "This work was partly funded by the Industrial
Excellence Center EASE --- Embedded Applications
Software Engineering, (http://ease.cs.lth.se), and the
BESQ+ research project funded by the Knowledge
Foundation (grant number 20100311) in Sweden.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "2-bit heuristic; Automatic Parallelization; Benchmark
testing; C.1.4 Parallel Architectures; C.1.4.f
Speculative multi-threading; exponential heuristic;
Instruction sets; Internet; Java; JavaScript; Multicore
processors; Multithreading; Parallel Computing;
parallel hardware; Parallel processing; parallel
programming; sequential programming language; Social
network services; thread-level speculation; Web
applications",
number-of-cited-references = "12",
oa = "Green Published",
ORCID-numbers = "Martinsen, Jan Kasper/0000-0001-8915-3633 Grahn,
Hakan/0000-0001-9947-1088",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Martinsen:2014:HTL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
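Martinsen:2014:HTL above adapts thread-level speculation with a 2-bit heuristic, an exponential heuristic, and their combination. The sketch below implements just a 2-bit saturating counter of the familiar branch-predictor kind, used here to gate speculation per call site; the update rule and threshold follow the standard counter scheme and are not claimed to match the paper's exact heuristic.

class TwoBitHeuristic:
    """2-bit saturating counter per call site: speculate while the counter
    sits in one of the two 'favorable' states (value >= 2)."""

    def __init__(self):
        self.counters = {}

    def should_speculate(self, site):
        return self.counters.get(site, 2) >= 2   # start weakly in favor

    def record(self, site, success):
        c = self.counters.get(site, 2)
        c = min(3, c + 1) if success else max(0, c - 1)
        self.counters[site] = c

if __name__ == "__main__":
    h = TwoBitHeuristic()
    site = "onclick_handler"
    print(h.should_speculate(site))   # True initially
    h.record(site, success=False)     # a rollback
    h.record(site, success=False)     # another rollback
    print(h.should_speculate(site))   # speculation now disabled for this site

The hysteresis is the point: a single misspeculation does not shut speculation off, and a single success does not immediately re-enable it, which is what lets the combined heuristic raise the number of successful speculations without inflating rollback cost.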
@Article{Nandakumar:2014:OKS,
author = "Vivek S. Nandakumar and Ma{\l}gorzata Marek-Sadowska",
title = "On Optimal Kernel Size for Integrated {CPU--GPUs} ---
a Case Study",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "81--84",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.27",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Integrated CPU-GPU architectures with a fully
addressable shared memory completely eliminate any
CPU-GPU data transfer overhead. Since such
architectures are relatively new, it is unclear what
level of interaction between the CPU and GPU attains
the best energy efficiency. Too coarse grained or
larger kernels with fairly low CPU--GPU interaction
could cause poor utilization of the shared resources
while too fine grained kernels could cause frequent
interrupts of GPU computation and performance
degradation. Also larger kernels require larger shared
resources, causing an increase in area and parasitics that
affects the latency-sensitive CPU cores. In this paper,
we show the effect of granularity on the overall
system's energy efficiency using a synthetic workload.
We describe how our framework models a truly unified
shared memory in integrated architectures with frequent
CPU--GPU communication.",
acknowledgement = ack-nhfb,
affiliation = "Nandakumar, VS (Reprint Author), Univ Calif Santa
Barbara, Dept Elect \& Comp Engn, Santa Barbara, CA
93106 USA. Nandakumar, Vivek S.; Marek-Sadowska,
Malgorzata, Univ Calif Santa Barbara, Dept Elect \&
Comp Engn, Santa Barbara, CA 93106 USA.",
author-email = "vivek@ece.ucsb.edu mms@ece.uscb.edu",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "SRC grant [2236]",
funding-text = "This work was supported by SRC grant \#2236.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "B.3.2.g Shared memory; B.4.4.b Simulation; B.9.2
Energy-aware systems; C.1.3.f Heterogeneous (hybrid)
systems; C.4.g Measurement; Central Processing Unit;
Computational modeling; CPU-GPU communication; CPU-GPU
data transfer overhead; CPU-GPU interaction; D.4.4
Communications Management; energy efficiency; Energy
efficiency; evaluation; fine grained kernels; fully
addressable shared memory; GPU computation; graphics
processing units; Graphics processing units; integrated
CPU-GPU architectures; latency sensitive CPU cores;
Memory management; modeling; optimal kernel size;
overall system energy efficiency; performance
degradation; performance evaluation; power aware
computing; shared memory systems; simulation of
multiple-processor systems",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Nandakumar:2014:OKS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Liu:2014:PTE,
author = "Qixiao Liu and Victor Jimenez and Miquel Moreto and
Jaume Abella and Francisco J. Cazorla and Mateo
Valero",
title = "Per-task Energy Accounting in Computing Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "85--88",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.24",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We present for the first time the concept of per-task
energy accounting (PTEA) and relate it to per-task
energy metering (PTEM). We show the benefits of
supporting both in future computing systems. Using the
shared last-level cache (LLC) as an example: (1) We
illustrate the complexities in providing PTEM and PTEA;
(2) we present an idealized PTEM model and an accurate
and low-cost implementation of it; and (3) we introduce
a hardware mechanism to provide accurate PTEA in the
cache.",
acknowledgement = ack-nhfb,
affiliation = "Liu, QX (Reprint Author), Univ Politecn Cataluna,
E-08028 Barcelona, Spain. Liu, Qixiao; Jimenez, Victor;
Moreto, Miquel; Valero, Mateo, Univ Politecn Cataluna,
E-08028 Barcelona, Spain. Liu, Qixiao; Jimenez, Victor;
Moreto, Miquel; Abella, Jaume; Cazorla, Francisco J.;
Valero, Mateo, Barcelona Supercomp Ctr, Barcelona,
Spain. Cazorla, Francisco J., Spanish Natl Res Council
IIIA CSIC, Barcelona, Spain.",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Spanish Ministry of Science and Innovation
[TIN2012-34557]; HiPEAC Network of Excellence; Chinese
Scholarship Council [2010608015]",
funding-text = "This work has been partially supported by the Spanish
Ministry of Science and Innovation under grant
TIN2012-34557 and the HiPEAC Network of Excellence.
Qixiao Liu has also been funded by the Chinese
Scholarship Council under grant 2010608015.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; cache storage; Computational
modeling; computing systems; Energy consumption; Energy
management; Monitoring; Multicore processing; per-task
energy accounting; per-task energy metering; power
aware computing; PTEA; PTEM model; Radiation detectors;
shared last-level cache",
number-of-cited-references = "20",
oa = "Green Published",
ORCID-numbers = "Cazorla, Francisco/0000-0002-3344-376X Moreto Planas,
Miquel/0000-0002-9848-8758 Valero,
Mateo/0000-0003-2917-2482 Abella,
Jaume/0000-0001-7951-4028 Liu,
Qixiao/0000-0002-8196-7584",
research-areas = "Computer Science",
researcherid-numbers = "Cazorla, Francisco/D-7261-2016 Moreto Planas,
Miquel/C-1823-2016 Valero, Mateo/L-5709-2014 Abella,
Jaume/B-7422-2016",
times-cited = "2",
unique-id = "Liu:2014:PTE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Mahmoodi:2014:RCC,
author = "Hamid Mahmoodi and Sridevi Srinivasan Lakshmipuram and
Manish Arora and Yashar Asgarieh and Houman Homayoun
and Bill Lin and Dean M. Tullsen",
title = "Resistive Computation: a Critique",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "89--92",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.23",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Resistive Computation was suggested by [6] as an idea
for tackling the power wall by replacing conventional
CMOS logic with Magnetic Tunnel Junction (MTJ) based
Look-Up Tables (LUTs). Spin Transfer Torque RAM
(STTRAM) is an emerging CMOS-compatible non-volatile
memory technology based on Magnetic Tunnel Junctions as
a memory bit [3]. The principal advantage of STTRAM is
that it is leakage-resistant, which is an important
characteristic beyond the 45nm technology node, where
leakage concerns are becoming a limiting factor in
microprocessor performance. Although STTRAM is a good
candidate for replacing SRAM for on-chip memory, we
argue in this article that MTJ-based LUTs are unnecessarily
expensive in terms of area, power, and performance when
implementing fixed combinational logic that does not
require the reprogramming ability provided by MTJs.",
acknowledgement = ack-nhfb,
affiliation = "Mahmoodi, H (Reprint Author), San Francisco State
Univ, San Francisco, CA 94132 USA. Arora, Manish;
Asgarieh, Yashar; Lin, Bill; Tullsen, Dean M., Univ
Calif San Diego, La Jolla, CA 92093 USA. Mahmoodi,
Hamid; Lakshmipuram, Sridevi Srinivasan, San Francisco
State Univ, San Francisco, CA 94132 USA. Homayoun,
Houman, George Mason Univ, Fairfax, VA 22030 USA.",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "B.2.1 Design Styles; B.6.1.e Memory used as logic;
B.7.1.a Advanced technologies; B.9.1 Low-power design;
C.0.a Emerging technologies; CMOS integrated circuits;
CMOS-compatible nonvolatile memory technology; Delays;
dynamic current-mode logic; fixed combinational logic;
leakage power; leakage-resistance; Logic gates; look-up
tables; Low power electronics; magnetic tunnel
junction; Magnetic tunneling; magnetic tunnelling;
magnetic-tunnel junctions; memory bit; MRAM; MTJ-based
LUT; Power distribution; random-access storage;
Resistive computation; resistive computation; Resistive
computation; spin transfer torque RAM; STTRAM; Table
lookup; table lookup; Transistors",
keywords-plus = "TECHNOLOGY; CIRCUIT",
number-of-cited-references = "10",
ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X",
research-areas = "Computer Science",
researcherid-numbers = "Lin, Binshan/A-9772-2009",
times-cited = "4",
unique-id = "Mahmoodi:2014:RCC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Eyerman:2014:RCW,
author = "Stijn Eyerman and Lieven Eeckhout",
title = "Restating the Case for Weighted-{IPC} Metrics to
Evaluate Multiprogram Workload Performance",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "93--96",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Weighted speedup is nowadays the most commonly used
multiprogram workload performance metric. Weighted
speedup is a weighted-IPC metric, i.e., the
multiprogram IPC of each program is first weighted with
its isolated IPC. Recently, Michaud questioned the
validity of weighted-IPC metrics by arguing that they
are inconsistent and that weighted speedup favors
unfairness [4]. Instead, he advocates using the
arithmetic or harmonic mean of the raw IPC values of
the programs in the multiprogram workload. We show that
weighted-IPC metrics are not inconsistent, and that
weighted speedup is fair in giving equal importance to
each program. We argue that, in contrast to raw-IPC
metrics, weighted-IPC metrics have a system-level
meaning, and that raw-IPC metrics are affected by the
inherent behavior of the programs. We also show that
the choice of a metric may adversely affect the
conclusions from an experiment. We suggest using two
weighted-IPC metrics, system throughput (STP) and
average normalized turnaround time (ANTT), for
evaluating multiprogram workload performance, and
avoiding raw-IPC metrics.",
acknowledgement = ack-nhfb,
affiliation = "Eyerman, S (Reprint Author), Univ Ghent, B-9000 Ghent,
Belgium. Eyerman, Stijn; Eeckhout, Lieven, Univ Ghent,
B-9000 Ghent, Belgium.",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Research Foundation --- Flanders (FWO);
European Research Council under the European Community
[259295]",
funding-text = "Stijn Eyerman is supported through a postdoctoral
fellowship by the Research Foundation --- Flanders
(FWO). Additional support is provided by the European
Research Council under the European Community's Seventh
Framework Programme (FP7/2007-2013) / ERC Grant
agreement no. 259295.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ANTT; average normalized turnaround time; Benchmark
testing; C Computer Systems Organization; C.1 Processor
Architectures; C.1.3 Other Architecture Styles; C.1.3.h
Multithreaded processors; C.1.4 Parallel Architectures;
C.1.4.e Multi-core/single-chip multiprocessors; C.4
Performance of Systems; C.4.c Measurement techniques;
Degradation; Harmonic analysis; harmonic mean;
Multicore processing; multiprocessing systems;
multiprogram IPC; multiprogram workload performance
metric; multiprogramming; raw-IPC metrics; STP; system
throughput; system-level meaning; Throughput; Weight
measurement; weighted speedup; weighted-IPC metric",
number-of-cited-references = "6",
research-areas = "Computer Science",
times-cited = "9",
unique-id = "Eyerman:2014:RCW",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
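For reference, the two weighted-IPC metrics named in the abstract above follow Eyerman and Eeckhout's standard formulation; a minimal sketch, writing $IPC_i^{SP}$ and $IPC_i^{MP}$ for the single-program and multiprogram IPC of program $i$ among $n$ co-scheduled programs:

\[
  \mathrm{STP} = \sum_{i=1}^{n} \frac{IPC_i^{MP}}{IPC_i^{SP}},
  \qquad
  \mathrm{ANTT} = \frac{1}{n} \sum_{i=1}^{n} \frac{IPC_i^{SP}}{IPC_i^{MP}}.
\]

Higher STP (total normalized progress) and lower ANTT (average normalized slowdown) are better.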
@Article{Wolff:2014:RUR,
author = "Sonya R. Wolff and Ronald D. Barnes",
title = "Revisiting Using the Results of Pre-Executed
Instructions in Runahead Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "97--100",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.21",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Long-latency cache accesses cause significant
performance-impacting delays for both in-order and
out-of-order processor systems. To address these
delays, runahead pre-execution has been shown to
produce speedups by warming up cache structures during
stalls caused by long-latency memory accesses. While
improving cache related performance, basic runahead
approaches do not otherwise utilize results from
accurately pre-executed instructions during normal
operation. This simple model of execution is
potentially inefficient and performance constraining.
However, a previous study showed that exploiting the
results of accurately pre-executed runahead
instructions for out-of-order processors provides little
performance improvement over simple re-execution. This
work will show that, unlike out-of-order runahead
architectures, the performance improvement from
runahead result use for an in-order pipeline is more
significant, on average, and in some situations
provides dramatic performance improvements. For a set
of SPEC CPU2006 benchmarks which experience performance
improvement from basic runahead, the addition of result
use to the pipeline provided an additional speedup of
1.14X (high --- 1.48X) for an in-order processor model
compared to only 1.05X (high --- 1.16X) for an
out-of-order one. When considering benchmarks with poor
data cache locality, the average speedup increased to
1.21X for in-order compared to only 1.10X for
out-of-order.",
acknowledgement = ack-nhfb,
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; C.1.5.c Superscalar
dynamically-scheduled and statically-scheduled
implementation; C.1.5.e Memory hierarchy; cache
storage; data cache locality; Hidden Markov models;
in-order processor systems; long-latency cache
accesses; long-latency memory accesses; Memory Wall;
multiprocessing systems; Out of order; out-of-order
processor systems; out-of-order runahead architectures;
Pipeline processing; Pre-Execution; preexecuted
runahead instructions; Registers; Runahead; runahead
processors; SPEC CPU2006 benchmarks",
keywords-plus = "PIPELINES",
number-of-cited-references = "20",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Wolff:2014:RUR",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kim:2014:SGA,
author = "Youngsok Kim and Jaewon Lee and Donggyu Kim and
Jangwoo Kim",
title = "{ScaleGPU}: {GPU} Architecture for Memory-Unaware
{GPU} Programming",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "101--104",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.19",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Programmer-managed GPU memory is a major challenge in
writing GPU applications. Programmers must rewrite and
optimize an existing code for a different GPU memory
size for both portability and performance.
Alternatively, they can achieve only portability by
disabling GPU memory at the cost of significant
performance degradation. In this paper, we propose
ScaleGPU, a novel GPU architecture to enable
high-performance memory-unaware GPU programming.
ScaleGPU uses GPU memory as a cache of CPU memory to
provide programmers a view of CPU memory-sized
programming space. ScaleGPU also achieves high
performance by minimizing the amount of CPU-GPU data
transfers and by utilizing the GPU memory's high
bandwidth. Our experiments show that ScaleGPU can run a
GPU application on any GPU memory size and also
improves performance significantly. For example,
ScaleGPU improves the performance of the hotspot
application by approximately 48\% using the same size of
GPU memory and reduces its memory size requirement by
approximately 75\% while maintaining the target performance.",
acknowledgement = ack-nhfb,
affiliation = "Kim, Y (Reprint Author), POSTECH, Dept Comp Sci \&
Engn, Pohang, South Korea. Kim, Youngsok; Lee, Jaewon;
Kim, Donggyu; Kim, Jangwoo, POSTECH, Dept Comp Sci \&
Engn, Pohang, South Korea.",
author-email = "elixir@postech.ac.kr spiegel0@postech.ac.kr
vteori@postech.ac.kr jangwoo@postech.ac.kr",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Basic Science Research Program through the
National Research Foundation of Korea (NRF) ---
Ministry of Education, Science and Technology
[2011-0014817]; NRF Grant --- Korean Government
(NRF-Global Ph.D. Fellowship Program)",
funding-text = "This research was supported by Basic Science Research
Program through the National Research Foundation of
Korea (NRF) funded by the Ministry of Education,
Science and Technology (2011-0014817) and NRF Grant
funded by the Korean Government (NRF-2012-Global Ph.D.
Fellowship Program).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "C.1.2.j SIMD processors; C.1.4.e
Multi-core/single-chip multiprocessors; C.1.5.e Memory
hierarchy; cache; cache storage; code rewrite; CPU
memory-sized programming space; CPU-GPU data transfers;
Data transfer; GPU applications; GPU architecture; GPU
memory high bandwidth; GPU memory size; graphics
processing units; Graphics processing units; graphics
processing units; high-performance memory-unaware GPU
programming; I.3.1.a Graphics processors; Instruction
sets; memory architecture; Memory management; memory
size requirement; programmer-managed GPU memory;
Programming; Random access memory; ScaleGPU",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "7",
unique-id = "Kim:2014:SGA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Sankar:2014:SFL,
author = "Sriram Sankar and Sudhanva Gurumurthi",
title = "Soft Failures in Large Datacenters",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "105--108",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.25",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "A major problem in managing large-scale datacenters is
diagnosing and fixing machine failures. Most large
datacenter deployments have a management infrastructure
that can help diagnose failure causes, and manage
assets that were fixed as part of the repair process.
Previous studies identify only actual hardware
replacements to calculate Annualized Failure Rate (AFR)
and component reliability. In this paper, we show that
service availability is significantly affected by soft
failures and that this class of failures is becoming an
important issue at large datacenters that operate with
minimal human intervention. Soft failures in the datacenter do not
require actual hardware replacements, but still result
in service downtime, and are equally important because
they disrupt normal service operation. We show failure
trends observed in a large datacenter deployment of
commodity servers and motivate the need to modify
conventional datacenter designs to help reduce soft
failures and increase service availability.",
acknowledgement = ack-nhfb,
affiliation = "Sankar, S (Reprint Author), Microsoft Corp, Redmond,
WA 98052 USA. Sankar, Sriram, Microsoft Corp, Redmond,
WA 98052 USA. Sankar, Sriram; Gurumurthi, Sudhanva,
Univ Virginia, Charlottesville, VA 22903 USA.
Gurumurthi, Sudhanva, Adv Micro Devices Inc, AMD Res,
Sunnyvale, CA 94088 USA.",
author-email = "sriram.sankar@microsoft.com
Sudhanva.Gurumurthi@amd.com",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "AFR; annualized failure rate; asset management; C.4
Performance of Systems; C.5.5 Servers;
Characterization; Client-server systems; commodity
servers; component reliability; computer centres; Data
centers; Datacenter; datacenter deployments; datacenter
designs; datacenter management; failure cause
diagnosis; fault diagnosis; Hard disks; hardware
replacements; Large-scale systems; machine failure
diagnosis; machine failure fixing; Maintenance
engineering; Management; management infrastructure;
Market research; Reliability; repair process; service
availability; soft failures; Transient analysis",
number-of-cited-references = "8",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Sankar:2014:SFL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kim:2014:VPT,
author = "Daehoon Kim and Hwanju Kim and Jaehyuk Huh",
title = "{vCache}: Providing a Transparent View of the {LLC} in
Virtualized Environments",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "109--112",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.20",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "Since most of the current multi-core processors use a
large last-level cache (LLC), efficient use of an LLC
is critical for the overall performance of multi-cores.
To improve the caching efficiency, page coloring is a
representative software-based approach to allow the OS
to control placement of pages on an LLC to improve
their cache utility and to avoid conflicts among cores.
However, system virtualization, with additional address
translation by the hypervisor, can make page coloring
techniques used by the guest OS ineffective, as guest
physical addresses used by the guest OS for coloring
differ from real addresses used for cache indexing in
the LLCs. In this paper, we propose a novel LLC
architecture to provide the guest OS with a flexible
control over LLC placement in virtualized systems. The
proposed vCache architecture can preserve coloring
information set by the guest OS. In addition to color
preservation, vCache can potentially eliminate the
traditional limitation of page coloring, the cost of
dynamic color changes for memory pages. By using the
pollute buffer mechanism, one of the color-based cache
optimization techniques, vCache improves the performance
of benchmark applications by up to 33\%
without degrading the performance of another co-running
application in the VM.",
acknowledgement = ack-nhfb,
affiliation = "Kim, D (Reprint Author), Korea Adv Inst Sci \&
Technol, Dept Comp Sci, Taejon, South Korea. Kim,
Daehoon; Kim, Hwanju; Huh, Jaehyuk, Korea Adv Inst Sci
\& Technol, Dept Comp Sci, Taejon, South Korea.",
author-email = "daehoon@calab.kaist.ac.kr hjukim@calab.kaist.ac.kr
jhuh@calab.kaist.ac.kr",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "SW Computing R\&D Program of
KEIT(UX-oriented Mobile SW Platform) --- Ministry of
Trade, Industry, and Energy [2011-10041313]",
funding-text = "This research was supported by the SW Computing R\&D
Program of KEIT(2011-10041313, UX-oriented Mobile SW
Platform) funded by the Ministry of Trade, Industry,
and Energy.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "address translation; B.3.2.b Cache memories; benchmark
applications; buffer mechanism; C.1.4.e
Multi-core/single-chip multiprocessors; C.1.5.e Memory
hierarchy; cache indexing; Cache partitioning; cache
storage; Cache storage; cache utility improvement;
caching efficiency improvement; co-running application;
color-based cache optimization techniques; coloring
information preservation; core conflict avoidance;
dynamic color cost; guest OS; guest physical address;
hypervisor; last-level cache; LLC architecture; LLC
placement; Memory management; memory pages; Multicore
processing; multicore processor performance;
multiprocessing systems; operating systems (computers);
Page coloring; page coloring; page placement control;
paged storage; software-based approach; system
virtualization; transparent LLC view; vCache
architecture; Virtual machine monitors; virtual
machines; virtualisation; Virtualization; virtualized
environments; VM",
number-of-cited-references = "8",
research-areas = "Computer Science",
researcherid-numbers = "Huh, Jaehyuk/C-1716-2011",
times-cited = "2",
unique-id = "Kim:2014:VPT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
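For background on the entry above, page coloring maps each physical page to an LLC region (a "color") using the set-index bits that lie above the page offset; a generic Python sketch with illustrative parameters (not taken from the paper):

def page_color(phys_addr, page_size=4096, llc_size=8 << 20, ways=16):
    """Generic page-coloring calculation for a physically indexed LLC:
    the number of colors is the number of pages that fit in one cache way,
    and a page's color is its physical page number modulo that count."""
    num_colors = llc_size // (ways * page_size)   # e.g., 128 colors here
    return (phys_addr // page_size) % num_colors

The problem vCache targets is that the hypervisor's extra address translation changes phys_addr underneath the guest OS, so guest-chosen colors need architectural support to be preserved.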
@Article{Anonymous:2014:TCb,
author = "Anonymous",
title = "Table of Contents",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "C1--C1",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2368891",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2014:ICAa,
author = "Anonymous",
title = "{{\booktitle{IEEE Computer Architecture Letters}}
Editorial Board}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "C2--C2",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2368892",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2014:ICAb,
author = "Anonymous",
title = "{{\booktitle{IEEE Computer Architecture Letters}}}
Information for Authors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "C3--C3",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2368893",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2014:ICSb,
author = "Anonymous",
title = "{IEEE Computer Society} [advertisement]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "C4--C4",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2368894",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Liao:2015:AWL,
author = "Jianwei Liao and Fengxiang Zhang and Li Li and
Guoqiang Xiao",
title = "Adaptive Wear-Leveling in Flash-Based Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2329871",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The paper presents an adaptive wear-leveling scheme
based on several wear-thresholds in different periods.
The basic idea behind this scheme is that blocks can
have different wear-out speeds and the wear-leveling
mechanism does not conduct data migration until the
erasure counts of some hot blocks hit a threshold.
Through a series of emulation experiments based on
several realistic disk traces, we show that the
proposed wear-leveling mechanism can reduce total
erasure counts and yield uniform erasure counts among
all blocks at the late lifetime of the storage devices.
As a result, not only can the performance of storage
systems be improved, but the lifespan of the flash-based
memory can also be extended to a certain degree.",
acknowledgement = ack-nhfb,
affiliation = "Liao, JW (Reprint Author), Southwest Univ, Coll Comp
\& Informat Sci, Chongqing, Peoples R China. Liao,
Jianwei; Zhang, Fengxiang; Li, Li; Xiao, Guoqiang,
Southwest Univ, Coll Comp \& Informat Sci, Chongqing,
Peoples R China.",
author-email = "liaojianwei@il.is.s.u-tokyo.ac.jp zhangfx@swu.edu.cn
lily@swu.edu.cn gqxiao@swu.edu.cn",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Adaptive systems; adaptive wear-leveling; Flash;
Benchmark testing; data migration; delayed migration;
disk traces; emulation experiments; Equations; erasure
evenness; extending lifetime; flash memories;
flash-based memory; Flash-based storage devices; Market
research; Servers; Standards; total erasure count
reduction; wear; wear-leveling; wear-leveling
mechanism; wear-out speeds; wear-thresholds",
number-of-cited-references = "11",
ORCID-numbers = "Liao, Jianwei/0000-0001-6149-6650",
research-areas = "Computer Science",
researcherid-numbers = "Liao, Jianwei/C-5339-2016",
times-cited = "4",
unique-id = "Liao:2015:AWL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
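A minimal illustrative sketch of the threshold-triggered wear-leveling idea summarized in the abstract above; the function names, data structures, and threshold handling are hypothetical, not taken from the paper:

def maybe_wear_level(erase_counts, threshold, migrate):
    """erase_counts: dict mapping block id -> erase count.
    migrate(hot, cold): relocates data from a hot block to a lightly worn one.
    Data migration is deferred until some hot block reaches the current
    wear-threshold; the caller may then raise the threshold for the next period."""
    hot = max(erase_counts, key=erase_counts.get)    # most-erased block
    cold = min(erase_counts, key=erase_counts.get)   # least-erased block
    if erase_counts[hot] >= threshold:               # only act once the threshold is hit
        migrate(hot, cold)
        return True                                  # signal: consider raising the threshold
    return False                                     # otherwise defer migration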
@Article{Anonymous:2015:IIC,
author = "Anonymous",
title = "2014 Index {{\booktitle{IEEE Computer Architecture
Letters}}} Vol. 13",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "1--5",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2387774",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Indexes",
}
@Article{Chen:2015:HSC,
author = "Jie Chen and Guru Venkataramani",
title = "A Hardware-Software Cooperative Approach for
Application Energy Profiling",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2323711",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Energy consumption by software applications is a
critical issue that determines the future of multicore
software development. In this article, we propose a
hardware-software cooperative approach that uses
hardware support to efficiently gather the
energy-related hardware counters during program
execution, and utilizes parameter estimation models in
software to compute the energy consumption of
instructions at a finer granularity (e.g., per basic block).
We design mechanisms to minimize collinearity in
profiler data, and present results to validate our
energy estimation methodology.",
acknowledgement = ack-nhfb,
affiliation = "Chen, J (Reprint Author), George Washington Univ, Dept
Elect \& Comp Engn, Washington, DC 20052 USA. Chen,
Jie; Venkataramani, Guru, George Washington Univ, Dept
Elect \& Comp Engn, Washington, DC 20052 USA.",
author-email = "jiec@gwu.edu guruv@gwu.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "application energy profiling; Benchmark testing;
Energy consumption; energy consumption; energy
debugging; energy estimation; energy estimation
methodology; Energy profiling; energy-related hardware
counters; Estimation; Hardware; hardware-software
codesign; hardware-software cooperative approach;
Mathematical model; multicore software development;
multiprocessing systems; Parameter estimation;
parameter estimation models; power aware computing;
profiler data collinearity; program execution;
Software; software applications",
keywords-plus = "POWER",
number-of-cited-references = "12",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Chen:2015:HSC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
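A minimal sketch of the kind of parameter-estimation step described in the abstract above, assuming energy is modeled as a linear combination of hardware-counter deltas; the function names and the least-squares choice are illustrative, not taken from the paper:

import numpy as np

def fit_energy_weights(counter_samples, measured_energy):
    """counter_samples: (N samples x K counters) array of counter deltas;
    measured_energy: (N,) array of measured energy per sample.
    Returns per-event energy weights via ordinary least squares."""
    weights, *_ = np.linalg.lstsq(counter_samples, measured_energy, rcond=None)
    return weights

def estimate_energy(weights, counters):
    # Energy estimate for one region (e.g., a basic block's counter deltas).
    return float(np.dot(weights, counters))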
@Article{Kim:2015:ASM,
author = "Dae-Hyun Kim and Prashant J. Nair and Moinuddin K.
Qureshi",
title = "Architectural Support for Mitigating Row Hammering in
{DRAM} Memories",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2332177",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "DRAM scaling has been the prime driver of increasing
capacity of main memory systems. Unfortunately, lower
technology nodes worsen cell reliability, as they
increase the coupling between adjacent DRAM cells,
thereby exacerbating different failure modes. This
paper investigates the reliability problem due to Row
Hammering, whereby frequent activations of a given row
can cause data loss for its neighboring rows. As DRAM
scales to lower technology nodes, the threshold for the
number of row activations that causes data loss for the
neighboring rows reduces, making Row Hammering a
challenging problem for future DRAM chips. To overcome
Row Hammering, we propose two architectural solutions:
First, Counter-Based Row Activation (CRA), which uses a
counter with each row to count the number of row
activations. If the count exceeds the row hammering
threshold, a dummy activation is sent to neighboring
rows proactively to refresh the data. Second,
Probabilistic Row Activation (PRA), which obviates
storage overhead of tracking and simply allows the
memory controller to proactively issue dummy
activations to neighboring rows with a small
probability on each memory access. Our evaluations show
that these solutions are effective at mitigating Row
hammering while causing negligible performance loss (<
1 percent).",
acknowledgement = ack-nhfb,
affiliation = "Kim, DH (Reprint Author), Georgia Inst Technol, Dept
ECE, Atlanta, GA 30363 USA. Kim, Dae-Hyun; Nair,
Prashant J.; Qureshi, Moinuddin K., Georgia Inst
Technol, Dept ECE, Atlanta, GA 30363 USA.",
author-email = "dhkim@ece.gatech.edu pnair6@ece.gatech.edu
moin@ece.gatech.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architectural support; cell reliability; Computer
architecture; counter-based row activation; data
errors; data retention; DRAM chips; DRAM memories; DRAM
scaling; Dynamic random access memory; Dynamic random
access memory, row hammering, data retention, data
errors; Leakage currents; Logic gates; Microprocessors;
probabilistic row activation; probability; Radiation
detectors; Random access memory; reliability;
reliability problem; row hammering; Transistors",
number-of-cited-references = "10",
research-areas = "Computer Science",
times-cited = "23",
unique-id = "Kim:2015:ASM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
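A minimal illustrative sketch of the probabilistic row-activation idea described in the abstract above; the function names and the probability value are hypothetical, not taken from the paper:

import random

def activate_with_pra(row, do_activate, adjacent_rows, p=0.001):
    """Probabilistic Row Activation sketch: alongside every demanded row
    activation, refresh the physically adjacent rows with a small
    probability p, so no per-row activation counters need to be stored."""
    do_activate(row)                        # the demanded activation
    if random.random() < p:                 # rare, proactive neighbor refresh
        for victim in adjacent_rows(row):   # typically row - 1 and row + 1
            do_activate(victim)             # dummy activation refreshes the victim row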
@Article{Nathan:2015:AGC,
author = "Ralph Nathan and Daniel J. Sorin",
title = "{Argus-G}: Comprehensive, Low-Cost Error Detection for
{GPGPU} Cores",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "13--16",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2298391",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We have developed and evaluated Argus-G, an error
detection scheme for general purpose GPU (GPGPU) cores.
Argus-G is a natural extension of the Argus error
detection scheme for CPU cores, and we demonstrate how
to modify Argus such that it is compatible with GPGPU
cores. Using an RTL prototype, we experimentally show
that Argus-G can detect the vast majority of injected
errors at relatively low performance, area, and power
costs.",
acknowledgement = ack-nhfb,
affiliation = "Nathan, R (Reprint Author), Duke Univ, Durham, NC
27708 USA. Nathan, Ralph; Sorin, Daniel J., Duke Univ,
Durham, NC 27708 USA.",
author-email = "ralph.nathan@duke.edu sorin@ee.duke.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Argus-G; Benchmark testing; Conferences; CPU cores;
error detection; fault tolerance; general purpose GPU
cores; GPGPU cores; Graphics processing units; graphics
processing units; Graphics processors; Hardware;
Hardware design languages; Instruction sets; low-cost
error detection; Registers",
number-of-cited-references = "18",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Nathan:2015:AGC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{O:2015:CCI,
author = "Seongil O and Sanghyuk Kwon and Young Hoon Son and
Yujin Park and Jung Ho Ahn",
title = "{CIDR}: a Cache Inspired Area-Efficient {DRAM}
Resilience Architecture against Permanent Faults",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2324894",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "area overhead; area-efficient DRAM resilience
architecture; Arrays; augmented cache; bit errors;
Bloom filter; cache data array; cache storage; cache
tags; cache-inspired DRAM resilience architecture;
CIDR; Circuit faults; cost-sensitive main-memory DRAM
devices; data structures; Decoding; device failure
rates; DRAM arrays; DRAM chips; DRAM, error resilience,
permanent faults, row and column sparing, Bloom filter,
DRAM-side caching; energy overhead minimization; error
statistics; fault diagnosis; faulty cells; I/O pads;
memory architecture; permanent faults; processor-memory
interfaces; Random access memory; Resilience;
single-bit error rates; Testing; testing phase",
}
@Article{Seongil:2015:CCI,
author = "O. Seongil and Sanghyuk Kwon and Young Hoon Son and
Yujin Park and Jung Ho Ahn",
title = "{CIDR}: a Cache Inspired Area-Efficient {DRAM}
Resilience Architecture against Permanent Faults",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2324894",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Faulty cells have become major problems in
cost-sensitive main-memory DRAM devices. Conventional
solutions to reduce device failure rates due to cells
with permanent faults, such as populating spare rows
and relying on error-correcting codes, have had limited
success due to high area overheads. In this paper, we
propose CIDR, a novel cache-inspired DRAM resilience
architecture, which substantially reduces the area
overhead of handling bit errors from these faulty
cells. A DRAM device adopting CIDR has a small cache
next to its I/O pads to replace accesses to the
addresses that include the faulty cells with ones that
correspond to the cache data array. We minimize the
energy overhead of accessing the cache tags for every
read or write by adding a Bloom filter in front of the
cache. The augmented cache is programmed once during
the testing phase and is out of the critical path on
normal accesses because both cache and DRAM arrays are
accessed in parallel, making CIDR transparent to
existing processor-memory interfaces. Compared to the
conventional architecture relying on spare rows, CIDR
lowers the area overhead of achieving equal failure
rates over a wide range of single-bit error rates, such
as 23.6 x lower area overhead for a bit-error rate of
10(-5) and a device failure rate of 10(-3).",
acknowledgement = ack-nhfb,
affiliation = "Seongil, O (Reprint Author), Seoul Natl Univ, Dept
Transdisciplinary Studies, Seoul, South Korea. Seongil,
O.; Kwon, Sanghyuk; Son, Young Hoon; Park, Yujin; Ahn,
Jung Ho, Seoul Natl Univ, Dept Transdisciplinary
Studies, Seoul, South Korea.",
author-email = "swdfish@snu.ac.kr kkwon114@snu.ac.kr yhson96@snu.ac.kr
comesay@snu.ac.kr gajh@snu.ac.kr",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bloom filter; DRAM; DRAM-side caching; error
resilience; permanent faults; row and column sparing",
number-of-cited-references = "13",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Seongil:2015:CCI",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
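As background for the entry above, the Bloom filter's role is to cheaply rule out non-faulty addresses so that the repair cache's tags are rarely consulted; a generic software sketch with illustrative sizes and hashing (not taken from the paper):

import hashlib

class BloomFilter:
    """Tiny Bloom filter: no false negatives, so a negative answer can
    safely skip the repair-cache tag lookup on normal accesses."""
    def __init__(self, num_bits=1024, num_hashes=3):
        self.num_bits, self.num_hashes = num_bits, num_hashes
        self.bits = bytearray(num_bits)

    def _positions(self, key):
        for i in range(self.num_hashes):
            digest = hashlib.sha256(f"{i}:{key}".encode()).digest()
            yield int.from_bytes(digest[:8], "big") % self.num_bits

    def add(self, key):            # programmed once, during the testing phase
        for pos in self._positions(key):
            self.bits[pos] = 1

    def may_contain(self, key):    # False => address definitely has no faulty cell
        return all(self.bits[pos] for pos in self._positions(key))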
@Article{Gupta:2015:CEO,
author = "Ujjwal Gupta and Umit Y. Ogras",
title = "Constrained Energy Optimization in Heterogeneous
Platforms Using Generalized Scaling Models",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "21--25",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2326603",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Platform energy consumption and responsiveness are two
major considerations for mobile systems since they
determine the battery life and user satisfaction,
respectively. We first present models for power
consumption, response time and energy consumption of
heterogeneous mobile platforms. Then, we use these
models to optimize the energy consumption of baseline
platforms under response time and temperature
constraints with and without introducing new resources.
We show that the optimal design choices depend on the
dynamic power management algorithm, and that adding new
resources is more energy efficient than scaling
existing resources alone.",
acknowledgement = ack-nhfb,
affiliation = "Gupta, U (Reprint Author), Arizona State Univ, Sch
Elect Comp \& Energy Engn, Tempe, AZ 85281 USA. Gupta,
Ujjwal; Ogras, Umit Y., Arizona State Univ, Sch Elect
Comp \& Energy Engn, Tempe, AZ 85281 USA.",
author-email = "ujjwal@asu.edu umit@asu.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "battery life determine; Computers; constrained energy
optimization; dynamic power management algorithm;
Energy consumption; Energy optimization; generalized
scaling models; heterogeneous architectures;
heterogeneous mobile platforms; Mobile communication;
mobile computing; mobile platforms; mobile systems;
MpSoC; Multicore processing; Optimization; performance;
platform energy consumption; power aware computing;
power consumption; Power demand; response time;
temperature constraints; Time factors; user
satisfaction",
keywords-plus = "AMDAHLS LAW; MULTIAMDAHL; ACCELERATOR; MANAGEMENT;
CPU; ERA",
number-of-cited-references = "19",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Gupta:2015:CEO",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Farmahini-Farahani:2015:DAA,
author = "Amin Farmahini-Farahani and Jung Ho Ahn and Katherine
Morrow and Nam Sung Kim",
title = "{DRAMA}: an Architecture for Accelerated Processing
Near Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "26--29",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2333735",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Improving energy efficiency is crucial for both mobile
and high-performance computing systems, yet a large
fraction of total energy is consumed transferring data
between storage and processing units. Thus, reducing
data transfers across the memory hierarchy of a
processor (i.e., off-chip memory, on-chip caches, and
register file) can greatly improve the energy
efficiency. To this end, we propose an architecture,
DRAMA, that 3D-stacks coarse-grain reconfigurable
accelerators (CGRAs) atop off-chip DRAM devices. DRAMA
does not require changes to the DRAM device
architecture, apart from through-silicon vias (TSVs)
that connect the DRAM device's internal I/O bus to the
CGRA layer. We demonstrate that DRAMA can reduce the
energy consumption to transfer data across the memory
hierarchy by 66--95 percent while achieving speedups of
up to 18x over a commodity processor.",
acknowledgement = ack-nhfb,
affiliation = "Farmahini-Farahani, A (Reprint Author), Univ
Wisconsin, Dept Elect \& Comp Engn, 1415 Johnson Dr,
Madison, WI 53706 USA. Farmahini-Farahani, Amin;
Morrow, Katherine; Kim, Nam Sung, Univ Wisconsin, Dept
Elect \& Comp Engn, Madison, WI 53706 USA. Ahn, Jung
Ho, Seoul Natl Univ, Dept Transdisciplinary Studies,
Seoul 151742, South Korea.",
author-email = "farmahinifar@wisc.edu gajh@snu.ac.kr
kati@engr.wisc.edu nskim3@wisc.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "3D-stacking; 3D-stacks coarse-grain reconfigurable
accelerators; accelerated near memory processing;
Acceleration; accelerator; Arrays; data transfers;
DRAM; DRAM chips; DRAM devices; DRAMA architecture;
dynamic random access memory; energy conservation;
energy consumption reduction; energy efficiency;
energy-efficient computing; high-performance computing
systems; Kernel; memory hierarchy; Memory management;
mobile computing systems; Near memory processing; Near
memory processing, DRAM, 3D-stacking, energy-efficient
computing, accelerator; processing units; Random access
memory; Registers; storage management; storage units;
through-silicon vias; total energy fraction; TSV",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "7",
unique-id = "Farmahini-Farahani:2015:DAA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Carlson:2015:EPM,
author = "Trevor E. Carlson and Siddharth Nilakantan and Mark
Hempstead and Wim Heirman",
title = "Epoch Profiles: Microarchitecture-Based Application
Analysis and Optimization",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "30--33",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2329873",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The performance of data-intensive applications, when
running on modern multi- and many-core processors, is
largely determined by their memory access behavior. Its
most important contributors are the frequency and
latency of off-chip accesses and the extent to which
long-latency memory accesses can be overlapped with
useful computation or with each other. In this paper we
present two methods to better understand application
and microarchitectural interactions. An epoch profile
is an intuitive way to understand the relationships
between three important characteristics: the on-chip
cache size, the size of the reorder window of an
out-of-order processor, and the frequency of processor
stalls caused by long-latency, off-chip requests
(epochs). By relating these three quantities one can
more easily understand an application's memory
reference behavior and thus significantly reduce the
design space. While epoch profiles help to provide
insight into the behavior of a single application,
developing an understanding of a number of applications
in the presence of area and core count constraints
presents additional challenges. Epoch-based
microarchitectural analysis is presented as a better
way to understand the trade-offs for memory-bound
applications in the presence of these physical
constraints. Through epoch profiling and optimization,
one can significantly reduce the multidimensional
design space for hardware/software optimization through
the use of high-level model-driven techniques.",
acknowledgement = ack-nhfb,
affiliation = "Carlson, TE (Reprint Author), Univ Ghent, Sint
Pietersnieuwstr 41, B-9000 Ghent, East Flanders,
Belgium. Carlson, Trevor E., Univ Ghent, B-9000 Ghent,
East Flanders, Belgium. Nilakantan, Siddharth;
Hempstead, Mark, Drexel Univ, Dept Elect \& Comp Engn,
Bossone Res Ctr, Philadelphia, PA 19104 USA. Heirman,
Wim, Intel Corp, Leuven, Flemish Brabant, Belgium.",
author-email = "trevor.carlson@elis.ugent.be sn446@drexel.edu
mhempstead@drexel.edu wim.heirman@intel.com",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Computational modeling; Frequency
measurement; memory-level parallelism;
Microarchitecture; Microarchitecture analysis; Out of
order; System-on-chip; visualization",
number-of-cited-references = "6",
oa = "Green Published",
ORCID-numbers = "Carlson, Trevor/0000-0001-8742-134X Nilakantan,
Siddharth/0000-0003-1067-700X Heirman,
Wim/0000-0003-2286-1525",
research-areas = "Computer Science",
researcherid-numbers = "Carlson, Trevor/M-4945-2016",
times-cited = "0",
unique-id = "Carlson:2015:EPM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Power:2015:GGH,
author = "Jason Power and Joel Hestness and Marc S. Orr and Mark
D. Hill and David A. Wood",
title = "{gem5-gpu}: a Heterogeneous {CPU--GPU} Simulator",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "34--36",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2299539",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/pvm.bib",
abstract = "gem5-gpu is a new simulator that models tightly
integrated CPU-GPU systems. It builds on gem5, a
modular full-system CPU simulator, and GPGPU-Sim, a
detailed GPGPU simulator. gem5-gpu routes most memory
accesses through Ruby, which is a highly configurable
memory system in gem5. By doing this, it is able to
simulate many system configurations, ranging from a
system with coherent caches and a single virtual
address space across the CPU and GPU to a system that
maintains separate GPU and CPU physical address spaces.
gem5-gpu can run most unmodified CUDA 3.2 source code.
Applications can launch non-blocking kernels, allowing
the CPU and GPU to execute simultaneously. We present
gem5-gpu's software architecture and a brief
performance validation. We also discuss possible
extensions to the simulator. gem5-gpu is open source
and available at gem5-gpu.cs.wisc.edu.",
acknowledgement = ack-nhfb,
affiliation = "Power, J (Reprint Author), Univ Wisconsin, Dept Comp
Sci, 1210 W Dayton St, Madison, WI 53706 USA. Power,
Jason; Hestness, Joel; Orr, Marc S.; Hill, Mark D.;
Wood, David A., Univ Wisconsin, Dept Comp Sci, Madison,
WI 53706 USA.",
author-email = "powerjg@cs.wisc.edu hestness@cs.wisc.edu
morr@cs.wisc.edu markhill@cs.wisc.edu
david@cs.wisc.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Coherence; Computational modeling; Computer
architecture; computer architecture; gem5-gpu
simulator; general-purpose graphics processors;
GPGPUSim; Graphics processing units; graphics
processing units; heterogeneous (hybrid) systems;
heterogeneous CPU-GPU simulator; Kernel; Modeling
techniques; modular full-system CPU simulator;
nonblocking kernels; Object oriented modeling;
Protocols; simulators; software architecture",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "62",
unique-id = "Power:2015:GGH",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Manatunga:2015:HSS,
author = "Dilan Manatunga and Joo Hwan Lee and Hyesoon Kim",
title = "Hardware Support for Safe Execution of Native Client
Applications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "37--40",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2309601",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Over the past few years, there has been vast growth in
the area of the web browser as an applications
platform. One example of this trend is Google's Native
Client (NaCl) platform, which is a software-fault
isolation mechanism that allows the running of native
x86 or ARM code on the browser. One of the security
mechanisms employed by NaCl is that all branches must
jump to the start of a valid instruction. To meet
this criterion, though, all return instructions
are replaced by a specific branch instruction sequence,
which we call NaCl returns, that are guaranteed to
return to a valid instruction. However, these NaCl
returns lose the advantage of the highly accurate
return-address stack (RAS) in exchange for the less
accurate indirect branch predictor. In this paper, we
propose a NaCl-RAS mechanism that can identify NaCl
returns and predict them with 76.9 percent accuracy on
average, compared to 39.5 percent for a traditional
BTB predictor.",
acknowledgement = ack-nhfb,
affiliation = "Manatunga, D (Reprint Author), Georgia Inst Technol,
Sch Comp Sci, Atlanta, GA 30332 USA. Manatunga, Dilan;
Lee, Joo Hwan; Kim, Hyesoon, Georgia Inst Technol, Sch
Comp Sci, Atlanta, GA 30332 USA.",
author-email = "dmanatunga@gatech.edu joohwan.lee@gatech.edu
hyesoon@cc.gatech.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accuracy; ARM code; Benchmark testing; branch
instruction sequence; branch prediction accuracy; BTB
predictor; Detectors; fault diagnosis; Google;
Hardware; hardware support; NaCl-RAS mechanism; Native
client; native client applications; native x86; online
front-ends; return address prediction; return-address
stack; safe execution; Security; security mechanism;
security of data; Software; software fault isolation;
software-fault isolation mechanism; Web browser",
keywords-plus = "SANDBOX; CODE",
number-of-cited-references = "5",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Manatunga:2015:HSS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Liu:2015:LHP,
author = "Longjun Liu and Chao Li and Hongbin Sun and Yang Hu
and Jingmin Xin and Nanning Zheng and Tao Li",
title = "Leveraging Heterogeneous Power for Improving
Datacenter Efficiency and Resiliency",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "41--45",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2363084",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Power mismatching between supply and demand has
emerged as a top issue in modern datacenters that are
under-provisioned or powered by intermittent power
supplies. Recent proposals are primarily limited to
leveraging uninterruptible power supplies (UPS) to
handle power mismatching, and therefore lack the
capability of efficiently handling the irregular peak
power mismatches. In this paper we propose hPower, the
first heterogeneous energy buffering strategy that
incorporates supercapacitors into existing datacenters
to handle power mismatch. Our technique exploits power
supply diversity and smart load assignment to provide
efficiency-aware and emergency-aware power mismatch
management. We show that hPower could improve energy
efficiency by 30 percent, extend UPS lifetime by 4.3x,
and reduce system downtime by 36 percent. It allows
datacenters to adapt themselves to various power supply
anomalies, thereby improving operational efficiency and
resiliency.",
acknowledgement = ack-nhfb,
affiliation = "Liu, LJ (Reprint Author), Xi An Jiao Tong Univ, Sch
Elect \& Informat Engn, Xian 710049, Peoples R China.
Liu, Longjun; Sun, Hongbin; Xin, Jingmin; Zheng,
Nanning, Xi An Jiao Tong Univ, Sch Elect \& Informat
Engn, Xian 710049, Peoples R China. Li, Chao, Shanghai
Jiao Tong Univ, Dept Comp Sci \& Engn, Shanghai 200030,
Peoples R China. Hu, Yang; Li, Tao, Univ Florida, Dept
Elect \& Comp Engn, Gainesville, FL USA.",
author-email = "longjun.liu@stu.xjtu.edu.cn lichao@cs.sjtu.edu.cn
hsun@mail.xjtu.edu.cn huyang.ece@ufl.edu
jxin@mail.xjtu.edu.cn nnzheng@mail.xjtu.edu.cn
taoli@ece.ufl.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Batteries; computer centres; computer system
implementation; Computer System Implementation;
computer system implementation; data center efficiency;
data center resiliency; efficiency-aware power mismatch
management; emergency-aware power mismatch management;
energy conservation; Energy efficiency; Energy-aware
systems; Energy-Aware Systems; heterogeneous energy
buffering strategy; heterogeneous power; hPower;
performance of systems; Performance of Systems; power
aware computing; Power demand; power mismatching; power
supply anomalies; power supply diversity; Servers;
smart load assignment; Supercapacitors;
supercapacitors; system downtime reduction;
uninterruptible power supplies; Uninterruptible power
systems; UPS",
number-of-cited-references = "16",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Liu:2015:LHP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Wang:2015:LNV,
author = "Rui Wang and Wangyuan Zhang and Tao Li and Depei
Qian",
title = "Leveraging Non-Volatile Storage to Achieve Versatile
Cache Optimizations",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "46--49",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2298412",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The efficiency of caches plays a vital role in
microprocessor. In this paper, we introduce a novel and
flexible cache substrate that employs non-volatile yet
versatile SRAM (NV2-SRAM) cell design, which
synergistically integrates new memory devices into the
standard SRAM cells. Our experiments show that it can
achieve a 67 percent energy saving and a 3.1x
reliability improvement over the SRAM-based cache,
outperforming the drowsy cache design in terms of both
power efficiency and reliability. Moreover, the
proposed cache architecture can be used to improve the
performance of prefetching schemes by 10 percent.",
acknowledgement = ack-nhfb,
affiliation = "Wang, R (Reprint Author), Beihang Univ, Sch Comp Sci
\& Engn, State Key Lab Software Dev Environm, Beijing
100191, Peoples R China. Wang, Rui; Qian, Depei,
Beihang Univ, Sch Comp Sci \& Engn, State Key Lab
Software Dev Environm, Beijing 100191, Peoples R China.
Zhang, Wangyuan; Li, Tao, Univ Florida, ECE Dept,
Gainesville, FL 32611 USA.",
author-email = "rui.wang@jsi.buaa.edu.cn zhangwangyuan@gmail.com
taoli@ece.ufl.edu depeiq@buaa.edu.cn",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache architecture; Cache memories; cache storage;
Computer architecture; energy saving; flexible cache
substrate; low-power design; Magnetic tunneling; memory
structures; microprocessor; Microprocessors;
Nonvolatile memory; nonvolatile storage; nonvolatile
yet versatile SRAM cell design; NV2-SRAM cell design;
Prefetching; prefetching schemes; reliability
improvement; SRAM; SRAM based cache; SRAM cells; SRAM
chips; storage management; versatile cache
optimizations",
number-of-cited-references = "19",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Wang:2015:LNV",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Mohammadi:2015:DDB,
author = "Milad Mohammadi and Song Han and Tor M. Aamodt and
William J. Dally",
title = "On-Demand Dynamic Branch Prediction",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "50--53",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2330820",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In out-of-order (OoO) processors, speculative
execution with high branch prediction accuracy is
employed to achieve good single thread performance. In
these processors the branch prediction unit tables
(BPU) are accessed in parallel with the instruction
cache before it is known whether a fetch group contains
branch instructions. For integer applications, we find
85 percent of BPU lookups are done for non-branch
operations and of the remaining lookups, 42 percent are
done for highly biased branches that can be predicted
statically with high accuracy. We evaluate on-demand
branch prediction (ODBP), a novel technique that uses
compiler generated hints to identify those instructions
that can be more accurately predicted statically to
eliminate unnecessary BPU lookups. We evaluate an
implementation of ODBP that combines static and dynamic
branch prediction. For a four-wide superscalar
processor, ODBP delivers as much as 9 percent
improvement in average energy-delay (ED) product, 7
percent core average energy saving, and 3 percent
speedup. ODBP also enables the use of large BPUs for a
given power budget.",
acknowledgement = ack-nhfb,
affiliation = "Mohammadi, M (Reprint Author), Stanford Univ, Dept
Elect Engn, Stanford, CA 94305 USA. Mohammadi, Milad;
Han, Song; Dally, William J., Stanford Univ, Dept Elect
Engn, Stanford, CA 94305 USA. Aamodt, Tor M., Univ
British Columbia, Dept Elect \& Comp Engn, Vancouver,
BC V6T 1Z4, Canada.",
author-email = "milad@stanford.edu songhan@stanford.edu
aamodt@ece.ubc.ca dally@stanford.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accuracy; ahead prediction; BPU lookup; branch
instruction; branch prediction accuracy; branch
prediction unit table; cache storage; compiler
generated hints; Computer architecture; core average
energy saving; ED product; Energy efficiency;
energy-delay product; energy-delay product
optimization; Equations; instruction cache; instruction
sets; Mathematical model; nonbranch operation; ODBP;
on-demand branch prediction; on-demand dynamic branch
prediction; OoO processor; out-of-order processor;
parallel processing; Pipelines; power budget; program
compilers; Program processors; single thread
performance; speculative execution; static and dynamic
branch prediction hybrid; static branch prediction;
superscalar processor; table lookup; Tin",
keywords-plus = "MICROPROCESSOR; DESIGN",
number-of-cited-references = "27",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Mohammadi:2015:DDB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Azriel:2015:PMT,
author = "Leonid Azriel and Avi Mendelson and Uri Weiser",
title = "Peripheral Memory: a Technique for Fighting Memory
Bandwidth Bottleneck",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "54--57",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2319077",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Memory bottleneck has always been a major cause for
limiting the performance of computer systems. While in
the past latency was the major concern, today, lack of
bandwidth becomes a limiting factor as well, as a
result of exploiting more parallelism with the growing
number of cores per die, which intensifies the pressure
on the memory bus. In such an environment, any
additional traffic to memory, such as the I/O traffic
may lead to degradation of the overall performance of
the system. This work introduces the concept of
Peripheral Memory, a software controlled memory that
resides in the I/O domain and can be used for
offloading I/O traffic from CPU memory. The Peripheral
Memory handles `I/O exclusive data', data originated
and terminated at I/O domain, and which does not need
any processing by the CPU. The paper analyses the
impact of I/O traffic on the overall performance of the
current systems and demonstrates that in numerous
applications, I/O exclusive data occupies major part of
memory bandwidth, as a result, degrading CPU processing
performance and increasing power. The paper considers
four different implementations of the Peripheral
Memory: pageable, pinned, non-coherent split-traffic
and copy-on-access. Our full-system simulator indicates
that non-coherent split traffic configuration is the
most efficient implementation, which can provide up to
four times speedup in the I/O processing rate for
typical I/O intensive applications. In addition, based
on Power model and measurements tools, the paper
demonstrates that the Peripheral Memory in a server
system can lead to reduction of tens of Watts in the
overall system power consumption or 10-20 percent of
the system power budget.",
acknowledgement = ack-nhfb,
affiliation = "Azriel, L (Reprint Author), Technion Israel Inst
Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
Azriel, Leonid; Mendelson, Avi; Weiser, Uri, Technion
Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa,
Israel.",
author-email = "leonida@tx.technion.ac.il
avi.mendelson@tce.technion.ac.il
uri.weiser@ee.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; bandwidth allocation; Benchmark testing;
computer system performance; CPU memory; full-system
simulator; I/O domain; I/O traffic offloading;
input/output devices; Instruction sets; interconnection
architectures; main memory; memory bandwidth
bottleneck; memory bus; Memory management; parallelism;
performance evaluation; Performance evaluation;
peripheral memory; Power demand; Power measurement;
server system; software controlled memory; storage
management; system buses",
keywords-plus = "NETWORK; I/O",
number-of-cited-references = "12",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Azriel:2015:PMT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Wang:2015:PTM,
author = "Zhaoguo Wang and Han Yi and Ran Liu and Mingkai Dong
and Haibo Chen",
title = "Persistent Transactional Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "58--61",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2329832",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper proposes persistent transactional memory
(PTM), a new design that adds durability to
transactional memory (TM) by incorporating the emerging
non-volatile memory (NVM). PTM dynamically
tracks transactional updates to cache lines to ensure
the ACI (atomicity, consistency and isolation)
properties during cache flushes and leverages an undo
log in NVM to ensure PTM can always consistently
recover transactional data structures from a machine
crash. This paper describes the PTM design based on
Intel's restricted transactional memory. A preliminary
evaluation using a concurrent key/value store and a
database with a cache-based simulator shows that the
number of additional cache line flushes is small.",
acknowledgement = ack-nhfb,
affiliation = "Wang, ZG (Reprint Author), Shanghai Jiao Tong Univ,
Shanghai Key Lab Scalable Comp \& Syst, Shanghai
200030, Peoples R China. Wang, Zhaoguo; Yi, Han; Liu,
Ran; Dong, Mingkai; Chen, Haibo, Shanghai Jiao Tong
Univ, Shanghai Key Lab Scalable Comp \& Syst, Shanghai
200030, Peoples R China. Wang, Zhaoguo; Yi, Han; Liu,
Ran; Dong, Mingkai; Chen, Haibo, Shanghai Jiao Tong
Univ, Inst Parallel \& Distributed Syst, Shanghai
200030, Peoples R China.",
author-email = "tigerwang1986@gmail.com ken.yihan1990@gmail.com
naruilone@gmail.com mingkaidong@gmail.com
haibochen@sjtu.edu.cn",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ACI properties; Batteries; cache line flushes; cache
storage; cache-based simulator; Computer crashes; Data
structures; Databases; Hardware; Hardware transactional
memory; non-volatile random access memory; Nonvolatile
memory; nonvolatile memory; NVM; persistent
transactional memory; PTM design; Registers",
number-of-cited-references = "10",
research-areas = "Computer Science",
times-cited = "5",
unique-id = "Wang:2015:PTM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Gibert:2015:PSR,
author = "Enric Gibert and Raul Mart{\'\i}nez and Carlos
Madriles and Josep M. Codina",
title = "Profiling Support for Runtime Managed Code: Next
Generation Performance Monitoring Units",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "62--65",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2321398",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Given the increase of runtime managed code
environments in desktop, server, and mobile segments,
agile, flexible, and accurate performance monitoring
capabilities are required in order to perform wise code
transformations and optimizations. Common profiling
strategies, mainly based on instrumentation and current
performance monitoring units (PMUs), are not adequate
and new innovative designs are necessary. In this
paper, we present the desired characteristics of what
we call next generation PMUs and advocate for
hardware/software collaborative approaches where
hardware implements the profiling hooks and mechanisms
and software implements the complex heuristics. We then
propose a first design in which the hardware uses a
small, yet flexible table to profile specific code
regions and the software decides what/when/how to
profile. This first design meets all the required
features, and we intend it as the seed for future PMU
extensions that enable novel dynamic code
transformations and optimizations.",
acknowledgement = ack-nhfb,
affiliation = "Gibert, E (Reprint Author), Intel Corp, Intel Labs,
Intel Barcelona Res Ctr IBRC, Edifici Nexus 2, Planta
0-D, Jordi Girona 29, Barcelona, Spain. Gibert, Enric;
Martinez, Raul; Madriles, Carlos; Codina, Josep M.,
Intel Corp, Intel Labs, Intel Barcelona Res Ctr IBRC,
Barcelona, Spain.",
author-email = "enric.gibert.codina@intel.com raul.martinez@intel.com
carlos.madriles.gimeno@intel.com
josep.m.codina@intel.com",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "dynamic code optimizations; dynamic code
transformations; groupware; Hardware; hardware-software
collaborative approaches; instrumentation; Instruments;
just in time (JIT) compiler; Monitoring; next
generation performance monitoring units; optimising
compilers; Optimization; Performance monitoring unit
(PMU); Phasor measurement units; PMUs; profiling;
profiling hooks; profiling support; Runtime; runtime
managed code; runtime managed code environments;
Software; software performance evaluation; system
monitoring",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "3",
unique-id = "Gibert:2015:PSR",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{You:2015:QSA,
author = "Daecheol You and Ki-Seok Chung",
title = "Quality of Service-Aware Dynamic Voltage and Frequency
Scaling for Embedded {GPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "66--69",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2319079",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Dynamic voltage and frequency scaling (DVFS) is a key
technique for reducing processor power consumption in
mobile devices. In recent years, mobile system-on-chips
(SoCs) have supported DVFS for embedded graphics
processing units (GPUs) as the processing power of
embedded GPUs has been increasing steadily. The major
challenge of applying DVFS to a processing unit is to
meet the quality of service (QoS) requirement while
achieving a reasonable power reduction. In the case of
GPUs, the QoS requirement can be specified as the
frame-per-second (FPS) which the target GPU should
achieve. The proposed DVFS technique ensures a
consistent GPU performance by scaling the operating
clock frequency in a way that it maintains a uniform
FPS.",
acknowledgement = ack-nhfb,
affiliation = "You, D (Reprint Author), Hanyang Univ, Dept Elect Comp
\& Commun Engn, Embedded Syst Chip Lab, Seoul 133791,
South Korea. You, Daecheol; Chung, Ki-Seok, Hanyang
Univ, Dept Elect Comp \& Commun Engn, Embedded Syst
Chip Lab, Seoul 133791, South Korea.",
author-email = "khsrdc@hanyang.ac.kr kchung@hanyang.ac.kr",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Clocks; Correlation; DVFS; dynamic
voltage scaling; embedded GPU; Energy consumption;
energy-aware systems; frequency scaling; graphics
processing unit; Graphics processing units; graphics
processing units; Graphics processors;
hardware/software interfaces; low-power design; mobile
device; mobile system-on-chips; operating clock
frequency; power aware computing; processor power
consumption; Quality of service; quality of service;
SoC; System-on-chip; system-on-chip",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "9",
unique-id = "You:2015:QSA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Lee:2015:RDA,
author = "Sungjin Lee and Jihong Kim and Arvind",
title = "Refactored Design of {I/O} Architecture for Flash
Storage",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "70--74",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2329423",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Flash storage devices behave quite differently from
hard disk drives (HDDs); a page on flash has to be
erased before it can be rewritten, and the erasure has
to be performed on a block which consists of a large
number of contiguous pages. It is also important to
distribute writes evenly among flash blocks to avoid
premature wearing. To achieve interoperability with
existing block I/O subsystems for HDDs, NAND flash
devices employ an intermediate software layer, called
the flash translation layer (FTL), which hides these
differences. Unfortunately, FTL implementations require
powerful processors with a large amount of DRAM in
flash controllers and also incur many unnecessary I/O
operations which degrade flash storage performance and
lifetime. In this paper, we present a refactored design
of I/O architecture for flash storage which
dramatically increases storage performance and lifetime
while decreasing the cost of the flash controller. In
comparison with page-level FTL, our preliminary
experiments show a reduction of 19 percent in I/O
operations, an improvement of I/O performance by 9
percent and of storage lifetime by 36 percent. In
addition, our scheme uses only 1/128 of the DRAM memory
in the flash
controller.",
acknowledgement = ack-nhfb,
affiliation = "Lee, S (Reprint Author), MIT, 77 Massachusetts Ave,
Cambridge, MA 02139 USA. Lee, Sungjin; Arvind, MIT,
Cambridge, MA 02139 USA. Kim, Jihong, Seoul Natl Univ,
Sch Comp Sci \& Engn, Seoul, South Korea.",
author-email = "chamdoo@gmail.com jihong@davinci.snu.ac.kr
arvind@csail.mit.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; block I/O subsystems; Computer
architecture; DRAM chips; DRAM memory; file systems;
flash blocks; flash memories; flash storage; flash
translation layer; hard disk drives; HDDs; I/O
architecture; I/O architectures; input-output programs;
intermediate software layer; interoperability; NAND
circuits; NAND flash devices; NAND flash memory;
page-level FTL; Performance evaluation; premature
wearing; Random access memory; Runtime; Storage
management; Storage systems",
number-of-cited-references = "15",
research-areas = "Computer Science",
times-cited = "7",
unique-id = "Lee:2015:RDA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Yuan:2015:SGR,
author = "Fengkai Yuan and Zhenzhou Ji and Suxia Zhu",
title = "Set-Granular Regional Distributed Cooperative
Caching",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "75--78",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2319258",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The last level cache (LLC) in private configurations
offer lower latency and isolation but extinguishes the
possibility of sharing underutilized cache resources.
Cooperative Caching (CC) provides capacity sharing by
spilling a line evicted from one cache to another.
Current studies focus on efficient capacity sharing,
while the adaptability of CC to manycore environment
deserves more attentions. In this paper, we present
Set-granular Regional Distributed Cooperative Caching
to optimize CC in manycore CMPs with private LLCs. We
achieve efficient capacity sharing by a low-traffic
global receiver tracking mechanism and provide a method
to manage set-grain cache state transitions for
exclusive LLCs. Experiment results show that SRDCC
performs better than baseline system, running different
workloads varying in receiver-spiller number and
distribution, in execution time up to 15.55 percent and
memory access up to 40.25 percent, at a negligible cost
of network traffics (6.21 percent more than baseline
system at worst).",
acknowledgement = ack-nhfb,
affiliation = "Yuan, FK (Reprint Author), Harbin Inst Technol, Sch
Comp Sci \& Technol, Harbin 150006, Heilongjiang,
Peoples R China. Yuan, Fengkai; Ji, Zhenzhou; Zhu,
Suxia, Harbin Inst Technol, Sch Comp Sci \& Technol,
Harbin 150006, Heilongjiang, Peoples R China.",
author-email = "yuan.fengkai@gmail.com jizhenzhou@hit.edu.cn",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache coherence protocol; cache resource sharing;
Cache storage; cache storage; capacity sharing; CC;
chip multiprocessors; cooperative caching; Cooperative
caching; last level cache; LLC; manycore CMP;
multiprocessing systems; on-chip networks; private
cache configuration; Protocols; Radiation detectors;
receiver-spiller distribution; receiver-spiller number;
Receivers; set-grain cache state transition;
set-granular regional distributed cooperative caching;
Telecommunication traffic; Tiled CMP",
keywords-plus = "CHIP MULTIPROCESSORS",
number-of-cited-references = "9",
ORCID-numbers = "Yuan, Fengkai/0000-0003-2615-8642",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Yuan:2015:SGR",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Lee:2015:SSI,
author = "Junghee Lee and Youngjae Kim and Jongman Kim and Galen
M. Shipman",
title = "Synchronous {I/O} Scheduling of Independent Write
Caches for an Array of {SSDs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "79--82",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2298394",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Solid-state drives (SSD) offer a significant
performance improvement over the hard disk drives
(HDD), however, it can exhibit a significant variance
in latency and throughput due to internal garbage
collection (GC) process on the SSD. When the SSDs are
configured in a RAID, the performance variance of
individual SSDs could significantly degrade the overall
performance of the RAID of SSDs. The internal cache on
the RAID controller can help mitigate the performance
variability issues of SSDs in the array; however, the
state-of-the-art cache algorithm of the RAID controller
does not consider the characteristics of SSDs. In this
paper, we examine the most recent write cache algorithm
for the array of disks, and propose a synchronous
independent write cache (SIW) algorithm. We also
present a pre-parity-computation technique for the RAID
of SSDs with parity computations, which calculates
parities of blocks in advance before they are stored in
the write cache. With this new technique, we propose a
complete paradigm shift in the design of write cache.
In our evaluation study, workloads dominated by large
write requests show up to about 50 and 20 percent
improvements in average response time on RAID-0 and
RAID-5, respectively, as compared to the
state-of-the-art write cache algorithm.",
acknowledgement = ack-nhfb,
affiliation = "Lee, J (Reprint Author), Univ Texas San Antonio, San
Antonio, TX 78229 USA. Lee, Junghee, Univ Texas San
Antonio, San Antonio, TX 78229 USA. Kim, Youngjae, Ajou
Univ, Suwon 441749, South Korea. Kim, Jongman, Georgia
Inst Technol, Atlanta, GA 30332 USA. Shipman, Galen M.,
Oak Ridge Natl Lab, Oak Ridge, TN USA.",
author-email = "junghee.lee@utsa.edu youkim@gmail.com
jkim@ece.gatech.edu gshipman@ornl.gov",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Algorithm design and analysis; Arrays; cache storage;
Delays; disks array; flash memory; GC process; hard
disk drives; HDD; I/O scheduling; independent write
caches; input-output programs; internal cache; internal
garbage collection process; memory architecture;
pre-parity-computation technique; RAID; RAID
controller; Redundant array of independent disks
(RAID); Redundant Array of Independent Disks (RAID);
Redundant array of independent disks (RAID);
scheduling; SIW algorithm; solid-state drive (SSD);
Solid-State Drive (SSD); solid-state drive (SSD);
solid-state drives; SSD; Strips; Synchronization;
synchronous I/O scheduling; synchronous independent
write cache algorithm; Time factors; write cache; Write
cache; write cache; write cache design; write
requests",
number-of-cited-references = "8",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Lee:2015:SSI",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2015:RSW,
author = "Anonymous",
title = "Rock Stars of Wearables",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "83--83",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2447192",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2015:RSC,
author = "Anonymous",
title = "Rock Stars of Cybersecurity 2015 Conference",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "84--84",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2447191",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2015:TCa,
author = "Anonymous",
title = "Table of Contents",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "C1--C1",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2446391",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2015:ICAa,
author = "Anonymous",
title = "{{\booktitle{IEEE Computer Architecture Letters}}
Editorial Board}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "C2--C2",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2446392",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2015:ICAb,
author = "Anonymous",
title = "{{\booktitle{IEEE Computer Architecture Letters}}}
Information for Authors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "C3--C3",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2446393",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2015:ICSa,
author = "Anonymous",
title = "{IEEE Computer Society}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "C4--C4",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2446394",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Shi:2015:CLM,
author = "Qingchuan Shi and Henry Hoffmann and Omer Khan",
title = "A Cross-Layer Multicore Architecture to Tradeoff
Program Accuracy and Resilience Overheads",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "85--89",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2365204",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "To protect multicores from soft-error perturbations,
resiliency schemes have been developed with high
coverage but high power/performance overheads
(approximately 2x). We observe that not all soft-errors
affect program correctness; some soft-errors only affect
program accuracy, i.e., the program completes with
certain acceptable deviations from the soft-error-free
outcome. Thus, it is practical to improve processor
efficiency by trading off resilience overheads with
program accuracy. We propose the idea of declarative
resilience that selectively applies resilience schemes
to both crucial and non-crucial code, while ensuring
program correctness. At the application level, crucial
and non-crucial code is identified based on its impact
on the program outcome. The hardware collaborates with
software support to enable efficient resilience with
100 percent soft-error coverage. Only program accuracy
is compromised in the worst-case scenario of a
soft-error strike during non-crucial code execution.
For a set of multithreaded benchmarks, declarative
resilience improves completion time by an average of 21
percent over a state-of-the-art hardware resilience
scheme that protects all executed code. Its performance
overhead is approximately 1.38x over a multicore that
does not support resilience.",
acknowledgement = ack-nhfb,
affiliation = "Shi, QC (Reprint Author), Univ Connecticut, Dept Elect
\& Comp Engn, Storrs, CT 06269 USA. Shi, Qingchuan;
Khan, Omer, Univ Connecticut, Dept Elect \& Comp Engn,
Storrs, CT 06269 USA. Hoffmann, Henry, Univ Chicago,
Dept Comp Sci, Chicago, IL 60637 USA.",
author-email = "qingchuan.shi@uconn.edu hankhoffmann@cs.uchicago.edu
khan@uconn.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accuracy; Benchmark testing; code execution;
Instruction sets; multi-threading; multicore
architecture; Multicore processing; multicores;
multithreaded benchmark; program accuracy; Resilience;
resilience overhead; Soft errors; soft-error
perturbation; soft-errors; software architecture;
software fault tolerance",
number-of-cited-references = "23",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Shi:2015:CLM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Zheng:2015:ACC,
author = "Zhong Zheng and Zhiying Wang and Mikko Lipasti",
title = "Adaptive Cache and Concurrency Allocation on
{GPGPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "90--93",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2359882",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Memory bandwidth is critical to GPGPU performance.
Exploiting locality in caches can better utilize memory
bandwidth. However, memory requests issued by excessive
threads cause cache thrashing and saturate memory
bandwidth, degrading performance. In this paper, we
propose adaptive cache and concurrency allocation (CCA)
to prevent cache thrashing and improve the utilization
of bandwidth and computational resources, hence
improving performance. According to the locality and
reuse distance of access patterns in a GPGPU program,
warps on
a stream multiprocessor are dynamically divided into
three groups: cached, bypassed, and waiting. The data
cache accommodates the footprint of cached warps.
Bypassed warps cannot allocate cache lines in the data
cache to prevent cache thrashing, but are able to take
advantage of available memory bandwidth and
computational resource. Waiting warps are de-scheduled.
Experimental results show that adaptive CCA can
significantly improve benchmark performance, with 80
percent harmonic mean IPC improvement over the
baseline.",
acknowledgement = ack-nhfb,
affiliation = "Zheng, Z (Reprint Author), Natl Univ Def Technol,
State Key Lab High Performance Comp, Changsha, Hunan,
Peoples R China. Zheng, Zhong; Wang, Zhiying, Natl Univ
Def Technol, State Key Lab High Performance Comp,
Changsha, Hunan, Peoples R China. Zheng, Zhong; Wang,
Zhiying, Natl Univ Def Technol, Sch Comp, Changsha,
Hunan, Peoples R China. Lipasti, Mikko, Univ Wisconsin,
Dept Elect \& Comp Engn, Madison, WI 54706 USA.",
author-email = "zheng\_zhong@nudt.edu.cn zywang@nudt.edu.cn
mikko@engr.wisc.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "CSC; 863 Program [2012AA010905]; NSFC
[61070037, 61272143, 61272144, 61103016, 61202121];
NUDT [B120607]; RFDP [20114307120013]; NSF
[CCF-1318298]",
funding-text = "This work was partially supported by CSC, 863 Program
(2012AA010905), NSFC (61070037, 61272143, 61272144,
61103016, 61202121), NUDT(B120607), RFDP
(20114307120013), and NSF (CCF-1318298).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "access patterns; adaptive cache-and-concurrency
allocation; Bandwidth; bandwidth utilization
improvement; benchmark performance improvement;
Benchmark testing; bypassed warps; cache; cache lines;
cache locality; Cache memory; cache storage; cache
thrashing prevention; cached warps; CCA; computational
resource utilization improvement; concurrency;
concurrency control; Concurrent computing; GPGPU; GPGPU
performance improvement; graphics processing units;
harmonic mean IPC improvement; Instruction sets; memory
bandwidth saturation; multi-threading; multiprocessing
systems; performance evaluation; Resource management;
reuse distance; stream multiprocessor; waiting warp
descheduling",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Zheng:2015:ACC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Nowatzki:2015:GBP,
author = "Tony Nowatzki and Venkatraman Govindaraju and
Karthikeyan Sankaralingam",
title = "A Graph-Based Program Representation for Analyzing
Hardware Specialization Approaches",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "94--98",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2476801",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Hardware specialization has emerged as a promising
paradigm for future microprocessors. Unfortunately, it
is natural to develop and evaluate such architectures
within end-to-end vertical silos spanning application,
language/compiler, hardware design and evaluation
tools, leaving little opportunity for
cross-architecture analysis and innovation. This paper
develops a novel program representation suitable for
modeling heterogeneous architectures with specialized
hardware, called the transformable dependence graph
(TDG), which combines semantic information about
program properties and low-level hardware events in a
single representation. We demonstrate, using four
example architectures from the literature, that the TDG
is a feasible, simple, and accurate modeling technique
for transparent specialization architectures, enabling
cross-domain comparison and design-space exploration.",
acknowledgement = ack-nhfb,
affiliation = "Nowatzki, T (Reprint Author), Univ Wisconsin, Dept
Comp Sci, 1210 W Dayton St, Madison, WI 53706 USA.
Nowatzki, Tony; Govindaraju, Venkatraman;
Sankaralingam, Karthikeyan, Univ Wisconsin, Dept Comp
Sci, Madison, WI 53706 USA.",
author-email = "tjn@cs.wisc.edu venkatra@cs.wisc.edu
karu@cs.wisc.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Acceleration; accelerators; computer architecture;
Computer architecture; dependence graphs; graph theory;
graph-based program representation; Hardware
specialization; hardware specialization approach;
heterogeneous architecture modeling; Load modeling;
Microarchitecture; microprocessors; Microprocessors;
modelling; program representation; Specialization;
Specialization, accelerators, modelling, program
representation, dependence graphs; TDG; transformable
dependence graph; Transforms",
number-of-cited-references = "10",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Nowatzki:2015:GBP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kim:2015:PEM,
author = "Seung Hun Kim and Dohoon Kim and Changmin Lee and Won
Seob Jeong and Won Woo Ro and Jean-Luc Gaudiot",
title = "A Performance-Energy Model to Evaluate Single Thread
Execution Acceleration",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "99--102",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2368144",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "It is well known that the cost of executing the
sequential portion of a program will limit and
sometimes even eclipse the gains brought by processing
in parallel the rest of the program. This means that
serious consideration should be brought to bear on
accelerating the execution of this unavoidable
sequential part. Such acceleration can be done by
boosting the operating frequency in a symmetric
multicore processor. In this paper, we derive a
performance and power model to describe the
implications of this approach. From our model, we show
that the ratio of performance over energy during the
sequential part improves with an increase in the number
of cores. In addition, we demonstrate how to determine
with the proposed model the optimal frequency boosting
ratio which maximizes energy efficiency.",
acknowledgement = ack-nhfb,
affiliation = "Kim, SH (Reprint Author), Yonsei Univ, Sch Elect \&
Elect Engn, Seoul 120749, South Korea. Kim, Seung Hun;
Kim, Dohoon; Lee, Changmin; Jeong, Won Seob; Ro, Won
Woo, Yonsei Univ, Sch Elect \& Elect Engn, Seoul
120749, South Korea. Gaudiot, Jean-Luc, Univ Calif
Irvine, Dept Elect Engn \& Comp Sci, Irvine, CA USA.",
author-email = "kseunghun@gmail.com dohoon.kim@yonsei.ac.kr
exahz@yonsei.ac.kr ws.jeong@yonsei.ac.kr
wro@yonsei.ac.kr gaudiot@uci.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Basic Science Research Program through the
National Research Foundation of Korea (NRF) ---
Ministry of Education [2010-0013202]; National Science
Foundation [CCF-1439165]",
funding-text = "This work was supported in part by the Basic Science
Research Program through the National Research
Foundation of Korea (NRF) funded by the Ministry of
Education (2010-0013202) and by the National Science
Foundation, under award CCF-1439165. Any opinions,
findings, and conclusions expressed in this material
are those of the authors and do not necessarily reflect
the views of the sponsors. W. W. Ro is the
corresponding author.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "energy efficiency; Energy management; energy-aware
systems; Mathematical model; Microprocessors; Multicore
processing; multiprocessing systems; multiprocessor
systems; optimal frequency boosting ratio; parallel
processing; performance evaluation; Performance
evaluation; Performance modeling; performance-energy
model; power aware computing; Power demand; single
thread execution acceleration; symmetric multicore
processor",
keywords-plus = "AMDAHLS LAW; ERA",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Kim:2015:PEM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Song:2015:ARL,
author = "William Song and Saibal Mukhopadhyay and Sudhakar
Yalamanchili",
title = "Architectural Reliability: Lifetime Reliability
Characterization and Management of Many-Core
Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "103--106",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2340873",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper presents a lifetime reliability
characterization of many-core processors based on a
full-system simulation of integrated microarchitecture,
power, thermal, and reliability models. Under normal
operating conditions, our model and analysis reveal
that the mean-time-to-failure of cores on the die
follows a normal distribution. From the processor-level
perspective, the key insight is that reducing the
variance of the distribution can improve lifetime
reliability by avoiding early failures. Based on this
understanding, we present two variance reduction
techniques for proactive reliability management: (i)
proportional dynamic voltage-frequency scaling (DVFS)
and (ii) coordinated thread swapping. A major advantage
of using variance reduction techniques is that the
improvement of system lifetime reliability can be
achieved without adding design margins or spare
components.",
acknowledgement = ack-nhfb,
affiliation = "Song, W (Reprint Author), Georgia Inst Technol, Sch
Elect \& Comp Engn, Atlanta, GA 30332 USA. Song,
William; Mukhopadhyay, Saibal; Yalamanchili, Sudhakar,
Georgia Inst Technol, Sch Elect \& Comp Engn, Atlanta,
GA 30332 USA.",
author-email = "wjhsong@gatech.edu saibal.mukhopadhyay@ece.gatech.edu
sudha.yalamanchili@ece.gatech.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Semiconductor Research Corporation
[2084.001]; IBM/SRC Graduate Fellowship; Sandia
National Laboratories",
funding-text = "This research was supported by the Semiconductor
Research Corporation under task \#2084.001, IBM/SRC
Graduate Fellowship, and Sandia National
Laboratories.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architectural reliability; Benchmark testing; Computer
architecture; Computer architecture, lifetime
estimation, modeling, semiconductor device reliability,
simulation; coordinated thread swapping; core
mean-time-to-failure; Degradation; design margins;
DVFS; full-system simulation; Gaussian distribution;
integrated circuit design; Integrated circuit
reliability; integrated microarchitecture; lifetime
estimation; lifetime reliability characterization;
many-core processors; Microarchitecture; microprocessor
chips; modeling; multiprocessing systems; normal
operating conditions; power aware computing; power
models; Program processors; proportional dynamic
voltage-frequency scaling; reliability models;
semiconductor device reliability; simulation; spare
components; thermal models; variance reduction
techniques",
number-of-cited-references = "12",
research-areas = "Computer Science",
times-cited = "3",
unique-id = "Song:2015:ARL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Poluri:2015:SET,
author = "Pavan Poluri and Ahmed Louri",
title = "A Soft Error Tolerant Network-on-Chip Router Pipeline
for Multi-Core Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "107--110",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2360686",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Network-on-Chip (NoC) paradigm is rapidly evolving
into an efficient interconnection network to handle the
strict communication requirements between the
increasing number of cores on a single chip.
Diminishing transistor size is making the NoC
increasingly vulnerable to both hard faults and soft
errors. This paper concentrates on soft errors in NoCs.
A soft error in an NoC router results in significant
consequences such as data corruption, packet
retransmission and deadlock among others. To this end,
we propose the Soft Error Tolerant NoC Router (STNR)
architecture, which is capable of detecting and
recovering from soft errors occurring in different
control stages of the routing pipeline. STNR exploits
idle cycles inherent in the NoC packet routing pipeline
to perform the time-redundant executions necessary
for soft error tolerance. In doing so, STNR is able to
detect and correct all single transient faults in the
control stages of the pipeline. Simulation results
using PARSEC and SPLASH-2 benchmarks show that STNR is
able to accomplish such a high level of soft error
protection with a minimal impact on latency (an
increase of 1.7 and 1.6 percent respectively).
Additionally, STNR incurs an area overhead of 7 percent
and power overhead of 13 percent as compared to the
baseline unprotected router.",
acknowledgement = ack-nhfb,
affiliation = "Poluri, P (Reprint Author), Univ Arizona, Dept Elect
\& Comp Engn, Tucson, AZ 85721 USA. Poluri, Pavan;
Louri, Ahmed, Univ Arizona, Dept Elect \& Comp Engn,
Tucson, AZ 85721 USA.",
author-email = "pavanp@email.arizona.edu louri@email.arizona.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "US National Science Foundation (NSF)
[CNS-1318997, ECCS-0725765, ECCS-1342702,
CCF-1420681]",
funding-text = "This research was supported by US National Science
Foundation (NSF) awards CNS-1318997, ECCS-0725765,
ECCS-1342702 and CCF-1420681.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; data corruption; deadlock;
fault tolerance; hard faults; idle cycles; integrated
circuit reliability; interconnection network; Multicore
processing; multicore systems; multiprocessing systems;
network routing; Network-on-chip; network-on-chip;
Network-on-chip; NoC packet routing pipeline; packet
retransmission; PARSEC; performance; Pipelines; Ports
(Computers); radiation hardening (electronics);
reliability; Resource management; single chip; single
transient faults; soft error; soft error protection;
soft error tolerance; soft error tolerant
network-on-chip router pipeline; soft error tolerant
NoC router architecture; SPLASH-2 benchmarks; STNR
architecture; Switches; time redundant executions;
Transient analysis; transistor size",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Poluri:2015:SET",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Xiao:2015:SCD,
author = "Canwen Xiao and Yue Yang and Jianwen Zhu",
title = "A Sufficient Condition for Deadlock-Free Adaptive
Routing in Mesh Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "111--114",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2363829",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Deadlock remains a central problem in interconnection
network. In this paper, we establish a new theory of
deadlock-free flow control for k-ary, n-cube mesh
network, which enables the use of any minimal-path
adaptive routing algorithms while avoiding deadlock. We
prove that the proposed flow control algorithm is a
sufficient condition for deadlock freedom in any
minimal path, adaptive routing algorithms on k-ary,
n-cube mesh network.",
acknowledgement = ack-nhfb,
affiliation = "Xiao, CW (Reprint Author), Natl Univ Def Technol,
Changsha, Hunan, Peoples R China. Xiao, Canwen, Natl
Univ Def Technol, Changsha, Hunan, Peoples R China.
Yang, Yue; Zhu, Jianwen, Univ Toronto, Dept Elect \&
Comp Engn, Toronto, ON, Canada.",
author-email = "cwxiao@nudt.edu.cn yyang@eecg.toronto.edu
jzhu@eecg.toronto.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "``863'' program of China [2012AA01A301,
2013AA014301]",
funding-text = "This work is supported by ``863'' program of China
(2012AA01A301, 2013AA014301).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Adaptive systems; Aerospace electronics; concurrency
control; deadlock avoidance; Deadlock-Free;
deadlock-free adaptive routing; deadlock-free flow
control; flow control; interconnection network; k-ary;
k-ary mesh network; mesh networks; Mesh networks;
minimal path routing algorithm; minimal-path adaptive
routing algorithms; Multiprocessor interconnection;
multiprocessor interconnection networks; n-cube mesh
network; Routing; sufficient condition; System
recovery; Wireless mesh networks",
number-of-cited-references = "7",
research-areas = "Computer Science",
researcherid-numbers = "Yang, Yue/N-8370-2019",
times-cited = "1",
unique-id = "Xiao:2015:SCD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Mittal:2015:ATE,
author = "Sparsh Mittal and Jeffrey S. Vetter",
title = "{AYUSH}: a Technique for Extending Lifetime of
{SRAM--NVM} Hybrid Caches",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "115--118",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2355193",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Recently, researchers have explored way-based hybrid
SRAM-NVM (non-volatile memory) last level caches (LLCs)
to bring the best of SRAM and NVM together. However,
the limited write endurance of NVMs restricts the
lifetime of these hybrid caches. We present AYUSH, a
technique to enhance the lifetime of hybrid caches,
which works by using data-migration to preferentially
use SRAM for storing frequently-reused data.
Microarchitectural simulations confirm that AYUSH
achieves a larger improvement in lifetime than a
previous technique while maintaining performance and
energy efficiency. For single-, dual-, and quad-core
workloads, the average increase in cache lifetime with
AYUSH is 6.90x, 24.06x, and 47.62x, respectively.",
acknowledgement = ack-nhfb,
affiliation = "Mittal, S (Reprint Author), Oak Ridge Natl Lab, Div
Math \& Comp Sci, Oak Ridge, TN 37831 USA. Mittal,
Sparsh; Vetter, Jeffrey S., Oak Ridge Natl Lab, Div
Math \& Comp Sci, Oak Ridge, TN 37831 USA.",
author-email = "mittals@ornl.gov vetter@ornl.gov",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "AYUSH; Benchmark testing; Cache memory; cache storage;
data-migration; device lifetime; energy efficiency;
Energy loss; hybrid cache; last level caches;
microarchitectural simulation; Non-volatile memory
(NVM); nonvolatile memory; Nonvolatile memory;
Radiation detectors; Random access memory; SRAM; SRAM
chips; SRAM-NVM cache; SRAM-NVM hybrid caches; write
endurance",
keywords-plus = "ENERGY; MODEL",
number-of-cited-references = "17",
ORCID-numbers = "Vetter, Jeffrey/0000-0002-2449-6720 Mittal,
Sparsh/0000-0002-2908-993X",
research-areas = "Computer Science",
times-cited = "11",
unique-id = "Mittal:2015:ATE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
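
The entry above describes AYUSH only at the level of its goal: keep
frequently-reused blocks in the SRAM ways of a way-based hybrid SRAM--NVM
last-level cache so that the write-endurance-limited NVM ways see fewer
writes. The Python sketch below illustrates that placement idea under
stated assumptions; the class name, way counts, reuse threshold, and LRU
bookkeeping are illustrative choices, not the policy from the paper.

# Illustrative sketch of a way-based hybrid SRAM--NVM cache set that
# preferentially keeps frequently-reused blocks in SRAM ways.  The
# threshold, way counts, and data structures are assumptions for
# illustration only; they are not taken from the AYUSH paper.

from collections import OrderedDict

class HybridSet:
    def __init__(self, sram_ways=2, nvm_ways=6, reuse_threshold=2):
        self.sram = OrderedDict()           # tag -> reuse count, in LRU order
        self.nvm = OrderedDict()
        self.sram_ways = sram_ways
        self.nvm_ways = nvm_ways
        self.reuse_threshold = reuse_threshold

    def access(self, tag):
        for region in (self.sram, self.nvm):
            if tag in region:
                region[tag] += 1
                region.move_to_end(tag)     # refresh LRU position
                # promote proven-hot blocks into SRAM to spare NVM writes
                if region is self.nvm and region[tag] >= self.reuse_threshold:
                    self._promote(tag)
                return True                 # hit
        self._fill(tag)
        return False                        # miss

    def _promote(self, tag):
        count = self.nvm.pop(tag)
        if len(self.sram) >= self.sram_ways:
            victim, vcount = self.sram.popitem(last=False)
            if len(self.nvm) >= self.nvm_ways:
                self.nvm.popitem(last=False)
            self.nvm[victim] = vcount       # demote the coldest SRAM block
        self.sram[tag] = count

    def _fill(self, tag):
        if len(self.nvm) >= self.nvm_ways:
            self.nvm.popitem(last=False)    # evict LRU NVM block
        self.nvm[tag] = 1                   # new fills start in NVM

The design point illustrated here is that new fills land in NVM and only
blocks that demonstrate reuse are promoted into the scarce SRAM ways,
with cold SRAM blocks demoted to make room.
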
@Article{Manohar:2015:CSD,
author = "Rajit Manohar",
title = "Comparing Stochastic and Deterministic Computing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "119--122",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2412553",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Technology scaling has raised the specter of myriads
of cheap, but unreliable and/or stochastic devices that
must be creatively combined to create a reliable
computing system. This has renewed interest in
computing that exploits stochasticity, embracing rather
than combating the device physics. If a stochastic
representation is used to implement a programmable
general-purpose architecture akin to CPUs, GPUs, or
FPGAs, the preponderance of evidence indicates that
most of the system energy will be expended in
communication and storage as opposed to computation.
This paper presents an analytical treatment of the
benefits and drawbacks of adopting a stochastic
approach by examining the cost of representing a value.
We show both scaling laws and costs for low precision
representations. We also analyze the cost of
multiplication implemented using stochastic versus
deterministic approaches, since multiplication is the
prototypical inexpensive stochastic operation. We show
that the deterministic approach compares favorably to
the stochastic approach when holding precision and
reliability constant.",
acknowledgement = ack-nhfb,
affiliation = "Manohar, R (Reprint Author), Cornell Univ, Cornell
Tech, New York, NY 10011 USA. Manohar, Rajit, Cornell
Univ, Cornell Tech, New York, NY 10011 USA.",
author-email = "rajit@csl.cornell.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Complexity theory; Computer architecture;
deterministic computing; Encoding; field programmable
gate arrays; FPGAs; general-purpose architecture; GPUs;
graphics processing units; Logic gates; Receivers;
reliable computing system; stochastic computing;
Stochastic processes; stochastic processes; stochastic
representation",
number-of-cited-references = "18",
research-areas = "Computer Science",
times-cited = "5",
unique-id = "Manohar:2015:CSD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
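
Manohar's comparison hinges on the cost of representing a value
stochastically: in the textbook unipolar encoding, a value in [0, 1]
becomes a Bernoulli bit stream, multiplication reduces to a bitwise AND,
and the estimate's standard error shrinks only as 1/sqrt(N), so each
extra bit of precision roughly quadruples the stream length. The sketch
below is a minimal illustration of that encoding; the stream length,
RNG, and function names are assumptions, not anything from the letter.

# Minimal sketch of stochastic multiplication: values in [0, 1] are
# encoded as Bernoulli bit streams and multiplied with a bitwise AND.
# Stream length N and the RNG are illustrative choices; the estimate's
# standard error shrinks only as 1/sqrt(N), which is why high-precision
# stochastic representations are expensive compared to binary.

import random

def encode(p, n, rng):
    return [1 if rng.random() < p else 0 for _ in range(n)]

def decode(stream):
    return sum(stream) / len(stream)

def stochastic_multiply(a, b, n=4096, seed=0):
    rng = random.Random(seed)
    sa = encode(a, n, rng)
    sb = encode(b, n, rng)
    return decode([x & y for x, y in zip(sa, sb)])

if __name__ == "__main__":
    print(stochastic_multiply(0.5, 0.25))   # approx 0.125, plus sampling noise

Running it recovers 0.5 x 0.25 only to within sampling noise, which is
the precision-versus-stream-length trade-off the letter analyzes.
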
@Article{Seo:2015:DDF,
author = "Bon-Keun Seo and Seungryoul Maeng and Joonwon Lee and
Euiseong Seo",
title = "{DRACO}: a Deduplicating {FTL} for Tangible Extra
Capacity",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "123--126",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2350984",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The rapid random access of SSDs enables efficient
searching of redundant data and their deduplication.
However, the space earned from deduplication cannot be
used as permanent storage because it must be reclaimed
when deduplication is cancelled as a result of an
update to the deduplicated data. To overcome this
limitation, we propose a novel FTL scheme that enables
the gained capacity to be used as permanent storage
space for the file system layer. The proposed approach
determines the safe amount of gained capacity that can
be provided to the upper layer based on the compression
rate prediction scheme. It then secures the required
space by compressing cold data when capacity overflow
occurs from cancelled deduplication. Our evaluation
with a kernel source repository showed that the file
system obtained approximately 79 percent additional
capacity with the proposed scheme.",
acknowledgement = ack-nhfb,
affiliation = "Seo, BK (Reprint Author), Korea Adv Inst Sci \&
Technol, Dept Comp Sci, Taejon 305701, South Korea.
Seo, Bon-Keun; Maeng, Seungryoul, Korea Adv Inst Sci \&
Technol, Dept Comp Sci, Taejon 305701, South Korea.
Lee, Joonwon; Seo, Euiseong, Sungkyunkwan Univ, Coll
Informat \& Commun Engn, Suwon 440746, South Korea.",
author-email = "joonwon@skku.edu euiseong@skku.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Basic Science Research Program through the
National Research Foundation of Korea
[2012R1A1A2A10038823]",
funding-text = "This research was supported by Basic Science Research
Program through the National Research Foundation of
Korea (2012R1A1A2A10038823).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "capacity overflow; cold data compression; compression;
compression rate prediction scheme; data compression;
data deduplication; Data structures; deduplicating FTL;
deduplication; disc drives; DRACO; Entropy; file system
layer; file systems; File systems; file systems; flash
memories; flash memory; Flash memory; flash memory;
flash translation layer; FTL; kernel source repository;
Linux; over-provisioning; permanent storage space;
rapid random access; redundant data searching; SDRAM;
SSD; storage management; storage reclamation; tangible
extra capacity",
number-of-cited-references = "6",
research-areas = "Computer Science",
researcherid-numbers = "Maeng, Seungryoul/C-1882-2011",
times-cited = "2",
unique-id = "Seo:2015:DDF",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
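
The "safe amount of gained capacity" in the abstract above can be
illustrated with a back-of-the-envelope rule: the FTL may expose only as
much deduplication-gained space as it could later reclaim by compressing
cold data if deduplication is cancelled. The sketch below is purely
illustrative; the function names, the 0.9 safety margin, and the
predicted compression ratio are assumptions, not DRACO's actual policy.

# Illustrative sketch of exposing only "safe" extra capacity: if
# deduplication is later cancelled, the FTL must be able to recover the
# promised space by compressing cold data, so the exposed amount is
# bounded by the predicted reclaimable space.  The names, the 0.9
# safety margin, and the predicted ratio are assumptions.

def reclaimable_by_compression(cold_bytes, predicted_ratio):
    # predicted_ratio = compressed size / original size for cold data
    return cold_bytes * (1.0 - predicted_ratio)

def safe_extra_capacity(dedup_saved_bytes, cold_bytes,
                        predicted_ratio, margin=0.9):
    reclaimable = reclaimable_by_compression(cold_bytes, predicted_ratio)
    return min(dedup_saved_bytes, margin * reclaimable)

if __name__ == "__main__":
    gib = 1 << 30
    extra = safe_extra_capacity(dedup_saved_bytes=40 * gib,
                                cold_bytes=80 * gib,
                                predicted_ratio=0.55)
    print(f"expose {extra / gib:.1f} GiB of extra capacity to the file system")
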
@Article{Seshadri:2015:FBB,
author = "Vivek Seshadri and Kevin Hsieh and Amirali Boroum and
Donghyuk Lee and Michael A. Kozuch and Onur Mutlu and
Phillip B. Gibbons and Todd C. Mowry",
title = "Fast Bulk Bitwise {AND} and {OR} in {DRAM}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "127--131",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2434872",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Bitwise operations are an important component of
modern-day programming and are used in a variety of
applications such as databases. In this work, we
propose a new and simple mechanism to implement bulk
bitwise AND and OR operations in DRAM, which is faster
and more efficient than existing mechanisms. Our
mechanism exploits existing DRAM operation to perform a
bitwise AND/OR of two DRAM rows completely within DRAM.
The key idea is to simultaneously connect three cells
to a bitline before the sense-amplification. By
controlling the value of one of the cells, the sense
amplifier forces the bitline to the bitwise AND or
bitwise OR of the values of the other two cells. Our
approach can improve the throughput of bulk bitwise
AND/OR operations by 9.7X and reduce their energy
consumption by 50.5X. Since our approach exploits
existing DRAM operation as much as possible, it
requires negligible changes to DRAM logic. We evaluate
our approach using a real-world implementation of a
bit-vector based index for databases. Our mechanism
improves the performance of commonly-used range queries
by 30 percent on average.",
acknowledgement = ack-nhfb,
affiliation = "Seshadri, V (Reprint Author), Carnegie Mellon Univ,
Pittsburgh, PA 15213 USA. Seshadri, Vivek; Hsieh,
Kevin; Boroum, Amirali; Lee, Donghyuk; Mutlu, Onur;
Mowry, Todd C., Carnegie Mellon Univ, Pittsburgh, PA
15213 USA. Kozuch, Michael A.; Gibbons, Phillip B.,
Intel Pittsburgh, Pittsburgh, PA USA.",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [0953246, 1212962, 1320531]; Intel
Science and Tech. Center; Samsung; Google; Facebook;
SRC",
funding-text = "This work was supported by NSF (awards 0953246,
1212962, and 1320531), and Intel Science and Tech.
Center, Samsung, Google, Facebook, and SRC.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "bit-vector based index; bitwise AND/OR; bulk-bitwise
AND operation; bulk-bitwise OR operation; Capacitors;
cell value control; Computer architecture; database
indexing; Decoding; DRAM; DRAM chips; DRAM memory; DRAM
memory, bitwise AND/OR, performance; DRAM operation;
energy consumption reduction; logic gates; performance;
performance improvement; Program processors; Random
access memory; range queries; sense amplifier;
sense-amplification; Throughput; throughput
improvement",
number-of-cited-references = "20",
research-areas = "Computer Science",
times-cited = "21",
unique-id = "Seshadri:2015:FBB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
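
The abstract above states the mechanism informally: with three cells
sharing a bitline, sense amplification resolves the bitline to a value
determined by all three, and fixing one (control) cell selects AND or OR
of the other two. Reading the charge-sharing step as a three-input
majority is the usual way to make that precise, since MAJ(a, b, 0) =
a AND b and MAJ(a, b, 1) = a OR b. The exhaustive check below only
verifies this Boolean identity; it is not a model of the DRAM circuit.

# Boolean identity behind the bulk bitwise AND/OR mechanism: the
# majority of three values, MAJ(a, b, c) = ab + bc + ca, degenerates to
# AND when the control cell c is 0 and to OR when c is 1.

def majority(a, b, c):
    return (a & b) | (b & c) | (c & a)

for a in (0, 1):
    for b in (0, 1):
        assert majority(a, b, 0) == a & b   # control cell = 0 -> bitwise AND
        assert majority(a, b, 1) == a | b   # control cell = 1 -> bitwise OR
print("MAJ(a, b, 0) == AND and MAJ(a, b, 1) == OR for all inputs")
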
@Article{Altaf:2015:LPM,
author = "Muhammad Shoaib Bin Altaf and David A. Wood",
title = "{LogCA}: a Performance Model for Hardware
Accelerators",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "132--135",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2360182",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "To address the Dark Silicon problem, architects have
increasingly turned to special-purpose hardware
accelerators to improve the performance and energy
efficiency of common computational kernels, such as
encryption and compression. Unfortunately, the latency
and overhead required to off-load a computation to an
accelerator sometimes outweighs the potential benefits,
resulting in a net decrease in performance or energy
efficiency. To help architects and programmers reason
about these trade-offs, we have developed the LogCA
model, a simple performance model for hardware
accelerators. LogCA provides a simplified abstraction
of a hardware accelerator characterized by five key
parameters. We have validated the model against a
variety of accelerators, ranging from on-chip
cryptographic accelerators in Sun's UltraSparc T2 and
Intel's Sandy Bridge to both discrete and integrated
GPUs.",
acknowledgement = ack-nhfb,
affiliation = "Bin Altaf, MS (Reprint Author), Univ Wisconsin,
Madison, WI 53706 USA. Bin Altaf, Muhammad Shoaib;
Wood, David A., Univ Wisconsin, Madison, WI 53706
USA.",
author-email = "shoaibbinalt@wisc.edu david@cs.wisc.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CNS-1117280, CCF-1218323,
CNS-1302260]",
funding-text = "We thank Mark Hill, Michael Swift, Rathijit Sen, and
the members of the Wisconsin Multifacet group for their
comments on the paper. This work is supported in part
with NSF grants CNS-1117280, CCF-1218323, and
CNS-1302260. The views expressed herein are not
necessarily those of the NSF. Professor Wood has
significant financial interests in AMD, Google and
Panasas.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accelerators; compression; computational kernel;
Computational modeling; cryptography; dark silicon
problem; encryption; energy conservation; energy
efficiency; GPU; graphics processing units; Hardware
accelerators; heterogeneous systems; Intel Sandy
Bridge; LogCA model; Modeling; modeling techniques;
on-chip cryptographic
accelerator; Performance evaluation; performance model;
performance of systems; special-purpose hardware
accelerator; UltraSparc T2",
number-of-cited-references = "12",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Altaf:2015:LPM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
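
The abstract's central trade-off, that offload overhead and latency can
outweigh an accelerator's raw speedup, can be illustrated with a generic
break-even calculation. This is not the LogCA model itself (the letter
characterizes it by five parameters that are not reproduced here); the
parameter names below (fixed overhead o, per-unit link latency L, host
cost per unit of work c, acceleration A, granularity g) are assumptions
chosen only to show the shape of the curve.

# Generic offload break-even sketch (not the LogCA model): host time is
# c*g, accelerated time is o + L*g + c*g/A.  Speedup is below 1 at small
# granularities and saturates near c / (L + c/A) (about 4.4x with these
# placeholder values) once g amortizes the fixed overhead o.

def host_time(g, c=1.0):
    return c * g

def offload_time(g, c=1.0, A=8.0, o=500.0, L=0.1):
    return o + L * g + (c * g) / A

def speedup(g, **kw):
    return host_time(g, kw.get("c", 1.0)) / offload_time(g, **kw)

if __name__ == "__main__":
    for g in (10, 100, 1_000, 10_000, 100_000):
        print(f"granularity {g:>7}: speedup {speedup(g):5.2f}")
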
@Article{Diamantopoulos:2015:MMI,
author = "Dionysios Diamantopoulos and Sotirios Xydis and Kostas
Siozios and Dimitrios Soudris",
title = "Mitigating Memory-Induced Dark Silicon in
Many-Accelerator Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "136--139",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2410791",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Many-Accelerator (MA) systems have been introduced as
a promising architectural paradigm that can boost
performance and improve power of general-purpose
computing platforms. In this paper, we focus on the
problem of resource under-utilization, i.e., Dark
Silicon, in FPGA-based MA platforms. We show that, in
addition to the typically expected peak power budget,
on-chip memory resources form a severe
under-utilization factor in MA platforms, resulting in
up to 75 percent dark silicon. Recognizing that static
memory allocation, the de facto mechanism supported by
modern design techniques and synthesis tools, forms the
main source of memory-induced Dark Silicon, we
introduce a novel framework that extends conventional
high-level synthesis (HLS) with dynamic memory
management (DMM) features, enabling accelerators to
dynamically adapt their allocated memory to the runtime
memory requirements, thus maximizing the overall
accelerator count through effective sharing of the
FPGA's memory resources. We show that our technique
delivers significant gains in FPGA accelerator density,
i.e., 3.8x, and in application throughput, up to 3.1x
and 21.4x for shared and private memory accelerators,
respectively.",
acknowledgement = ack-nhfb,
affiliation = "Diamantopoulos, D (Reprint Author), Natl Tech Univ
Athens, Sch Elect \& Comp Engn, Athens, Greece.
Diamantopoulos, Dionysios; Xydis, Sotirios; Siozios,
Kostas; Soudris, Dimitrios, Natl Tech Univ Athens, Sch
Elect \& Comp Engn, Athens, Greece.",
author-email = "diamantd@microlab.ntua.gr sxydis@microlab.ntua.gr
ksiop@microlab.ntua.gr dsoudris@microlab.ntua.gr",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "E.C. [644906]",
funding-text = "This research is partially supported by the E.C.
funded program AEGLE under H2020 Grant Agreement No:
644906.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "de-facto mechanism; DMM feature; dynamic memory
management; dynamic memory management feature; Dynamic
scheduling; Field programmable gate arrays; field
programmable gate arrays; FPGA-based MA platform;
high-level synthesis; high-level synthesis tool; HLS
tool; MA system; Many-accelerator architectures;
many-accelerator architectures; Many-accelerator
architectures; Memory management; memory-induced dark
silicon source; modern design technique; Network
architecture; on-chip memory resource; peak power
budget; power aware computing; Resource management;
severe under-utilization factor; silicon; static memory
allocation; storage management; System-on-chip;
Throughput",
number-of-cited-references = "14",
ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847 Siozios,
Kostas/0000-0002-0285-2202",
research-areas = "Computer Science",
researcherid-numbers = "Soudris, Dimitrios/O-8843-2019 Siozios,
Kostas/F-9726-2011",
times-cited = "1",
unique-id = "Diamantopoulos:2015:MMI",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Poremba:2015:NUF,
author = "Matthew Poremba and Tao Zhang and Yuan Xie",
title = "{NVMain 2.0}: a User-Friendly Memory Simulator to
Model (Non-) Volatile Memory Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "140--143",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2402435",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In this letter, a flexible memory simulator --- NVMain
2.0, is introduced to help the community for modeling
not only commodity DRAMs but also emerging memory
technologies, such as die-stacked DRAM caches,
non-volatile memories (e.g., STT-RAM, PCRAM, and ReRAM)
including multi-level cells (MLC), and hybrid
non-volatile plus DRAM memory systems. Compared to
existing memory simulators, NVMain 2.0 features a
flexible user interface with compelling simulation
speed and the capability of providing sub-array-level
parallelism, fine-grained refresh, MLC and data encoder
modeling, and distributed energy profiling.",
acknowledgement = ack-nhfb,
affiliation = "Poremba, M (Reprint Author), Penn State Univ, Dept
Comp Sci \& Engn, University Pk, PA 16802 USA. Poremba,
Matthew; Zhang, Tao; Xie, Yuan, Penn State Univ, Dept
Comp Sci \& Engn, University Pk, PA 16802 USA.",
author-email = "poremba@cse.psu.edu zhangtao@cse.psu.edu
yuanxie@cse.psu.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [1218867, 1213052, 1409798]; Department
of Energy [DE-SC0005026]",
funding-text = "Poremba, Zhang, and Xie were supported in part by NSF
1218867, 1213052, 1409798. This material was based on
work supported by the Department of Energy under Award
Number DE-SC0005026. Matthew Poremba is the
corresponding author.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache storage; commodity DRAM; Computational modeling;
Computer architecture; die-stacked DRAM cache; DRAM
chips; DRAM memory systems; flexible memory simulator;
flexible user interface; Memory architecture; memory
architecture; Memory architecture, random access
memory, nonvolatile memory, phase change memory, SDRAM;
Memory management; memory technology; multilevel cells;
nonvolatile memory; Nonvolatile memory; nonvolatile
memory system; NVMain 2.0; PCRAM; phase change
memories; phase change memory; Phase change random
access memory; random access memory; ReRAM; SDRAM;
STT-RAM; user interfaces; user-friendly memory
simulator",
number-of-cited-references = "10",
research-areas = "Computer Science",
times-cited = "36",
unique-id = "Poremba:2015:NUF",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Vandierendonck:2015:EEB,
author = "Hans Vandierendonck and Ahmad Hassan and Dimitrios S.
Nikolopoulos",
title = "On the Energy-Efficiency of Byte-Addressable
Non-Volatile Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "144--147",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2355195",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Non-volatile memory (NVM) technology holds promise to
replace SRAM and DRAM at various levels of the memory
hierarchy. The interest in NVM is motivated by the
difficulty faced in scaling DRAM beyond 22 nm and, in
the long term, by a lower cost per bit. While offering higher
density and negligible static power (leakage and
refresh), NVM suffers increased latency and energy per
memory access. This paper develops energy and
performance models of memory systems and applies them
to understand the energy-efficiency of replacing or
complementing DRAM with NVM. Our analysis focusses on
the application of NVM in main memory. We demonstrate
that NVM such as STT-RAM and RRAM is energy-efficient
for memory sizes commonly employed in servers and
high-end workstations, but PCM is not. Furthermore, the
model is well suited to quickly evaluate the impact of
changes to the model parameters, which may be achieved
through optimization of the memory architecture, and to
determine the key parameters that impact system-level
energy and performance.",
acknowledgement = ack-nhfb,
affiliation = "Vandierendonck, H (Reprint Author), Queens Univ
Belfast, Belfast BT7 1NN, Antrim, North Ireland.
Vandierendonck, Hans; Nikolopoulos, Dimitrios S.,
Queens Univ Belfast, Belfast BT7 1NN, Antrim, North
Ireland. Hassan, Ahmad, SAP Belfast, Belfast, Antrim,
North Ireland.",
author-email = "h.vandierendonck@qub.ac.uk ahmad.hassan@sap.com
d.nikolopoulos@qub.ac.uk",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "People Programme (Marie Curie Actions) of
the European Union's Seventh Framework Programme
[327744]",
funding-text = "This work was supported by the People Programme (Marie
Curie Actions) of the European Union's Seventh
Framework Programme (FP7/2007-2013), grant agreement
no. 327744.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "byte-addressable nonvolatile memory technology;
Computational modeling; DRAM; DRAM chips; energy;
energy conservation; energy efficiency; Energy
efficiency; impact system-level energy; Main memory
systems; Main memory systems, non-volatile memory,
energy, modeling; Mathematical model; memory
architecture; memory hierarchy; Memory management;
memory systems; modeling; non-volatile memory;
Nonvolatile memory; NVM technology; PCM; Phase change
materials; Random access memory; RRAM; SRAM; SRAM
chips; static power; STT-RAM",
number-of-cited-references = "15",
oa = "Green Published",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Vandierendonck:2015:EEB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
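
A first-order version of the energy comparison in the abstract above is:
total energy = static power x time + accesses x energy per access, where
DRAM pays a capacity-proportional static term (refresh and leakage) and
NVM pays more per access but almost nothing when idle. The sketch below
is only illustrative; every number in it is a placeholder, not a value
from the paper.

# First-order sketch of the DRAM-versus-NVM energy comparison:
# total energy = static power x time + accesses x energy per access.
# All numbers are placeholders; DRAM's static term grows with capacity,
# NVM's per-access term dominates instead.

def memory_energy(capacity_gb, accesses, seconds,
                  static_w_per_gb, energy_per_access_nj):
    static_j = static_w_per_gb * capacity_gb * seconds
    dynamic_j = accesses * energy_per_access_nj * 1e-9
    return static_j + dynamic_j

if __name__ == "__main__":
    accesses, seconds = 2e9, 60.0
    for gb in (16, 64, 256, 1024):
        dram = memory_energy(gb, accesses, seconds, 0.05, 20)   # placeholder
        nvm = memory_energy(gb, accesses, seconds, 0.001, 60)   # placeholder
        print(f"{gb:5d} GB  DRAM {dram:9.1f} J   NVM {nvm:9.1f} J")
    # As capacity grows, DRAM's static term dominates and NVM wins even
    # though each NVM access costs more energy.
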
@Article{Yavits:2015:RAP,
author = "Leonid Yavits and Shahar Kvatinsky and Amir Morad and
Ran Ginosar",
title = "Resistive Associative Processor",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "148--151",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2374597",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Associative Processor (AP) combines data storage and
data processing, and functions simultaneously as a
massively parallel array SIMD processor and memory.
Traditionally, AP is based on CMOS technology, similar
to other classes of massively parallel SIMD processors.
The main component of AP is a Content Addressable
Memory (CAM) array. As CMOS feature scaling slows down,
CAM experiences scalability problems. In this work, we
propose and investigate an AP based on resistive
CAM-the Resistive AP (ReAP). We show that resistive
memory technology potentially allows scaling the AP
from a few millions to a few hundred millions of
processing units on a single silicon die. We compare
the performance and power consumption of a ReAP to a
CMOS AP and a conventional SIMD accelerator (GPU) and
show that ReAP, although exhibiting higher power
density, allows better scalability and higher
performance.",
acknowledgement = ack-nhfb,
affiliation = "Yavits, L (Reprint Author), Technion Israel Inst
Technol, Dept Elect Engn, IL-3200000 Haifa, Israel.
Yavits, Leonid; Kvatinsky, Shahar; Morad, Amir;
Ginosar, Ran, Technion Israel Inst Technol, Dept Elect
Engn, IL-3200000 Haifa, Israel.",
author-email = "yavits@txtechnion.ac.il skva@txtechnion.ac.il
amirm@txtechnion.ac.il ran@ee.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Intel Collaborative Research Institute for
Computational Intelligence; Hasso-Plattner-Institut",
funding-text = "The authors would like to thank Uri Weiser for
inspiring this research. This work was partially funded
by the Intel Collaborative Research Institute for
Computational Intelligence and by
Hasso-Plattner-Institut.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Associative processing; associative processor;
Associative Processor; associative processor; CAM
array; CMOS feature scaling; CMOS integrated circuits;
CMOS technology; complimentary metal oxide
semiconductor; Computer aided manufacturing; content
addressable memory array; content-addressable storage;
data processing; data storage; GPU; graphics processing
unit; in-memory computing; In-Memory Computing;
in-memory computing; massively parallel array SIMD
processor; memory function; memristor; Memristor;
memristor; Memristors; parallel processing; Random
access memory; ReAP; resistive associative processor;
resistive RAM; Resistive RAM; resistive RAM; SIMD; SIMD
accelerator",
number-of-cited-references = "17",
research-areas = "Computer Science",
times-cited = "22",
unique-id = "Yavits:2015:RAP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kang:2015:SRT,
author = "Suk Chan Kang and Chrysostomos Nicopoulos and Ada
Gavrilovska and Jongman Kim",
title = "Subtleties of Run-Time Virtual Address Stacks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "152--155",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2337299",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The run-time virtual address (VA) stack has some
unique properties, which have garnered the attention of
researchers. The stack one-dimensionally grows and
shrinks at its top, and contains data that is seemingly
local/private to one thread, or process. Most prior
related research has focused on these properties.
However, this article aims to demonstrate how
conventional wisdom pertaining to the run-time VA stack
fails to capture some critical subtleties and
complexities. We first explore two widely established
assumptions surrounding the VA stack area: (1) Data
accesses can be classified as falling either under
VA-stack-area accesses, or non-stack-area accesses,
with no aliasing; (2) The VA stack data is completely
private and invisible to other threads/processes.
Subsequently, we summarize a representative selection
of related work that pursued the micro-architectural
concept of using run-time VA stacks to extend the
general-purpose register file. We then demonstrate why
these assumptions are invalid, by using examples from
prior work to highlight the potential hazards regarding
data consistency, shared memory consistency, and cache
coherence. Finally, we suggest safeguards against these
hazards. Overall, we explore the function-critical
issues that future operating systems and compilers
should address to effectively reap all the benefits of
using run-time VA stacks.",
acknowledgement = ack-nhfb,
affiliation = "Kang, SC (Reprint Author), Georgia Inst Technol,
Atlanta, GA 30332 USA. Kang, Suk Chan; Gavrilovska,
Ada; Kim, Jongman, Georgia Inst Technol, Atlanta, GA
30332 USA. Nicopoulos, Chrysostomos, Univ Cyprus,
CY-1678 Nicosia, Cyprus.",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache coherence; cache storage; data consistency; data
decoupling; data integrity; data privacy;
function-critical issue; general-purpose register file;
Instruction sets; memory consistency;
microarchitectural concept; nonstack-area access;
register file; Run time; Run-time stack; run-time VA
stack data access; run-time virtual address stack;
shared memory; shared memory consistency; shared memory
systems; synonym page; VA-stack-area accesses;
Virtualization",
number-of-cited-references = "12",
ORCID-numbers = "Nicopoulos, Chrysostomos/0000-0001-6389-6068",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Kang:2015:SRT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Rodopoulos:2015:TPV,
author = "Dimitrios Rodopoulos and Francky Catthoor and
Dimitrios Soudris",
title = "Tackling Performance Variability Due to {RAS}
Mechanisms with {PID}-Controlled {DVFS}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "156--159",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2385713",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "As technology nodes approach deca-nanometer
dimensions, many phenomena threaten the binary
correctness of processor operation. Computer architects
typically enhance their designs with reliability,
availability and serviceability (RAS) schemes to
correct such errors, in many cases at the cost of extra
clock cycles, which, in turn, leads to processor
performance variability. The goal of the current paper
is to absorb this variability using Dynamic Voltage and
Frequency Scaling (DVFS). A closed-loop implementation
is proposed, which configures the clock frequency based
on observed metrics that encapsulate performance
variability due to RAS mechanisms. That way,
performance dependability and predictability are
achieved. We simulate the transient and steady-state
behavior of our approach, reporting responsiveness
within less than 1 ms. We also assess our idea using
the power model of a real processor and report a maximum
energy overhead of roughly 10 percent for dependable
performance in the presence of RAS temporal
overheads.",
acknowledgement = ack-nhfb,
affiliation = "Rodopoulos, D (Reprint Author), Natl Tech Univ Athens,
MicroLab, Sch Elect \& Comp Engn, Athens 15780, Greece.
Rodopoulos, Dimitrios; Soudris, Dimitrios, Natl Tech
Univ Athens, MicroLab, Sch Elect \& Comp Engn, Athens
15780, Greece. Catthoor, Francky, ESAT KU Leuven,
Leuven, Belgium. Catthoor, Francky, SSET IMEC, Leuven,
Belgium.",
author-email = "drodo@microlab.ntua.gr catthoor@imec.be
dsoudris@microlab.ntua.gr",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "HARPA EC project [FP7-612069]",
funding-text = "The authors thank Prof. Y. Sazeides and Prof. C.
Nicopoulos of UCY, Cyprus for the insightful
discussions. They also acknowledge the constructive
feedback of the reviewers. This work was partially
supported by the FP7-612069-HARPA EC project. Dimitrios
Rodopoulos is the corresponding author. Finally, the
authors acknowledge conversations with Dr. Antonis
Papanikolaou.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "availability; availability and serviceability;
Availability and Serviceability; availability and
serviceability; binary correctness; closed loop
systems; closed-loop implementation; computer
architects; computer architecture; deca-nanometer
dimensions; Dynamic voltage and frequency scaling;
dynamic voltage and frequency scaling; Dynamic voltage
and frequency scaling; Dynamic Voltage and Frequency
Scaling; Mathematical model; microcomputers;
Performance evaluation; performance variability;
performance vulnerability factor; Performance
Vulnerability Factor; PID-controlled DVFS; Process
control; processor operation; RAS mechanisms;
reliability; Reliability; reliability; Reliability;
serviceability; three-term control; Voltage control",
number-of-cited-references = "21",
ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847",
research-areas = "Computer Science",
researcherid-numbers = "Soudris, Dimitrios/O-8843-2019",
times-cited = "4",
unique-id = "Rodopoulos:2015:TPV",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
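
The closed loop described in the abstract above can be pictured as a
discrete PID controller that raises the clock frequency just enough to
compensate for the cycles consumed by RAS corrections, so that delivered
throughput tracks a setpoint. The sketch below is a toy illustration of
that idea; the gains, the setpoint, the one-IPC "plant", and the 15
percent RAS stall fraction are assumptions, not the controller or power
model from the paper.

# Toy discrete PID loop: when RAS overhead starts stealing 15 percent of
# the cycles halfway through the run, the controller raises the clock to
# roughly 1000 / 0.85 (about 1176 MHz) so observed throughput returns to
# the setpoint.  All constants are illustrative.

def pid_dvfs(setpoint_mips=1000.0, steps=120, kp=0.2, ki=0.05, kd=0.0):
    freq_mhz, integral, prev_err = 1000.0, 0.0, 0.0
    observed = freq_mhz
    for t in range(steps):
        ras_stall = 0.15 if t >= steps // 2 else 0.0    # RAS overhead appears mid-run
        observed = freq_mhz * (1.0 - ras_stall)         # toy plant: ~1 IPC per MHz
        err = setpoint_mips - observed
        integral += err
        freq_mhz += kp * err + ki * integral + kd * (err - prev_err)
        prev_err = err
    return freq_mhz, observed

if __name__ == "__main__":
    f, m = pid_dvfs()
    print(f"settled at {f:.0f} MHz, delivering about {m:.0f} MIPS")
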
@Article{Markovic:2015:TLS,
author = "Nikola Markovic and Daniel Nemirovsky and Osman Unsal
and Mateo Valero and Adrian Cristal",
title = "Thread Lock Section-Aware Scheduling on Asymmetric
Single-{ISA} Multi-Core",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "160--163",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2357805",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "As thread level parallelism in applications has
continued to expand, so has research in chip multi-core
processors. As more and more applications become
multi-threaded we expect to find a growing number of
threads executing on a machine. As a consequence, the
operating system will require increasingly larger
amounts of CPU time to schedule these threads
efficiently. Instead of perpetuating the trend of
performing more complex thread scheduling in the
operating system, we propose a scheduling mechanism
that can be efficiently implemented in hardware as
well. Our approach of identifying multi-threaded
application bottlenecks such as thread synchronization
sections complements the Fairness-aware Scheduler
method. It achieves an average speedup of 11.5 percent
(geometric mean) compared to the state-of-the-art
Fairness-aware Scheduler.",
acknowledgement = ack-nhfb,
affiliation = "Markovic, N (Reprint Author), Barcelona Supercomputing
Ctr, Barcelona, Spain. Markovic, Nikola; Nemirovsky,
Daniel; Unsal, Osman; Valero, Mateo, Barcelona
Supercomputing Ctr, Barcelona, Spain. Markovic, Nikola;
Nemirovsky, Daniel; Valero, Mateo, Univ Politecn
Cataluna, Barcelona, Spain. Cristal, Adrian, Univ
Politecn Cataluna, Barcelona Supercomputing Ctr,
E-08028 Barcelona, Spain. Cristal, Adrian, Artificial
Intelligence Res Inst Spanish Natl Res, Barcelona,
Spain.",
author-email = "nikola.markovic@bsc.es daniel.nemirovsky@bsc.es
osman.unsal@bsc.es mateo.valero@bsc.es
adrian.cristal@bsc.es",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Asymmetric chip multiprocessor (ACMP); asymmetric
single-ISA multicore processor; chip multicore
processors; Context modeling; fairness-aware scheduler
method; HW/SW thread scheduling; Instruction sets;
microprocessor chips; multi-threaded applications;
multi-threading; Multicore processing; multiprocessing
systems; multithreaded application; operating system;
Operating systems; operating systems (computers);
scheduling; Scheduling; Synchronization; thread lock
section-aware scheduling mechanism; thread
synchronization",
number-of-cited-references = "17",
ORCID-numbers = "UNSAL, OSMAN/0000-0002-0544-9697 Valero,
Mateo/0000-0003-2917-2482",
research-areas = "Computer Science",
researcherid-numbers = "UNSAL, OSMAN/B-9161-2016 Valero,
Mateo/L-5709-2014",
times-cited = "7",
unique-id = "Markovic:2015:TLS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Pekhimenko:2015:TAC,
author = "Gennady Pekhimenko and Evgeny Bolotin and Mike
O'Connor and Onur Mutlu and Todd C. Mowry and Stephen
W. Keckler",
title = "Toggle-Aware Compression for {GPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "164--168",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2430853",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Memory bandwidth compression can be an effective way
to achieve higher system performance and energy
efficiency in modern data-intensive applications by
exploiting redundancy in data. Prior works studied
various data compression techniques to improve both
capacity (e.g., of caches and main memory) and
bandwidth utilization (e.g., of the on-chip and
off-chip interconnects). These works addressed two
common shortcomings of compression: (i)
compression/decompression overhead in terms of latency,
energy, and area, and (ii) hardware complexity to
support variable data size. In this paper, we make the
new observation that there is another important problem
related to data compression in the context of
communication energy efficiency: transferring
compressed data leads to a substantial increase in the
number of bit toggles (communication channel switchings
from 0 to 1 or from 1 to 0). This, in turn, increases
the dynamic energy consumed by on-chip and off-chip
buses due to more frequent charging and discharging of
the wires. Our results, for example, show that the bit
toggle count increases by an average of 2.2x with some
compression algorithms across 54 mobile GPU
applications. We characterize and demonstrate this new
problem across a wide variety of 221 GPU applications
and six different compression algorithms. To mitigate
the problem, we propose two new toggle-aware
compression techniques: energy control and Metadata
Consolidation. These techniques greatly reduce the bit
toggle count impact of the six data compression
algorithms we examine, while keeping most of their
bandwidth reduction benefits.",
acknowledgement = ack-nhfb,
affiliation = "Pekhimenko, G (Reprint Author), Carnegie Mellon Univ,
Dept Comp Sci, Pittsburgh, PA 15206 USA. Pekhimenko,
Gennady; Mutlu, Onur; Mowry, Todd C., Carnegie Mellon
Univ, Dept Comp Sci, Pittsburgh, PA 15206 USA. Bolotin,
Evgeny; O'Connor, Mike; Keckler, Stephen W., NVIDA,
Santa Clara, CA USA. O'Connor, Mike; Keckler, Stephen
W., Univ Texas Austin, Austin, TX 78712 USA.",
author-email = "gpekhimento@gmail.com ebolotin@nvidia.com
moconnor@nvidia.com omutlu@gmail.com tcm@cs.cmu.edu
skeckler@nvidia.com",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Intel Science and Technology Center for
Cloud Computing; US National Science Foundation
[1212962, 1409723, 1423172]; US Department of Energy",
funding-text = "The authors acknowledge the support of Intel Science
and Technology Center for Cloud Computing; US National
Science Foundation grants 1212962, 1409723, and
1423172; and the US Department of Energy.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "bandwidth utilization; bit toggle count impact; bit
toggles; Communication channels; communication energy
efficiency; Compression algorithms;
compression/decompression overhead; Data compression;
data compression; data compression algorithms; data
compression techniques; Data compression,
interconnected systems, memory; data redundancy;
dynamic energy; energy control; graphics processing
units; Graphics processing units; hardware complexity;
interconnected systems; memory; memory bandwidth
compression; metadata consolidation; Mobile
communication; mobile GPU applications; modern
data-intensive applications; off-chip buses; on-chip
buses; power aware computing; System-on-chip;
toggle-aware compression; variable data size",
number-of-cited-references = "29",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Pekhimenko:2015:TAC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
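
The metric at the heart of the abstract above is the bit-toggle count:
the number of bit positions that flip between consecutive flits on a
bus, since each flip charges or discharges a wire. The helper below
computes that count; the 32-bit flit width and the sample data (a run of
similar words versus random-looking words standing in for compressed
output) are assumptions for illustration.

# Sketch of the bit-toggle metric: toggles are bit positions that change
# between consecutive flits.  Compressed (denser, higher-entropy) data
# typically toggles more than a run of similar plaintext words, which is
# the paper's central observation.  Sample data is illustrative.

import random

def toggle_count(flits, width=32):
    toggles = 0
    for prev, cur in zip(flits, flits[1:]):
        toggles += bin((prev ^ cur) & ((1 << width) - 1)).count("1")
    return toggles

if __name__ == "__main__":
    uncompressed = [0x00000010 * i for i in range(1, 9)]      # similar words
    random.seed(0)
    compressed = [random.getrandbits(32) for _ in range(8)]   # stand-in for compressed output
    print("uncompressed toggles:", toggle_count(uncompressed))
    print("compressed-like toggles:", toggle_count(compressed))
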
@Article{Anonymous:2015:TCb,
author = "Anonymous",
title = "Table of Contents",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "C1--C1",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2510172",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2015:ICAc,
author = "Anonymous",
title = "{{\booktitle{IEEE Computer Architecture Letters}}
Editorial Board}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "C2--C2",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2510173",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2015:ICAd,
author = "Anonymous",
title = "{{\booktitle{IEEE Computer Architecture Letters}}}
Information for Authors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "C3--C3",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2510174",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2015:ICSb,
author = "Anonymous",
title = "{IEEE Computer Society}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "C4--C4",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2510176",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Wu:2016:MCN,
author = "Wo-Tak Wu and Ahmed Louri",
title = "A Methodology for Cognitive {NoC} Design",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2447535",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The number of cores in a multicore chip design has
been increasing in the past two decades. The rate of
increase will continue for the foreseeable future. With
a large number of cores, the on-chip communication has
become a very important design consideration. The
increasing number of cores will push the communication
complexity level to a point where managing such highly
complex systems requires much more than what designers
can anticipate for. We propose a new design methodology
for implementing a cognitive network-on-chip that has
the ability to recognize changes in the environment and
to learn new ways to adapt to the changes. This
learning capability provides a way for the network to
manage itself. Individual network nodes work
autonomously to achieve global system goals, e.g., low
network latency, higher reliability, power efficiency,
adaptability, etc. We use fault-tolerant routing as a
case study. Simulation results show that the cognitive
design has the potential to outperform the conventional
design for large applications. With the great inherent
flexibility to adopt different algorithms, the
cognitive design can be applied to many applications.",
acknowledgement = ack-nhfb,
affiliation = "Wu, WT (Reprint Author), Univ Arizona, Dept Elect \&
Comp Engn, Tucson, AZ 85721 USA. Wu, Wo-Tak; Louri,
Ahmed, Univ Arizona, Dept Elect \& Comp Engn, Tucson,
AZ 85721 USA.",
author-email = "wotakwu@email.arizona.edu louri@ece.arizona.edu",
da = "2019-06-20",
doc-delivery-number = "DY1XQ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "adaptive; Algorithm design and analysis; cognitive
network-on-chip; cognitive NoC design; cognitive
process; communication complexity; communication
complexity level; Fault tolerance; fault tolerant
computing; Fault tolerant systems; fault-tolerant;
fault-tolerant routing; individual network nodes;
integrated circuit design; intelligent agent; learning
(artificial intelligence); learning capability; machine
learning; multicore; multicore chip design; Multicore
processing; multiprocessing systems; network routing;
network-on-chip; NoC; on-chip communication; Routing;
Software",
number-of-cited-references = "8",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Wu:2016:MCN",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2016:IICa,
author = "Anonymous",
title = "2015 Index {{\booktitle{IEEE Computer Architecture
Letters}}} Vol. 14",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "1--6",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2513858",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Indexes",
}
@Article{Rezaei:2016:DRS,
author = "Seyyed Hossein Seyyedaghaei Rezaei and Abbas Mazloumi
and Mehdi Modarressi and Pejman Lotfi-Kamran",
title = "Dynamic Resource Sharing for High-Performance {$3$-D}
Networks-on-Chip",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2448532",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "3D logic-on-logic technology is a promising approach
for extending the validity of Moore's law when
technology scaling stops. 3D technology can also lead
to a paradigm shift in on-chip communication design by
providing orders of magnitude higher bandwidth and
lower latency for inter-layer communication. To turn
the 3D technology bandwidth and latency benefits into
network latency reductions and performance improvement,
we need networks-on-chip (NoCs) that are specially
designed to take advantage of what 3D technology has to
offer. While in parallel workloads many packets
experience blocking in the network due to losing
arbitration for crossbars' input/output ports, we
observe that in a considerable fraction of these cases
in a 3D NoC, the corresponding input and output ports
of the crossbar in the above or below router are idle.
Given this observation, we propose FRESH, a router
microarchitecture with Fine-grained 3D REsource SHaring
capability that leverages the ultra-low latency
vertical links of a 3D chip to share crossbars and
links at a fine granularity between vertically stacked
routers. It enables packets that lose arbitration for
crossbars' input/output ports to use idle resources of
the above or below routers, and effectively eliminates
the unnecessary packet blocking time. We show that
our proposal lowers network latency by up to 21 percent
over the state-of-the-art 3D NoC.",
acknowledgement = ack-nhfb,
affiliation = "Rezaei, SHS (Reprint Author), Univ Tehran, Coll Engn,
Dept Elect \& Comp Engn, Tehran, Iran. Rezaei, Seyyed
Hossein Seyyedaghaei; Mazloumi, Abbas; Modarressi,
Mehdi, Univ Tehran, Coll Engn, Dept Elect \& Comp Engn,
Tehran, Iran. Lotfi-Kamran, Pejman, Inst Res
Fundamental Sci IPM, Sch Comp Sci, Tehran, Iran.",
author-email = "s.hseyyedaghaei@ut.ac.ir y.mazloomi@gmail.com
modarressi@ut.ac.ir plotfi@ipm.ir",
da = "2019-06-20",
doc-delivery-number = "DY1XQ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "3-D integration; 3D integration; 3D networks-on-chip;
3D NoC; Bandwidth; crossbars input-output ports;
fine-grained 3D resource sharing capability; FRESH;
network latency; network routing; network-on-chip;
Ports (Computers); Resource management; Resource
sharing; router microarchitecture; Routing; Switches;
Three-dimensional displays; Through-silicon vias",
keywords-plus = "3D; ROUTER",
number-of-cited-references = "12",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Rezaei:2016:DRS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Gorgues:2016:EPC,
author = "Miguel Gorgues and Jose Flich",
title = "End-Point Congestion Filter for Adaptive Routing with
Congestion-Insensitive Performance",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2429130",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Interconnection networks are a critical component in
most modern systems. Both off-chip networks,
in HPC systems, data centers, and cloud servers, and
on-chip networks, in chip multiprocessors (CMPs) and
multiprocessors system-on-chip (MPSoCs), play an
increasing role as their performance is vital for the
performance of the whole system. One of the key
components of any interconnect is the routing
algorithm, which steers packets through the network.
Adaptive routing algorithms have demonstrated their
superior performance by maximizing network resources
utilization. However, as systems increase in size (both
in off-chip and on-chip), new problems emerge. One of
them is congestion, where traffic jams inside the
network lead to low throughput and high packet latency,
significantly impacting overall system performance. We
propose a mechanism to eradicate this phenomenon and to
allow adaptive routing algorithms to achieve the
expected performance even in the presence of congestion
situations. End-Point Congestion Filter, EPC, detects
congestion formed at the end-points of the network, and
prevents the congestion from spreading through the
network. Basically, EPC disables adaptivity in
congested packets. Preliminary results for mid and high
congestion situations show EPC is able to totally
decouple congestion from routing.",
acknowledgement = ack-nhfb,
affiliation = "Gorgues, M (Reprint Author), Univ Politecn Valencia,
Dept Comp Architecture, E-46022 Valencia, Spain.
Gorgues, Miguel; Flich, Jose, Univ Politecn Valencia,
Dept Comp Architecture, E-46022 Valencia, Spain.",
author-email = "migoral@disca.upv.es jflich@disca.upv.es",
da = "2019-06-20",
doc-delivery-number = "DY1XQ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Adaptive filters; Adaptive routing algorithms;
adaptive routing algorithms; Adaptive routing
algorithms; adaptive routing algorithms; Adaptive
routing algorithms; Adaptive systems; chip
multiprocessors; cloud servers; CMP; Congestion;
congestion; Congestion; congestion;
congestion-insensitive performance; data centers;
digital filters; end-point congestion filter; EPC; HPC
systems; Information filters; interconnection networks;
interconnects; MPSoC; multiprocessor interconnection
networks; multiprocessors system-on-chip; network
resources utilization; network routing; on-chip
networks; packet latency; performance evaluation; Ports
(Computers); Routing; system-on-chip; Throughput;
traffic jams",
keywords-plus = "NETWORKS",
number-of-cited-references = "10",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Gorgues:2016:EPC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
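The EPC mechanism above reduces, at its core, to a routing-time check:
packets whose destination end-point looks congested lose their adaptivity.
The short Python sketch below illustrates only that selection policy; the
occupancy counters, the threshold value, and the packet/port fields are
assumptions made for illustration, not details taken from the paper.

# Illustrative sketch of an EPC-style end-point congestion filter.
# The per-destination counters, the threshold, and the field names
# (packet.dest, port.queue_len) are assumed for illustration.

CONGESTION_THRESHOLD = 8  # outstanding packets per destination (assumed value)

class EndPointCongestionFilter:
    def __init__(self):
        self.pending = {}  # destination node id -> outstanding packet count

    def on_inject(self, dest):
        self.pending[dest] = self.pending.get(dest, 0) + 1

    def on_deliver(self, dest):
        self.pending[dest] = max(0, self.pending.get(dest, 0) - 1)

    def route(self, packet, adaptive_ports, escape_port):
        # Disable adaptivity only for packets heading to a congested end-point,
        # so end-point congestion does not spread through the adaptive channels.
        if self.pending.get(packet.dest, 0) >= CONGESTION_THRESHOLD:
            return escape_port
        return min(adaptive_ports, key=lambda port: port.queue_len)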
@Article{Panda:2016:EPP,
author = "Biswabandan Panda and Shankar Balachandran",
title = "Expert Prefetch Prediction: an Expert Predicting the
Usefulness of Hardware Prefetchers",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "13--16",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2428703",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Hardware prefetching improves system performance by
hiding and tolerating the latencies of lower levels of
cache and off-chip DRAM. An accurate prefetcher
improves system performance whereas an inaccurate
prefetcher can cause cache pollution and consume
additional bandwidth. Prefetch address filtering
techniques improve prefetch accuracy by predicting the
usefulness of a prefetch address and based on the
outcome of the prediction, the prefetcher decides
whether or not to issue a prefetch request. Existing
techniques use only one signature to predict the
usefulness of a prefetcher but no single predictor
works well across all the applications. In this work,
we propose weighted-majority filter, an expert way of
predicting the usefulness of prefetch addresses. The
proposed filter is adaptive in nature and uses the
prediction of the best predictor(s) from a pool of
predictors. Our filter is orthogonal to the underlying
prefetching algorithm. We evaluate the effectiveness of
our technique on 22 SPEC-2000/2006 applications. On
average, when employed with three state-of-the-art
prefetchers (AMPM, SMS, and GHB-PC/DC), our filter
provides performance improvements of 8.1, 9.3, and 11
percent, respectively.",
acknowledgement = ack-nhfb,
affiliation = "Panda, B (Reprint Author), Indian Inst Technol, Dept
Comp Sci \& Engn, Madras, Tamil Nadu, India. Panda,
Biswabandan; Balachandran, Shankar, Indian Inst
Technol, Dept Comp Sci \& Engn, Madras, Tamil Nadu,
India.",
author-email = "biswa.uce@gmail.com",
da = "2019-06-20",
doc-delivery-number = "DY1XQ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accuracy; AMPM; cache; Cache; cache; Cache; cache;
cache storage; filtering theory; GHB-PC/DC; Hardware;
hardware prefetchers; Hardware prefetching; Hardware
Prefetching; Hardware prefetching; Hardware
Prefetching; Memory systems; memory systems; Memory
systems; memory systems; Pollution; Prediction
algorithms; prefetch addresses; Prefetching;
prefetching algorithm; Radiation detectors; Random
access memory; SMS; weighted-majority filter",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Panda:2016:EPP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
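The filter described above combines a pool of usefulness predictors with a
weighted-majority vote. The sketch below shows only the textbook
weighted-majority scheme; the predictor interface, the demotion factor, and
the decision threshold are assumptions for illustration and need not match
the paper's hardware implementation.

# Textbook weighted-majority combiner over a pool of prefetch-usefulness
# predictors. The predictor objects, beta, and the 0.5 threshold are
# illustrative assumptions, not the paper's exact parameters.

class WeightedMajorityFilter:
    def __init__(self, predictors, beta=0.5):
        self.predictors = predictors            # each .predict(addr) -> True if "useful"
        self.weights = [1.0] * len(predictors)
        self.beta = beta                        # demotion factor for wrong experts

    def should_prefetch(self, addr):
        votes_for = sum(w for w, p in zip(self.weights, self.predictors)
                        if p.predict(addr))
        return votes_for >= 0.5 * sum(self.weights)

    def update(self, addr, was_useful):
        # Demote every predictor that disagreed with the observed outcome.
        for i, p in enumerate(self.predictors):
            if p.predict(addr) != was_useful:
                self.weights[i] *= self.beta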
@Article{Eker:2016:EEC,
author = "Abdulaziz Eker and O{\u{g}}uz Ergin",
title = "Exploiting Existing Copies in Register File for Soft
Error Correction",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2435705",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Soft errors are an increasingly important problem in
contemporary digital systems. Being the major data
holding component in contemporary microprocessors, the
register file has been an important part of the
processor, for which researchers have offered many
different schemes to protect against soft errors. In
this paper we build on previously proposed schemes and
start
with the observation that many register values already
have a replica inside the storage space. We use this
already available redundancy inside the register file
in combination with a previously proposed value
replication scheme for soft error detection and
correction. We show that, by employing schemes that
make use of the already available copies of the values
inside the register file, it is possible to detect and
correct 39.0 percent of the errors with an additional
power consumption of 18.9 percent.",
acknowledgement = ack-nhfb,
affiliation = "Eker, A (Reprint Author), TOBB Univ Econ \& Technol,
Dept Comp Engn, Ankara, Turkey. Eker, Abdulaziz; Ergin,
O{\u{g}}uz, TOBB Univ Econ \& Technol, Dept Comp Engn,
Ankara, Turkey.",
author-email = "aeker@etu.edu.tr oergin@etu.edu.tr",
da = "2019-06-20",
doc-delivery-number = "DY1XQ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "TUBITAK [112E004]",
funding-text = "This work was supported in part by TUBITAK under Grant
112E004. The work is in the framework of COST Action
1103.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; contemporary digital systems;
contemporary microprocessors; data holding component;
Error correction; Error correction codes;
microcomputers; microprocessor architecture;
Microprocessors; Parity check codes; redundancy;
register file; Registers; Reliability; soft error; soft
error correction; soft error detection; storage space",
number-of-cited-references = "16",
ORCID-numbers = "Ergin, O{\u{g}}uz/0000-0003-2701-3787",
research-areas = "Computer Science",
researcherid-numbers = "Ergin, O{\u{g}}uz/E-5717-2010",
times-cited = "1",
unique-id = "Eker:2016:EEC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Maycock:2016:HES,
author = "Matthew Maycock and Simha Sethumadhavan",
title = "Hardware Enforced Statistical Privacy",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "21--24",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2403359",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The Internet of Things will result in users generating
vast quantities of data, some of it sensitive. Results
from the statistical analysis of sensitive data across
wide ranges of demographics will become ever more
useful to data analysts and their clients. The
competing needs of the two groups (data generators with
their desire for privacy and analysts with their desire
for inferred statistics) will be met through the use of
statistical privacy techniques. The question, then, is:
how can we ensure that the statistical methods are
applied in a trustable manner? In this paper we discuss
some of the complications and consequences of ensuring
both trust and privacy through the immutability of
hardware, providing desiderata for a hardware privacy
platform.",
acknowledgement = ack-nhfb,
affiliation = "Maycock, M (Reprint Author), Columbia Univ, Dept Comp
Sci, CASTL, New York, NY 10027 USA. Maycock, Matthew;
Sethumadhavan, Simha, Columbia Univ, Dept Comp Sci,
CASTL, New York, NY 10027 USA.",
author-email = "mhm2159@columbia.edu simha@columbia.edu",
da = "2019-06-20",
doc-delivery-number = "DY1XQ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Alfred P. Sloan Foundation;
[FA8750-10-2-0253]",
funding-text = "This work was supported through grant FA8750-10-2-0253
and the Alfred P. Sloan Foundation. Opinions, findings,
conclusions and recommendations expressed in this
material are those of the authors and may not reflect
the views of the funding entities. Simha Sethumadhavan
is the corresponding author.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "data analysis; Data privacy; data privacy; Data
privacy; Engines; Hardware; hardware enforced
statistical privacy; hardware immutability; hardware
support; Hardware Support; hardware support; Hardware
Support; hardware support; Internet of things; Internet
of Things; Internet of things; Internet of Things;
Internet of things; Internet of Things; Noise; Privacy;
privacy; Privacy; privacy; Privacy; privacy; Privacy;
privacy protection unit; Privacy Protection Unit;
privacy protection unit; Privacy Protection Unit;
privacy protection unit; Security; sensitive data;
Software; statistical analysis",
number-of-cited-references = "7",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Maycock:2016:HES",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Li:2016:ICL,
author = "Dongdong Li and Tor M. Aamodt",
title = "Inter-Core Locality Aware Memory Scheduling",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "25--28",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2435709",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Graphics Processing Units (GPUs) run thousands of
parallel threads and achieve high Memory Level
Parallelism (MLP). To support high Memory Level
Parallelism, a structure called a Miss-Status Holding
Register (MSHR) handles multiple in-flight miss
requests. When multiple cores send requests to the same
cache line, the requests are merged into one last level
cache MSHR entry and only one memory request is sent to
the Dynamic Random-Access Memory (DRAM). We call this
inter-core locality. The main reason for inter-core
locality is that multiple cores access shared read-only
data within the same cache line. By prioritizing memory
requests that have high inter-core locality, more
threads resume execution. In this paper, we analyze the
reason for inter-core locality and show that requests
with inter-core locality are more critical to
performance. We propose a GPU DRAM scheduler that
exploits information about inter-core locality detected
at the last level cache MSHRs. For high inter-core
locality benchmarks this leads to an average 28 percent
reduction in memory request latency and 11 percent
improvement in performance.",
acknowledgement = ack-nhfb,
affiliation = "Li, DD (Reprint Author), Univ British Columbia, Dept
Elect \& Comp Engn, Vancouver, BC V6T 1Z4, Canada. Li,
Dongdong; Aamodt, Tor M., Univ British Columbia, Dept
Elect \& Comp Engn, Vancouver, BC V6T 1Z4, Canada.",
author-email = "dongdong@ece.ubc.ca aamodt@ece.ubc.ca",
da = "2019-06-20",
doc-delivery-number = "DY1XQ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Benchmark testing; cache line; cache
storage; Computational fluid dynamics; DRAM chips;
dynamic random-access memory; GPGPU; GPU DRAM
scheduler; graphics processing units; Graphics
processing units; graphics processing units; Graphics
processing units; graphics processing units;
Instruction sets; intercore locality aware memory
scheduling; last level cache MSHR entry; locality;
Locality; locality; Locality; locality; memory access
scheduling; Memory Access Scheduling; memory access
scheduling; Memory Access Scheduling; memory level
parallelism; memory request; memory request latency;
miss-status holding register; MLP; multiple cores;
multiple in-flight miss requests; multiprocessing
systems; parallel processing; parallel threads;
Processor scheduling; processor scheduling; Processor
scheduling; processor scheduling; Random access memory;
read-only data",
number-of-cited-references = "16",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Li:2016:ICL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
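The scheduler above boils down to a priority rule: serve first the DRAM
request that the most cores are waiting on, as seen from the merge count of
its last-level-cache MSHR entry. A minimal sketch of such a rule follows;
the request fields and the row-hit/age tie-breaking are assumptions of mine,
not the paper's exact policy.

# Minimal sketch of an inter-core-locality-aware pick for a DRAM scheduler.
# Request objects are assumed to carry .mshr_merge_count, .row, .arrival_time.

def pick_next_request(queue, open_row):
    # Priority: (1) cores merged into the LLC MSHR entry, (2) row-buffer hit,
    # (3) oldest request. Python compares the key tuples element by element.
    return max(queue,
               key=lambda r: (r.mshr_merge_count,
                              r.row == open_row,
                              -r.arrival_time))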
@Article{Pu:2016:NIP,
author = "Libei Pu and Kshitij Doshi and Ellis Giles and Peter
Varman",
title = "Non-Intrusive Persistence with a Backend {NVM}
Controller",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "29--32",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2443105",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "By providing instruction-grained access to vast
amounts of persistent data with ordinary loads and
stores, byte-addressable storage class memory (SCM) has
the potential to revolutionize system architecture. We
describe a non-intrusive SCM controller for achieving
light-weight failure atomicity through back-end
operations. Our solution avoids costly software
intervention by decoupling isolation and
concurrency-driven atomicity from failure atomicity and
durability, and does not require changes to the
front-end cache hierarchy. Two implementation
alternatives are described: one using a hardware
structure, and the other extending the memory controller
with a firmware-managed volatile space.",
acknowledgement = ack-nhfb,
affiliation = "Pu, LB (Reprint Author), Rice Univ, ECE, Houston, TX
77005 USA. Pu, Libei; Giles, Ellis; Varman, Peter, Rice
Univ, ECE, Houston, TX 77005 USA. Doshi, Kshitij,
Intel, SSG, Phoenix, AZ 85226 USA.",
author-email = "pulibei@gmail.com kshitij.a.doshi@intel.com
erg@rice.edu pjv@rice.edu",
da = "2019-06-20",
doc-delivery-number = "DY1XQ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "US National Science Foundation (NSF) [CCF
1439075]; Intel Software and Services Group",
funding-text = "This paper is supported by the US National Science
Foundation (NSF) Grant CCF 1439075 and by Intel
Software and Services Group.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "atomicity; backend NVM controller; byte-addressable
storage class memory; cache storage; concurrency-driven
atomicity; consistency; durability; failure analysis;
firmware; firmware managed volatile space; front-end
cache hierarchy; Hardware; hardware structure;
instruction-grained access; isolation decoupling;
light-weight failure atomicity; memory architecture;
Memory management; Non-volatile memory; nonintrusive
persistence; nonintrusive SCM controller; Nonvolatile
memory; persistent memory; Process control; Random
access memory; random-access storage; Retirement;
Software; software intervention; system architecture",
keywords-plus = "SYSTEM",
number-of-cited-references = "14",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Pu:2016:NIP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Garcia:2016:CMP,
author = "P. Garcia and T. Gomes and J. Monteiro and A. Tavares
and M. Ekpanyapong",
title = "On-Chip Message Passing Sub-System for Embedded
Inter-Domain Communication",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "33--36",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2419260",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "This letter describes the architecture of an
inter-domain message passing hardware sub-system
targeting the embedded virtualization field. Embedded
virtualization is characterized by application-specific
solutions, where functionality is partitioned into a
small, fixed number of Virtual Machines, typically
under real-time constraints, which must communicate for
synchronization and status signaling. In light of the
growing use of custom hardware, especially supported by
(re)configurable platforms, we show how our hardware
sub-system can provide virtualization-safe data
transfers, without the need for Hypervisor (software)
mediation, through the use of translate-once and
virtual-interface hardware mechanisms, allowing direct
memory-to-memory copies between different partitions'
input/output buffers, in both direct-transfer and
publish-subscribe modes. Our experiments show that our
architecture is especially suited for the real-time
domain, outperforming an equivalent software solution in
latency, throughput, and jitter, and outperforming
state-of-the-art hardware solutions for small message
sizes ($ < 512 $ B).",
acknowledgement = ack-nhfb,
affiliation = "Garcia, P (Reprint Author), Univ Minho, Dept Ctr
Algoritmi, P-4800 Braga, Portugal. Garcia, P.; Gomes,
T.; Monteiro, J.; Tavares, A., Univ Minho, Dept Ctr
Algoritmi, P-4800 Braga, Portugal. Ekpanyapong, M.,
Asian Inst Technol, Dept Microelect \& Embedded Syst,
Khlong Luang, Thailand.",
author-email = "pgarcia@dei.uminho.pt tgomes@dei.uminho.pt
jmonteiro@dei.uminho.pt atavares@dei.uminho.pt
mongkol@ait.ac.th",
da = "2019-06-20",
doc-delivery-number = "DY1XQ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "FCT [SFRH/BD/77813/2011]",
funding-text = "This work was supported in part by a grant from FCT,
reference SFRH/BD/77813/2011. P. Garcia is the
corresponding author.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "application program interfaces; application-specific
solutions; configurable platforms; direct
memory-to-memory copies; direct-transfer modes;
embedded interdomain communication; embedded systems;
embedded virtualization field; Hardware; interdomain
message passing hardware subsystem; message passing;
Message passing; on-chip message passing subsystem;
partition input/output buffers; publish subscribe
modes; Publish-subscribe; real time domain; real-time
constraints; Software; status signaling;
synchronisation; synchronization; Throughput;
translate-once mechanism; Virtual machine monitors;
virtual machines; virtual-interface hardware
mechanisms; virtualisation; Virtualization;
virtualization-safe data transfers",
number-of-cited-references = "15",
ORCID-numbers = "Monteiro, Joao L/0000-0002-3287-3995 Monteiro,
Joao/0000-0002-3287-3995 Tavares,
Adriano/0000-0001-8316-6927 Gomes,
Tiago/0000-0002-8496-8179 Garcia,
Paulo/0000-0002-1041-5205",
research-areas = "Computer Science",
researcherid-numbers = "Monteiro, Joao L/H-7751-2012 Monteiro,
Joao/Q-6857-2019 Tavares, Adriano/M-5257-2013",
times-cited = "1",
unique-id = "Garcia:2016:CMP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Li:2016:PHP,
author = "Minghua Li and Guancheng Chen and Qijun Wang and
Yonghua Lin and Peter Hofstee and Per Stenstrom and
Dian Zhou",
title = "{PATer}: a Hardware Prefetching Automatic Tuner on
{IBM} {POWER8} Processor",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "37--40",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2442972",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Hardware prefetching on IBM's latest POWER8 processor
is able to improve performance of many applications
significantly, but it can also cause performance loss
for others. The IBM POWER8 processor provides one of
the most sophisticated hardware prefetching designs
which supports 225 different configurations. Obviously,
it is a big challenge to find the optimal or
near-optimal hardware prefetching configuration for a
specific application. We present a dynamic prefetching
tuning scheme in this paper, named prefetch automatic
tuner (PATer). PATer uses a prediction model based on
machine learning to dynamically tune the prefetch
configuration based on the values of hardware
performance monitoring counters (PMCs). By developing a
two-phase prefetching selection algorithm and a
prediction accuracy optimization algorithm in this
tool, we identify a set of key hardware prefetch
configurations that matter most to performance, as well
as a set of PMCs that maximize the machine learning
prediction accuracy. We show that
PATer is able to accelerate the execution of diverse
workloads up to $ 1.4 \times $.",
acknowledgement = ack-nhfb,
affiliation = "Li, MH (Reprint Author), Unvers Texas Dallas, Dept
Elect Engn, Richardson, TX 75080 USA. Li, MH (Reprint
Author), IBM Res China, Beijing, Peoples R China. Li,
Minghua; Zhou, Dian, Unvers Texas Dallas, Dept Elect
Engn, Richardson, TX 75080 USA. Li, Minghua; Chen,
Guancheng; Wang, Qijun; Lin, Yonghua, IBM Res China,
Beijing, Peoples R China. Hofstee, Peter, IBM Corp,
ARL, Austin, TX USA. Stenstrom, Per, Chalmers, Dept Sci
\& Comp Engn, Gothenburg, Sweden.",
author-email = "mxl095420@utdallas.edu chengc@cn.ibm.com
wqijun@cn.ibm.com linyh@cn.ibm.com hofstee@us.ibm.com
pers@chalmers.se zhoud.utdallas@gmail.com",
da = "2019-06-20",
doc-delivery-number = "DY1XQ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "IBM Research global summer intern program",
funding-text = "The authors would like to thank the anonymous
reviewers for their valuable suggestions and comments
to improve the paper. The authors also want to thank
Ling Shao, Xiaowei Shen, Qi Guo, Kun Wang, Tao Liu, Yan
Li from IBM Research, and Sally A. Mckee from Chalmers
for their insightful suggestions. Minghua Li was
supported by IBM Research global summer intern
program.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accuracy; Benchmark testing; Classifier design and
evaluation; Classifier design and evaluation, machine
learning, optimization, performance measures; Hardware;
hardware PMC; hardware prefetching automatic tuner; IBM
POWER8 processor; learning (artificial intelligence);
machine learning; multiprocessing systems;
Optimization; optimization; Optimization; PATer;
performance evaluation; performance measures;
performance monitoring counters; prediction accuracy
optimization algorithm; prefetch automatic tuner;
Prefetching; Runtime; storage management; Training;
two-phase prefetching selection algorithm",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Li:2016:PHP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Alian:2016:PGS,
author = "Mohammad Alian and Daehoon Kim and Nam Sung Kim",
title = "{pd-gem5}: Simulation Infrastructure for
Parallel\slash Distributed Computer Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "41--44",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2438295",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Improving the performance and power efficiency of a
single processor has been fraught with various
challenges stemming from the end of classical technology
scaling. Thus, the importance of efficiently
running applications on a parallel/distributed computer
system has continued to increase. In developing and
optimizing such a parallel/distributed computer system,
it is critical to study the impact of the complex
interplay amongst processor, node, and network
architectures on performance and power efficiency in
detail. This necessitates a flexible, detailed and
open-source full-system simulation infrastructure.
However, our community lacks such an infrastructure. In
this paper, we present pd-gem5, a gem5-based
infrastructure that can model and simulate a parallel/
distributed computer system using multiple simulation
hosts. Our experiment shows that pd-gem5 running on six
simulation hosts speeds up the simulation of a 24-node
computer system up to $ 3.2 \times $ compared with
running on a single simulation host.",
acknowledgement = ack-nhfb,
affiliation = "Kim, NS (Reprint Author), Univ Illinois, ECE Dept,
Urbana, IL 61801 USA. Alian, Mohammad; Kim, Daehoon;
Kim, Nam Sung, Univ Illinois, ECE Dept, Urbana, IL
61801 USA.",
author-email = "nskim@illinois.edu",
da = "2019-06-20",
doc-delivery-number = "DY1XQ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CNS-1217102, CNS-1512981]; DARPA
[HR0011-12-2-0019]",
funding-text = "This work was supported in part by NSF (CNS-1217102
and CNS-1512981) and DARPA (HR0011-12-2-0019) grants.
Nam Sung Kim has a financial interest in Samsung
Electronics and AMD. Daehoon Kim is the corresponding
author.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Computational modeling; digital
simulation; gem5; Handheld computers; Load modeling;
multiple simulation hosts; network; open-source
full-system simulation infrastructure; parallel
processing; parallel/distributed computer systems;
parallel/distributed simulation; pd-gem5; power aware
computing; public domain software; single processor
performance; single processor power efficiency; single
simulation host; Switches; Synchronization; technology
scaling",
number-of-cited-references = "6",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Alian:2016:PGS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
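pd-gem5 couples several gem5 processes into one simulated parallel/distributed
system. A common way to keep such co-simulations causally consistent is
quantum-based barrier synchronization; the generic sketch below shows only
that idea, with an invented quantum length and host interface, and does not
describe pd-gem5's actual implementation.

# Generic quantum-based synchronization sketch for multi-host co-simulation.
# The quantum length and the host interface (time, end_time, simulate_until,
# exchange_packets) are invented for illustration; this is not pd-gem5 code.

import threading

QUANTUM_NS = 1000  # simulated nanoseconds each host may advance before syncing (assumed)

class QuantumSync:
    def __init__(self, num_hosts):
        self.barrier = threading.Barrier(num_hosts)

    def run_host(self, host):
        while host.time < host.end_time:
            host.simulate_until(host.time + QUANTUM_NS)  # advance local simulation
            host.exchange_packets()                      # deliver cross-host network traffic
            self.barrier.wait()                          # keep all hosts within one quantum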
@Article{Kim:2016:RFE,
author = "Yoongu Kim and Weikun Yang and Onur Mutlu",
title = "{Ramulator}: a Fast and Extensible {DRAM} Simulator",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "45--49",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2414456",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Recently, both industry and academia have proposed
many different roadmaps for the future of DRAM.
Consequently, there is a growing need for an extensible
DRAM simulator, which can be easily modified to judge
the merits of today's DRAM standards as well as those
of tomorrow. In this paper, we present Ramulator, a
fast and cycle-accurate DRAM simulator that is built
from the ground up for extensibility. Unlike existing
simulators, Ramulator is based on a generalized
template for modeling a DRAM system, which is only
later infused with the specific details of a DRAM
standard. Thanks to such a decoupled and modular
design, Ramulator is able to provide out-of-the-box
support for a wide array of DRAM standards: DDR3/4,
LPDDR3/4, GDDR5, WIO1/2, HBM, as well as some academic
proposals (SALP, AL-DRAM, TL-DRAM, RowClone, and SARP).
Importantly, Ramulator does not sacrifice simulation
speed to gain extensibility: according to our
evaluations, Ramulator is $ 2.5 \times $ faster than
the next fastest simulator. Ramulator is released under
the permissive BSD license.",
acknowledgement = ack-nhfb,
affiliation = "Kim, Y (Reprint Author), Carnegie Mellon Univ, Dept
Elect \& Comp Engn, Pittsburgh, PA 15213 USA. Kim,
Yoongu; Mutlu, Onur, Carnegie Mellon Univ, Dept Elect
\& Comp Engn, Pittsburgh, PA 15213 USA. Yang, Weikun,
Carnegie Mellon Univ, Pittsburgh, PA 15213 USA. Yang,
Weikun, Peking Univ, Dept Comp Sci, Beijing, Peoples R
China.",
author-email = "yoongu.kim@gmail.com wkyjyy@gmail.com
omutlu@gmail.com",
da = "2019-06-20",
doc-delivery-number = "DY1XQ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF; SRC",
funding-text = "We thank the SAFARI group members who have contributed
to the development of Ramulator, including Kevin Chang,
Saugata Ghose, Donghyuk Lee, Tianshi Li, and Vivek
Seshadri. We also thank the anonymous reviewers for
feedback. This work was supported by NSF, SRC, and
gifts from our industrial partners, including Google,
Intel, Microsoft, Nvidia, Samsung, Seagate and VMware.
Ramulator can be freely downloaded from
https://github.com/CMUSAFARI/ramulator",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "circuit simulation; digital simulation; DRAM; DRAM
chips; DRAM simulator; DRAM standard; emerging
technologies; experimental methods; Hardware design
languages; Main memory; memory scaling; memory systems;
Nonvolatile memory; performance evaluation; performance
evaluation, experimental methods, emerging
technologies, memory systems, memory scaling;
Proposals; Ramulator; Random access memory; Runtime;
simulation; software tool; standards; Standards;
standards; Timing",
keywords-plus = "LATENCY DRAM; RETHINKING",
number-of-cited-references = "38",
research-areas = "Computer Science",
times-cited = "29",
unique-id = "Kim:2016:RFE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
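The decoupling described above (a generic DRAM engine that is later infused
with the details of one standard) can be illustrated with a toy model. The
sketch below is not Ramulator's actual C++ interface (the real code is at
https://github.com/CMUSAFARI/ramulator); the spec fields and timing numbers
are invented purely to show the design idea.

# Toy illustration of a standard-agnostic DRAM model parameterized by a spec
# object. All names and timing values here are invented for illustration.

class DramSpec:
    def __init__(self, name, tRCD, tCL, tRP):
        self.name, self.tRCD, self.tCL, self.tRP = name, tRCD, tCL, tRP

DDR3_1600 = DramSpec("DDR3-1600", tRCD=11, tCL=11, tRP=11)  # cycles, illustrative
LPDDR3 = DramSpec("LPDDR3", tRCD=15, tCL=12, tRP=15)        # cycles, illustrative

class DramModel:
    """Generic engine: knows about rows and commands, not about any one standard."""

    def __init__(self, spec):
        self.spec = spec

    def read_latency(self, row_hit):
        # Row hit: CAS only. Row miss: activate (tRCD) then CAS (precharge ignored here).
        return self.spec.tCL if row_hit else self.spec.tRCD + self.spec.tCL

print(DramModel(DDR3_1600).read_latency(row_hit=False))  # 22 cycles in this toy model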
@Article{Olson:2016:SIT,
author = "Lena E. Olson and Simha Sethumadhavan and Mark D.
Hill",
title = "Security Implications of Third-Party Accelerators",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "50--53",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2445337",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Third-party accelerators offer system designers high
performance and low energy without the market delay of
in-house development. However, complex third-party
accelerators may include vulnerabilities due to design
flaws or malicious intent that are hard to expose
during verification. Rather than react to each new
vulnerability, it is better to proactively build
defenses for classes of attacks. To inspire future work
on defenses, this paper develops a taxonomy of
accelerator vulnerabilities. We consider the cross
product of threat types (confidentiality, integrity,
and availability) with risk categories (configuration,
computation, termination, accelerator memory accesses,
system memory accesses, microarchitecture/coherence,
exceptions/interrupts, and power), as well as whether
processes can be vulnerable only if they use the
offending accelerator (accelerator-scope threat) or
even when running in the same system (system-scope
threat). Our taxonomy draws attention to a grave
problem that needs immediate attention from computer
architects.",
acknowledgement = ack-nhfb,
affiliation = "Olson, LE (Reprint Author), Univ Wisconsin, Dept Comp
Sci, 1210 W Dayton St, Madison, WI 53706 USA. Olson,
Lena E.; Hill, Mark D., Univ Wisconsin, Dept Comp Sci,
1210 W Dayton St, Madison, WI 53706 USA. Sethumadhavan,
Simha, Columbia Univ, Dept Comp Sci, New York, NY 10026
USA.",
author-email = "lena@cs.wisc.edu simha@cs.columbia.edu
markhill@cs.wisc.edu",
da = "2019-06-20",
doc-delivery-number = "DY1XQ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [1054844]; Alfred P. Sloan Foundation;
[FA8750-10-2-0253]; [FA8650-11-C-7190]",
funding-text = "This work is supported through grants
FA8750-10-2-0253, FA8650-11-C-7190, NSF 1054844 and the
Alfred P. Sloan Foundation. Opinions, findings,
conclusions and recommendations expressed in this
material are those of the authors and may not reflect
the views of the funding entities. The authors thank
Eric Sedlar, Dan Gibson, Multifacet, and UW-Madison
Computer Architecture Affiliates for valuable
feedback.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "accelerator architectures; accelerator
vulnerabilities; accelerator-scope threat; Coherence;
computer architecture; Computer bugs; Computer
security; Cryptography; Hardware; malicious intent;
market delay; Registers; risk categories; risk
management; system-scope threat; Taxonomy; third-party
accelerators",
number-of-cited-references = "20",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "5",
unique-id = "Olson:2016:SIT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Jacob:2016:CVC,
author = "Bruce Jacob",
title = "The Case for {VLIW--CMP} as a Building Block for
Exascale",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "54--57",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2424699",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Current ultra-high-performance computers execute
instructions at the rate of roughly 10 PFLOPS (10
quadrillion floating-point operations per second) and
dissipate power in the range of 10 MW. The next
generation will need to execute instructions at EFLOPS
rates, 100x as fast as today's, but without dissipating
any more power. To achieve this challenging goal, the
emphasis is on power-efficient execution, and for this
we propose VLIW-CMP as a general architectural approach
that improves significantly on the power efficiency of
existing solutions. Compared to manycore architectures
using simple, single-issue cores, VLIW-CMP reduces both
power and die area, improves single-thread performance,
and maintains aggregate FLOPS per die. To improve
further on the power advantages of VLIW, we describe a
mechanism that reduces power dissipation of both data
forwarding and register-file activity.",
acknowledgement = ack-nhfb,
affiliation = "Jacob, B (Reprint Author), Univ Maryland, Dept Elect
\& Comp Engn, College Pk, MD 20742 USA. Jacob, Bruce,
Univ Maryland, Dept Elect \& Comp Engn, College Pk, MD
20742 USA.",
author-email = "blj@ece.umd.edu",
da = "2019-06-20",
doc-delivery-number = "DY1XQ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; Computer architectures;
Computer architectures, high-performance computing,
energy efficiency, multicore; data forwarding activity;
EFLOPS rates; energy efficiency; high-performance
computing; manycore architectures; multicore;
multiprocessing systems; parallel architectures;
performance evaluation; PFLOPS; Pipelines; Ports
(Computers); power aware computing; power dissipation;
power-efficient execution; quadrillion floating-point
operations-per-second; Radio frequency; register-file
activity; Registers; single-thread performance
improvement; Software; ultra-high-performance
computers; VLIW; VLIW-CMP",
keywords-plus = "REGISTER LIFETIME; ARCHITECTURE",
number-of-cited-references = "18",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Jacob:2016:CVC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kleanthous:2016:TML,
author = "Marios Kleanthous and Yiannakis Sazeides and Emre Ozer
and Chrysostomos Nicopoulos and Panagiota Nikolaou and
Zacharias Hadjilambrou",
title = "Toward Multi-Layer Holistic Evaluation of System
Designs",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "58--61",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2445877",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The common practice for quantifying the benefit(s) of
design-time architectural choices of server processors
is often limited to the chip- or server-level. This
quantification process invariably entails the use of
salient metrics, such as performance, power, and
reliability, which capture-in a tangible manner-a
designs overall ramifications. This paper argues for
the necessity of a more holistic evaluation approach,
which considers metrics across multiple integration
levels (chip, server and datacenter). In order to
facilitate said comprehensive evaluation, we utilize an
aggregate metric, e.g. the Total Cost of Ownership
(TCO), to harness the complexly of comparing multiple
metrics at multiple levels. We motivate our proposition
for holistic evaluation with a case study that compares
a 2D processor to a 3D processor at various design
integration levels. We show that while a 2D processor
is clearly the best choice at the processor level, the
conclusion is reversed at the data-center level, where
the 3D processor becomes a better choice. This result
emanates mainly from the performance benefits of
processor-DRAM 3D integration, and the ability to
amortize (at the datacenter-level) the higher 3D
per-server cost and lower reliability by requiring
fewer 3D servers to match the same performance.",
acknowledgement = ack-nhfb,
affiliation = "Kleanthous, M (Reprint Author), Univ Cyprus, Dept Comp
Sci, Nicosia, Cyprus. Kleanthous, Marios; Sazeides,
Yiannakis; Nikolaou, Panagiota; Hadjilambrou,
Zacharias, Univ Cyprus, Dept Comp Sci, Nicosia, Cyprus.
Nicopoulos, Chrysostomos, Univ Cyprus, Dept Elect \&
Comp Engn, Nicosia, Cyprus. Ozer, Emre, ARM Ltd, Res,
Cambridge CB19NJ, England.",
author-email = "marios@kleanthous.info yanos@cs.ucy.ac.cy
emre.ozer@arm.com nicopoulos@ucy.ac.cy
nikolaou@cs.ucy.ac.cy zhadji01@cs.ucy.ac.cy",
da = "2019-06-20",
doc-delivery-number = "DY1XQ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "European Commission [612069 HARPA, 247779
EuroCloud]",
funding-text = "This work was supported by the European Commission FP7
projects ``Harnessing Performance Variability'' (No:
612069 HARPA) and ``Energy-conscious 3D Server-on-Chip
for Green Cloud Services'' (No: 247779 EuroCloud).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "2D processor; 3D processor; Chip; chip; Chip; chip;
Computational modeling; computer centres; data-center
level; Datacenter; datacenter; Datacenter; datacenter;
design integration levels; Design-Space Exploration;
design-space exploration; Design-Space Exploration;
design-space exploration; design-time architectural
choices; DRAM chips; Evaluation Metrics; evaluation
metrics; Evaluation Metrics; Holistic evaluation;
Holistic Evaluation; Holistic evaluation; Holistic
Evaluation; Holistic evaluation; integrated circuit
reliability; Measurement; microprocessor chips;
multilayer holistic evaluation; multiple integration
levels; performance evaluation; processor-DRAM 3D
integration; Program processors; ramifications;
Reliability; reliability; Reliability; Server; server;
Server; server processors; Servers; system designs;
System-on-chip; Three-dimensional displays",
keywords-plus = "PERFORMANCE",
number-of-cited-references = "23",
ORCID-numbers = "Nicopoulos, Chrysostomos/0000-0001-6389-6068",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Kleanthous:2016:TML",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
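The reversal reported above comes from folding per-server performance, cost,
and reliability into one aggregate number at the datacenter level. The
arithmetic below is a deliberately simplified, hypothetical roll-up of that
kind; every number is invented to show the mechanism, and none is taken from
the paper's case study.

# Hypothetical TCO-style roll-up showing how a design that loses at chip level
# can win at datacenter level. All numbers below are invented for illustration.

def datacenter_cost(perf_per_server, cost_per_server, fail_rate, target_perf,
                    repair_cost=500.0):
    servers = -(-target_perf // perf_per_server)  # ceiling division
    return servers, servers * (cost_per_server + fail_rate * repair_cost)

TARGET = 10_000  # required aggregate performance (arbitrary units)

# "2D": cheaper and more reliable per server, but slower.
servers_2d, cost_2d = datacenter_cost(100, 2000, 0.02, TARGET)
# "3D": faster per server (processor-DRAM integration), pricier, less reliable.
servers_3d, cost_3d = datacenter_cost(160, 2600, 0.05, TARGET)

print(servers_2d, cost_2d)  # 100 servers, 201000.0
print(servers_3d, cost_3d)  # 63 servers, 165375.0: fewer servers make 3D cheaper overall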
@Article{Daya:2016:THP,
author = "Bhavya K. Daya and Li-Shiuan Peh and Anantha P.
Chandrakasan",
title = "Towards High-Performance Bufferless {NoCs} with
{SCEPTER}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "62--65",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2428699",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In the many-core era, the network on-chip (NoC) is
playing a larger role in meeting performance, area and
power goals, as router buffers contribute greatly to
NoC area and power usage. Proposals have advocated
bufferless NoCs; however, a performance wall has been
reached such that high throughput performance has not
been extracted. We present SCEPTER, a high-performance
bufferless mesh NoC that sets up single-cycle virtual
express paths dynamically across the chip, allowing
deflected packets to go through non-minimal paths with
no latency penalty. For a 64 node network, we
demonstrate an average 62 percent reduction in latency
and an average $ 1.3 \times $ higher throughput over a
baseline bufferless NoC for synthetic traffic patterns;
with comparable performance to a single-cycle multihop
buffered mesh network with six flit buffers per input
port in each router.",
acknowledgement = ack-nhfb,
affiliation = "Daya, BK (Reprint Author), MIT, Dept EECS, 77
Massachusetts Ave, Cambridge, MA 02139 USA. Daya,
Bhavya K.; Peh, Li-Shiuan; Chandrakasan, Anantha P.,
MIT, Dept EECS, 77 Massachusetts Ave, Cambridge, MA
02139 USA.",
author-email = "bdaya@mit.edu peh@csail.mit.edu anantha@mtl.mit.edu",
da = "2019-06-20",
doc-delivery-number = "DY1XQ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "64 node network; bufferless router; bypassing;
Computer architecture; deflection routing;
high-performance bufferless mesh NoC; latency
reduction; multiprocessor interconnection;
Multiprocessor interconnection; multiprocessor
interconnection; multiprocessor interconnection
networks; Multiprocessor interconnection, on-chip mesh
networks, bufferless router, deflection routing,
bypassing; network routing; network-on-chip; nonminimal
paths; on-chip mesh networks; performance evaluation;
Pipelines; Ports (Computers); power aware computing;
power usage; Resource management; router buffers;
Routing; SCEPTER; single-cycle express path traversal
for efficient routing; single-cycle virtual express
paths; Switches; synthetic traffic patterns;
Throughput",
number-of-cited-references = "10",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Daya:2016:THP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2016:IICd,
author = "Anonymous",
title = "Introducing {IEEE Collabratec}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "66--66",
month = jan # "\slash " # jun,
year = "2016",
DOI = "https://doi.org/10.1109/LCA.2016.2578800",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 08:36:31 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "IEEE Collabratec is a new, integrated online community
where IEEE members, researchers, authors, and
technology professionals with similar fields of
interest can network and collaborate, as well as create
and manage content. Featuring a suite of powerful
online networking and collaboration tools, IEEE
Collabratec allows you to connect according to
geographic location, technical interests, or career
pursuits. You can also create and share a professional
identity that showcases key accomplishments and
participate in groups focused around mutual interests,
actively learning from and contributing to
knowledgeable communities. All in one place! Learn
about IEEE Collabratec at ieeecollabratec.org.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2016:ENM,
author = "Anonymous",
title = "Experience the Newest and Most Advanced Thinking in
Big Data Analytics",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "67--67",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2581058",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Advertisement, IEEE.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2016:ICS,
author = "Anonymous",
title = "{{\booktitle{IEEE Cyber Security}}}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "68--68",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2581078",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Advertisement, IEEE.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2016:TCa,
author = "Anonymous",
title = "Table of Contents",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "C1--C1",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2578758",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Presents the table of contents for this issue of the
publication.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2016:Cb,
author = "Anonymous",
title = "Cover",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "C2--C2",
month = jan # "\slash " # jun,
year = "2016",
DOI = "https://doi.org/10.1109/LCA.2016.2578759",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 08:36:31 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Provides a listing of board members, committee
members, editors, and society officers.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2016:Cd,
author = "Anonymous",
title = "Cover",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "C3--C3",
month = jan # "\slash " # jun,
year = "2016",
DOI = "https://doi.org/10.1109/LCA.2016.2578760",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 08:36:31 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "These instructions give guidelines for preparing
papers for this publication. Presents information for
authors publishing in this journal.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2016:BC,
author = "Anonymous",
title = "[{Back} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "1",
pages = "C4--C4",
month = jan # "\slash " # jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2578761",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Presents the table of contents for this issue of the
publication.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Liang:2016:CGR,
author = "Shuang Liang and Shouyi Yin and Leibo Liu and Yike Guo
and Shaojun Wei",
title = "A Coarse-Grained Reconfigurable Architecture for
Compute-Intensive {MapReduce} Acceleration",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "69--72",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2458318",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Large-scale workloads often show parallelism of
different levels. which offers acceleration potential
for clusters and parallel processors. Although
processors such as GPGPUs and FPGAs show good
performance of speedup, there is still vacancy for a
low power, high efficiency and dynamically
reconfigurable one, and coarse-grained reconfigurable
architecture (CGRA) seems to be one possible choice. In
this paper, we introduce how we use our CGRA fabric
Chameleon to realize a dynamically reconfigurable
acceleration to MapReduce-based (MR-based)
applications. A FPGA-shell-CGRA-core (FSCC)
architecture is designed for the acceleration
PCI-Express board, and a programming model with
compilation flow for CGRA is presented. With the
supports above, a small evaluation cluster with Hadoop
framework is set up, and experiments on
compute-intensive applications show that the
programming process is significantly simplified, with
an 30-60 x speedup offered under low power.",
acknowledgement = ack-nhfb,
affiliation = "Yin, SY (Reprint Author), Tsinghua Univ, Inst
Microelect, Beijing 100084, Peoples R China. Liang,
Shuang; Yin, Shouyi; Liu, Leibo; Wei, Shaojun, Tsinghua
Univ, Inst Microelect, Beijing 100084, Peoples R China.
Guo, Yike, Imperial Coll London, Dept Comp, London,
England.",
author-email = "s-liang11@mails.tsinghua.edu.cn yinsy@tsinghua.edu.cn
liulb@mail.tsinghua.edu.cn fiascoo@gmail.com
wsj@tsinghua.edu.cn",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Nature Science foundation of China
[61274131]; International S\&T Cooperation Project of
China [2012DFA11170]; Tsinghua Indigenous Research
Project [20111080997]; China National High Technologies
Research Program [2012-AA012701]",
funding-text = "This work was supported by the National Nature Science
foundation of China (No. 61274131), the International
S\&T Cooperation Project of China (No. 2012DFA11170),
the Tsinghua Indigenous Research Project (No.
20111080997) and the China National High Technologies
Research Program (No. 2012-AA012701). S. Yin is the
corresponding author.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "accelerator; Accelerators; Computer architecture;
Field programmable gate arrays; Hardware; MapReduce;
Programming; Reconfigurable architectures;
Reconfigurable computing; Servers",
number-of-cited-references = "15",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Liang:2016:CGR",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Lai:2016:QMD,
author = "Bo-Cheng Charles Lai and Luis Garrido Platero and
Hsien-Kai Kuo",
title = "A Quantitative Method to Data Reuse Patterns of {SIMT}
Applications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "73--76",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2491279",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Understanding data reuse patterns of a computing
system is crucial to effective design optimization. The
emerging Single Instruction Multiple Threads (SIMT)
processor adopts a programming model that is
fundamentally disparate from conventional scalar
processors. There is a lack of analytical approaches to
quantify the data reuse of SIMT applications. This
paper presents a quantitative method to study the data
reuse inherent to SIMT applications. A metric, Data
Reuse Degree, is defined to measure the amount of
reused data between memory references, and associate
each data reuse degree to a temporal distance
representing the virtual time of the execution process.
The experiments are performed on an abstracted SIMT
processor that considers the programming model and
runtime specifics. The experiments illustrate diverse
data reuse patterns of SIMT applications and explore
the impacts of architectural limitations.",
acknowledgement = ack-nhfb,
affiliation = "Lai, BCC (Reprint Author), Natl Chiao Tung Univ, Dept
Elect Engn, Hsinchu 300, Taiwan. Lai, Bo-Cheng Charles,
Natl Chiao Tung Univ, Dept Elect Engn, Hsinchu 300,
Taiwan. Platero, Luis Garrido, Barcelona Super Comp
Ctr, Barcelona, Spain. Kuo, Hsien-Kai, MediaTek Inc,
Hsinchu, Taiwan.",
author-email = "bclai@mail.nctu.edu.tw luis.garrido.platero@gmail.com
hsienkai.kuo@gmail.com",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "MOST [104-2221-E-009-079]",
funding-text = "This project was supported by MOST grant
104-2221-E-009-079.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architectural limitations; cache memory; Cache memory;
computing system; data analysis; data reuse degree;
data reuse patterns; design optimization; execution
process; Graphics processing units; Instruction sets;
Measurement; Memory management; multi-threading;
Parallel architectures; Parallel architectures, cache
memory, parallel processing; parallel processing;
Parallel processing; programming model; scalar
processors; SIMT applications; SIMT processors;
single-instruction multiple-threads processors; virtual
time",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Lai:2016:QMD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
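The Data Reuse Degree metric sketched in the abstract above pairs the amount of
data shared between memory references with the temporal distance separating
them. The following Python fragment is only a rough, hypothetical illustration
of that idea (not the paper's exact definition): it tallies address overlap
between SIMT memory references as a function of their distance in virtual time.

    from collections import Counter

    def reuse_degree_histogram(trace):
        # trace: one set of addresses per SIMT memory reference, ordered by
        # virtual (execution) time.  Returns a map from temporal distance d to
        # the total number of addresses shared by references that are d apart.
        hist = Counter()
        for i, ref in enumerate(trace):
            for j in range(i + 1, len(trace)):
                hist[j - i] += len(ref & trace[j])
        return dict(hist)

    # Example: the first and third references touch the same two cache lines.
    print(reuse_degree_histogram([{0, 64}, {128}, {0, 64}]))   # {1: 0, 2: 2}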
@Article{Cakmakci:2016:CPG,
author = "Yaman {\c{C}}akmak{\c{c}}i and Will Toms and Javier
Navaridas and Mikel Lujan",
title = "Cyclic Power-Gating as an Alternative to Voltage and
Frequency Scaling",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "77--80",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2478784",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Dynamic Voltage and Frequency Scaling is the most
commonly used power management technique in modern
processors. However, the ability of an individual chip
to operate under reduced supply voltage can no longer
be predetermined at the design stage and may even
change over time. This paper presents Cyclic
Power-Gating (CPG), a novel power management strategy
where the power consumption of a core can be finely
controlled without scaling the supply voltage. CPG
builds on state-retentive power-gating which allows the
power supply to a core to be switched off and on again
at high speed (tens of clock cycles) with minimal
disruption to running programs. The power-gating is
cyclic: by altering the ratio of time spent powered on
and off in each power-gating period, the effective
operating frequency and power consumption of a core can
be controlled. The overheads in delay and power
consumption of CPG for an out-of-order core in a 14 nm
technology are accurately modelled and compared to the
performance and power consumption of Voltage/Frequency
pairs in the same technology. The proposed power gating
method reduces average power consumption by 4 percent
over voltage and frequency scaling with only a 2
percent degradation in performance.",
acknowledgement = ack-nhfb,
affiliation = "{\c{C}}akmak{\c{c}}i, Y (Reprint Author), Univ
Manchester, Sch Comp Sci, Manchester M13 9PL, Lancs,
England. {\c{C}}akmak{\c{c}}i, Yaman; Toms, Will;
Navaridas, Javier; Lujan, Mikel, Univ Manchester, Sch
Comp Sci, Manchester M13 9PL, Lancs, England.",
author-email = "cakmakcy@cs.man.ac.uk tomsw@cs.man.ac.uk
javier.navaridas@manchester.ac.uk
mikel.lujan@manchester.ac.uk",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "EPSRC [DOME EP/J016330/1, PAMELA
EP/K008730/1]; Royal Society University Research
Fellowship; Engineering and Physical Sciences Research
Council [EP/K008730/1, EP/J016330/1]",
funding-text = "This work was supported by EPSRC grants DOME
EP/J016330/1 and PAMELA EP/K008730/1. Mikel Lujan is
funded by a Royal Society University Research
Fellowship. The authors thank Timothy Jones for his
comments on the draft version of this paper.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Capacitance; Computer architecture;
CPG; cyclic power-gating; Energy efficiency; frequency
scaling; leakage reduction; power aware computing;
power consumption; Power demand; Power efficient
design; power management; power management strategy;
state-retentive power-gating; Voltage measurement;
voltage scaling",
number-of-cited-references = "12",
oa = "Bronze",
ORCID-numbers = "Navaridas Palma, Javier/0000-0001-7272-6597",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Cakmakci:2016:CPG",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
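The abstract above notes that the on/off duty ratio of each power-gating period
sets a core's effective operating frequency and power. Below is a minimal
sketch of that duty-cycle arithmetic; the linear model and the example numbers
are assumed purely for illustration.

    def cpg_effective(f_on_ghz, p_on_w, p_retained_w, duty_on):
        # duty_on: fraction of each power-gating period the core is powered on.
        f_eff = duty_on * f_on_ghz                                  # effective frequency
        p_avg = duty_on * p_on_w + (1.0 - duty_on) * p_retained_w   # average power
        return f_eff, p_avg

    # A hypothetical 2 GHz core drawing 1.5 W when on and 0.05 W in
    # state-retentive sleep, powered on for 75 percent of each period:
    print(cpg_effective(2.0, 1.5, 0.05, 0.75))                      # (1.5, 1.1375)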
@Article{Tomusk:2016:DDG,
author = "Erik Tomusk and Christophe Dubach and Michael
O'Boyle",
title = "Diversity: a Design Goal for Heterogeneous
Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "81--84",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2499739",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "A growing number of processors have CPU cores that
implement the same instruction set architecture (ISA)
using different microarchitectures. The underlying
motivation for single-ISA heterogeneity is that a
diverse set of cores can enable runtime flexibility.
Modern processors are subject to strict power budgets,
and heterogeneity provides the runtime scheduler with
more latitude to decide the level of performance a
program should have based on the amount of power that
can be spent. We argue that selecting a diverse set of
heterogeneous cores to enable flexible operation at
runtime is a non-trivial problem due to diversity in
program behavior. We further show that common
evaluation methods lead to false conclusions about
diversity. Finally, we suggest the KS statistical test
as an evaluation metric. The KS test is the first step
toward a heterogeneous design methodology that
optimizes for runtime flexibility.",
acknowledgement = ack-nhfb,
affiliation = "Tomusk, E (Reprint Author), Univ Edinburgh, Edinburgh,
Midlothian, Scotland. Tomusk, Erik; Dubach, Christophe;
O'Boyle, Michael, Univ Edinburgh, Edinburgh,
Midlothian, Scotland.",
author-email = "e.tomusk@ed.ac.uk christophe.dubach@ed.ac.uk
mob@inf.ed.ac.uk",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Computer architecture; core
selection; CPU cores; design goal; Diversity;
flexibility; heterogeneity; heterogeneous cores;
heterogeneous design methodology; heterogeneous
processors; instruction set architecture; instruction
sets; integrated circuit design; ISA;
Kolmogorov-Smirnov test; KS statistical test;
Measurement; metrics; Microarchitecture;
microarchitectures; microprocessor chips; power aware
computing; Program processors; Runtime; runtime
flexibility; runtime scheduler; statistical testing",
number-of-cited-references = "10",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Tomusk:2016:DDG",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Hashemi:2016:EEB,
author = "Milad Hashemi and Debbie Marr and Doug Carmean and
Yale N. Patt",
title = "Efficient Execution of Bursty Applications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "85--88",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2456013",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The performance of user-facing applications is
critical to client platforms. Many of these
applications are event-driven and exhibit ``bursty''
behavior: the application is generally idle but
generates bursts of activity in response to human
interaction. We study one example of a bursty
application, the web browser, and produce two important
insights: (1) Activity bursts contain false
parallelism, bringing many cores out of a deep sleep to
inefficiently render a single webpage, and (2) these
bursts are highly compute driven, and thus scale nearly
linearly with frequency. We show average performance
gains/energy reductions of 14\%/17\% respectively on
real hardware by statically moving threads from
multiple cores to a single core. We then propose
dynamic hardware driven thread migration and scheduling
enhancements that detect these bursts, leading to
further benefits.",
acknowledgement = ack-nhfb,
affiliation = "Hashemi, M (Reprint Author), Univ Texas Austin, Elect
\& Comp Engn, Austin, TX 78701 USA. Hashemi, Milad;
Patt, Yale N., Univ Texas Austin, Elect \& Comp Engn,
Austin, TX 78701 USA. Marr, Debbie, Intel Corp, Intel
Labs, Portland, OR USA. Carmean, Doug, Microsoft,
Microsoft Res, Seattle, WA USA.",
author-email = "miladh@hps.utexas.edu debbie.marr@intel.com
dcarmean@microsoft.com patt@hps.utexas.edu",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Intel Corporation; Cockrell Foundation; HPS
Research Group",
funding-text = "The authors thank Intel Corporation and the Cockrell
Foundation for their continued generous financial
support of the HPS Research Group.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Browsers; bursty applications; dynamic hardware;
Energy; energy reductions; Hardware; human computer
interaction; human interaction; Instruction sets;
Internet; Loading; multi-threading; Multicore
processing; multiple cores; multiprocessing systems;
online front-ends; Operating systems; performance;
performance evaluation; performance gains; power aware
computing; thread migration; thread scheduling;
Web-browsers; Webpage; webpages; webpages, thread
scheduling",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Hashemi:2016:EEB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kannan:2016:EAP,
author = "Sudarsun Kannan and Moinudin Qureshi and Ada
Gavrilovska and Karsten Schwan",
title = "Energy Aware Persistence: Reducing the Energy
Overheads of Persistent Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "89--92",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2472410",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Next generation byte addressable nonvolatile memory
(NVM) technologies like PCM are attractive for end-user
devices as they offer memory scalability as well as
fast persistent storage. In such environments, NVM's
limitations of slow writes and high write energy are
magnified for applications that need atomic,
consistent, isolated and durable (ACID) updates. This
is because, for satisfying correctness (ACI),
application state must be frequently flushed from all
intermediate buffers, including processor cache, and to
support durability (D) guarantees, that state must be
logged. This increases NVM accesses and, more
importantly, results in additional CPU instructions.
This paper proposes Energy Aware Persistence (EAP). To
develop EAP, we first show that the energy-related
overheads for maintaining durability are significant.
We then propose energy-efficient durability principles
that mitigate those costs, examples being flexible
logging that switches between performance and
energy-efficient modes and a memory management
technique that trades capacity for energy. Finally, we
propose a relaxed durability (ACI-RD) mechanism, used
under critically low energy conditions, that does not
affect correctness. Initial results for several
realistic applications and benchmarks show up to a 2x
reduction in CPU and NVM energy usage relative to
traditional ACID-based persistence.",
acknowledgement = ack-nhfb,
affiliation = "Kannan, S (Reprint Author), Georgia Inst Technol,
Atlanta, GA 30332 USA. Kannan, Sudarsun; Qureshi,
Moinuddin; Gavrilovska, Ada; Schwan, Karsten, Georgia
Inst Technol, Atlanta, GA 30332 USA.",
author-email = "sudarsun@gatech.edu moin@ece.gatech.edu
ada@cc.gatech.edu schwan@cc.gatech.edu",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ACI-RD mechanism; ACID; ACID updates; ACID-based
persistence; atomic-consistent-isolated-durable
updates; Benchmark testing; cache storage; CPU energy
usage; CPU instructions; EAP; end-user devices; energy
aware persistence; Energy management; energy overhead
reduction; energy overheads; energy-efficient
durability principles; energy-efficient modes;
heap-based persistence; logging; memory management;
microprocessor chips; next generation byte addressable
nonvolatile memory; next generation byte addressable
NVM; Nonvolatile memory; NVM; NVM access; NVM energy
usage; Optimization; performance evaluation; persistent
memory; power aware computing; processor cache; Random
access memory; random-access storage; Resource
management; storage management",
keywords-plus = "PHASE-CHANGE MEMORY",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Kannan:2016:EAP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Valero:2016:ELD,
author = "Alejandro Valero and Negar Miralaei and Salvador Petit
and Julio Sahuquillo and Timothy M. Jones",
title = "Enhancing the {L1} Data Cache Design to Mitigate
{HCI}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "93--96",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2460736",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Over the lifetime of a microprocessor, the Hot Carrier
Injection (HCI) phenomenon degrades the threshold
voltage, which causes slower transistor switching and
eventually results in timing violations and faulty
operation. This effect appears when the memory cell
contents flip from logic `0' to `1' and vice versa. In
caches, the majority of cell flips are concentrated
into only a few of the total memory cells that make up
each data word. In addition, other researchers have
noted that zero is the most commonly-stored data value
in a cache, and have taken advantage of this behavior
to propose data compression and power reduction
techniques. Contrary to these works, we use this
information to extend the lifetime of the caches by
introducing two microarchitectural techniques that
spread and reduce the number of flips across the
first-level (L1) data cache cells. Experimental results
show that, compared to the conventional approach, the
proposed mechanisms reduce the highest cell flip peak
up to 65.8 percent, whereas the threshold voltage
degradation savings range from 32.0 to 79.9 percent
depending on the application.",
acknowledgement = ack-nhfb,
affiliation = "Valero, A (Reprint Author), Univ Politecn Valencia,
Dept Comp Engn, Valencia, Spain. Valero, Alejandro;
Petit, Salvador; Sahuquillo, Julio, Univ Politecn
Valencia, Dept Comp Engn, Valencia, Spain. Miralaei,
Negar; Jones, Timothy M., Univ Cambridge, Comp Lab,
Cambridge, England.",
author-email = "alvabre@gap.upv.es negar.miralaei@cl.cam.ac.uk
spetit@disca.upv.es jsahuqui@disca.upv.es
timothy.jones@cl.cam.ac.uk",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Spanish Ministerio de Economia y
Competitividad (MINECO); FEDER funds
[TIN2012-38341-C04-01]; Intel Early Career Faculty
Honor Program Award; HiPEAC Collaboration Grant-FP7
HiPEAC Network of Excellence [287759]; Engineering and
Physical Sciences Research Council (EPSRC)
[EP/K026399/1, EP/J016284/1]; Engineering and Physical
Sciences Research Council [EP/J016284/1,
EP/K026399/1]",
funding-text = "This work has been supported by the Spanish Ministerio
de Economia y Competitividad (MINECO), by FEDER funds
through Grant TIN2012-38341-C04-01, by the Intel Early
Career Faculty Honor Program Award, by a HiPEAC
Collaboration Grant funded by the FP7 HiPEAC Network of
Excellence under grant agreement 287759, and by the
Engineering and Physical Sciences Research Council
(EPSRC) through Grants EP/K026399/1 and EP/J016284/1.
Additional data related to this publication are
available in the data repository at
https://www.repository.cam.ac.uk/handle/1810/249006.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Cache memories; Cache memory; cache storage; cell flip
peaks; cell flips; commonly-stored data value; data
compression; Degradation; faulty operation; first-level
data cache cells; HCI mitigation; Hot carrier effects;
Hot Carrier Injection; hot carrier injection; Hot
Carrier Injection; hot carriers; Human computer
interaction; L1 data cache design; memory architecture;
memory cells; microarchitectural techniques;
microprocessor chips; microprocessor lifetime;
Microprocessors; power aware computing; power
reduction; Program processors; threshold voltage
degradation; transistor switching; Voltage
measurement",
number-of-cited-references = "10",
oa = "Green Accepted, Green Published",
ORCID-numbers = "Valero, Alejandro/0000-0002-0824-5833",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Valero:2016:ELD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Sen:2016:GFM,
author = "Rathijit Sen and David A. Wood",
title = "{GPGPU} Footprint Models to Estimate per-Core Power",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "97--100",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2456909",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We explore the problem of how to easily estimate the
per-core power distribution of GPGPUs from the total
power of all cores. We show that the dynamic energy
consumption of a core for a given kernel, represented
by its work footprint, is approximately proportional to
the total time taken by all work units executing on
that core, and the static power, represented by its
core footprint, is proportional to the time that the
core has assigned work. Footprints can be easily
tracked using two hardware counters per GPU core. We
also show how per-core power estimates can be used to
compute power-performance pareto frontiers that
identify opportunities for saving power and energy in
cases of non-uniform work distribution by exploiting
per-core DVFS support for GPGPUs.",
acknowledgement = ack-nhfb,
affiliation = "Sen, R (Reprint Author), Univ Wisconsin, Dept Comp
Sci, 1210 W Dayton St, Madison, WI 53706 USA. Sen,
Rathijit; Wood, David A., Univ Wisconsin, Dept Comp
Sci, 1210 W Dayton St, Madison, WI 53706 USA.",
author-email = "rathijit@cs.wisc.edu david@cs.wisc.edu",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "US National Science Foundation (NSF)
[CCF-1218323, CNS-1302260]",
funding-text = "The authors thank Srilatha Manne, Indrani Paul, and
Wei Huang for discussions about per-core DVFS support
in GPUs and Mark Hill, Jason Power, anonymous
reviewers, and the Associate Editor for helpful review
comments. This work was supported in part with US
National Science Foundation (NSF) grants CCF-1218323
and CNS-1302260. The views expressed herein are not
necessarily those of the NSF. Wood has significant
financial interests in AMD and Google.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Analytical models; Computational modeling; DVFS;
dynamic energy consumption; energy consumption;
footprint; GPGPU; GPGPU footprint models; GPGPU
per-core power distribution; Graphics processing units;
graphics processing units; Mathematical model; Pareto
analysis; pareto frontier; Pareto optimization;
per-core DVFS support; per-core power estimation;
power; power aware computing; Power distribution;
power-performance Pareto frontiers; Predictive models;
static power",
keywords-plus = "PERFORMANCE",
number-of-cited-references = "12",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Sen:2016:GFM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
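Per the abstract above, a core's dynamic energy tracks its work footprint (the
summed execution time of its work units) and its static power tracks its core
footprint (the time it has work assigned), each obtainable from one hardware
counter per core. The sketch below shows how such counters could apportion
measured totals across cores; the function name and the prior split into
dynamic and static totals are assumptions for illustration only.

    def per_core_power(total_dynamic_w, total_static_w, work_time_s, busy_time_s):
        # work_time_s[i]: work footprint of core i (summed work-unit execution time)
        # busy_time_s[i]: core footprint of core i (time the core had assigned work)
        w_sum = sum(work_time_s) or 1.0
        b_sum = sum(busy_time_s) or 1.0
        return [total_dynamic_w * w / w_sum + total_static_w * b / b_sum
                for w, b in zip(work_time_s, busy_time_s)]

    # Two cores with a non-uniform work distribution:
    print(per_core_power(10.0, 4.0, [3.0, 1.0], [2.0, 2.0]))   # [9.5, 4.5]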
@Article{Jung:2016:LPS,
author = "Daejin Jung and Sheng Li and Jung Ho Ahn",
title = "Large Pages on Steroids: Small Ideas to Accelerate Big
Memory Applications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "101--104",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2495103",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Utilizing small (e.g., 4 KB) pages incurs frequent TLB
misses on modern big memory applications, substantially
degrading the performance of the system. Large (e.g., 1
GB) pages or direct segments can alleviate this penalty
due to page table walks, but at the same time such a
strategy exposes the organizational and operational
details of modern DRAM-based memory systems to
applications. Row-buffer conflicts caused by accesses
heading to the same DRAM bank but different rows from
multiple threads are regarded as the main culprits
behind the very large gaps between peak and achieved
main memory throughput, but hardware-based approaches
in memory controllers have achieved only limited
success whereas existing proposals that change memory
allocators cannot be applied to large pages or direct
segments. In this paper, we propose a set of
application-level techniques to improve the effective
main memory bandwidth. The techniques stem from the two
key observations that (1) each thread of an application
exclusively accesses certain datasets for a short or
long period of time, and (2) superfluous memory reads
originating from a cache's write allocation policy can
be avoided if scatters during the data shuffling pass
through intermediate cache-friendly buffers.
Experiments with a contemporary x86 server show that
combining large pages with the proposed address
linearization, bank coloring, and write streaming
techniques improves the performance of the three big
memory applications of high-throughput key-value store,
fast-Fourier transform, and radix sort by 37.6, 22.9,
and 68.1 percent, respectively.",
acknowledgement = ack-nhfb,
affiliation = "Jung, D (Reprint Author), Seoul Natl Univ, Dept
Transdisciplinary Studies, Seoul, South Korea. Jung,
Daejin; Ahn, Jung Ho, Seoul Natl Univ, Dept
Transdisciplinary Studies, Seoul, South Korea. Li,
Sheng, Intel Labs, Santa Clara, CA USA. Ahn, Jung Ho,
Seoul Natl Univ, Big Data Inst, Seoul, South Korea.",
author-email = "haidj@snu.ac.kr sheng.r.li@intel.com gajh@snu.ac.kr",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Research Foundation of Korea -
Korea government [NRF-2014R1A2A1A11052936,
NRF-2012M3A9D1054622]",
funding-text = "The authors thank Jongwook Chung and Jaeyoon Choi on
their contributions to application writing and
experiments. This work was partially supported by the
National Research Foundation of Korea grant funded by
the Korea government (NRF-2014R1A2A1A11052936 and
NRF-2012M3A9D1054622). Jung Ho Ahn is also with Big
Data Institute, Seoul National University.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "address linearization; application-level techniques;
Bandwidth; bank coloring; big memory applications;
cache storage; cache write allocation policy;
cache-friendly buffers; data shuffling; DRAM bank; DRAM
chips; DRAM-based memory; fast-Fourier transform;
high-throughput key-value store; Instruction sets;
large pages; memory allocators; memory bandwidth;
memory controllers; Memory management; memory
throughput; multi-threading; multiple threads;
Performance gain; Physical-to-DRAM address mapping;
radix sort; Random access memory; row-buffer conflicts;
Servers; superfluous memory reads; write streaming",
number-of-cited-references = "14",
ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Jung:2016:LPS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
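The bank-coloring technique named in the abstract above steers each thread's
private data toward its own DRAM bank so that concurrent threads stop colliding
in the same row buffers. The toy sketch below uses an assumed address-to-bank
mapping (BANK_SHIFT, BANK_BITS, and the helper names are hypothetical; real
DRAM interleaving is platform specific):

    BANK_SHIFT = 13   # assumed position of the bank bits in the address
    BANK_BITS = 4     # assumed 16 banks

    def dram_bank(addr):
        # Bank index under the assumed mapping.
        return (addr >> BANK_SHIFT) & ((1 << BANK_BITS) - 1)

    def colored_base(thread_id, region_base):
        # Offset a thread's buffer so its pages land in bank (thread_id mod 16),
        # keeping concurrent threads out of each other's row buffers.
        target = thread_id % (1 << BANK_BITS)
        shift = (target - dram_bank(region_base)) % (1 << BANK_BITS)
        return region_base + (shift << BANK_SHIFT)

    print(dram_bank(colored_base(5, 0x100000)))   # 5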
@Article{Verdu:2016:PSA,
author = "Javier Verdu and Alex Pajuelo",
title = "Performance Scalability Analysis of {JavaScript}
Applications with {Web} Workers",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "105--108",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2494585",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/java2010.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "Web applications are getting closer to the performance
of native applications taking advantage of new
standard-based technologies. The recent HTML5 standard
includes, among others, the Web Workers API that allows
executing JavaScript applications on multiple threads,
or workers. However, the internals of the browser's
JavaScript virtual machine does not expose direct
relation between workers and running threads in the
browser and the utilization of logical cores in the
processor. As a result, developers do not know how
performance actually scales on different environments
and therefore what is the optimal number of workers on
parallel JavaScript codes. This paper presents the
first performance scalability analysis of parallel web
apps with multiple workers. We focus on two case
studies representative of different worker execution
models. Our analyses show performance scaling on
different parallel processor microarchitectures and on
three major web browsers on the market. In addition, we
study the impact of co-running applications on web
app performance. The results provide insights for
future approaches to automatically determine the optimal
number of workers that provides the best tradeoff
between performance and resource usage, preserving
system responsiveness and user experience, especially
in environments with unexpected changes in system
workload.",
acknowledgement = ack-nhfb,
affiliation = "Verdu, J (Reprint Author), BarcelonaTECH UPC, Dept
Comp Architecture, Barcelona, Spain. Verdu, Javier;
Pajuelo, Alex, BarcelonaTECH UPC, Dept Comp
Architecture, Barcelona, Spain.",
author-email = "jverdu@ac.upc.edu mpajuelo@ac.upc.edu",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Spanish Ministry of Economy and
Competitiveness (MINECO) [TIN2012-34557]",
funding-text = "This work has been supported by the Spanish Ministry
of Economy and Competitiveness (MINECO) under contract
TIN2012-34557.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "application program interfaces; Benchmark testing;
Browsers; Computer architecture; HTML5; HTML5 standard;
hypermedia markup languages; Internet; Java;
javascript; JavaScript applications; Message systems;
Microarchitecture; multithreading; Multithreading;
multithreading; online front-ends; parallel processing;
parallel processor microarchitectures; parallel Web
apps; parallelism; performance scalability analysis;
resource usage; Scalability; standard-based
technologies; system responsiveness preservation; user
experience; Web applications; web apps; Web browsers;
web workers; Web workers API; worker execution models",
number-of-cited-references = "12",
oa = "Green Published",
ORCID-numbers = "Pajuelo, Alex/0000-0002-5510-6860 Verdu Mula,
Javier/0000-0003-4485-2419",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Verdu:2016:PSA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Delimitrou:2016:SID,
author = "Christina Delimitrou and Christos Kozyrakis",
title = "Security Implications of Data Mining in Cloud
Scheduling",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "109--112",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2461215",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Cloud providers host an increasing number of popular
applications, on the premise of resource flexibility
and cost efficiency. Most of these systems expose
virtualized resources of different types and sizes. As
instances share the same physical host to increase
utilization, they contend on hardware resources, e.g.,
last-level cache, making them vulnerable to
side-channel attacks from co-scheduled applications. In
this work we show that using data mining techniques can
help an adversarial user of the cloud determine the
nature and characteristics of co-scheduled applications
and negatively impact their performance through
targeted contention injections. We design Bolt, a
simple runtime that extracts the sensitivity of
co-scheduled applications to various types of
interference and uses this signal to determine the type
of these applications by applying a set of data mining
techniques. We validate the accuracy of Bolt on a
39-server cluster. Bolt correctly identifies the type
and characteristics of 81 percent out of 108 victim
applications, and constructs specialized contention
signals that degrade their performance. We also use
Bolt to find the most commonly-run applications on EC2.
We hope that underlining such security vulnerabilities
in modern cloud facilities will encourage cloud
providers to introduce stronger resource isolation
primitives in their systems.",
acknowledgement = ack-nhfb,
affiliation = "Delimitrou, C (Reprint Author), Stanford Univ, Dept
Elect Engn, Stanford, CA 94305 USA. Delimitrou,
Christina; Kozyrakis, Christos, Stanford Univ, Dept
Elect Engn, Stanford, CA 94305 USA.",
author-email = "cdel@stanford.edu kozyraki@stanford.edu",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "39-server cluster; application studies resulting in
better multiple-processor systems; Bolt; Cloud
computing; cloud computing; cloud facilities; cloud
providers; co-scheduled applications; Computer crime;
cost efficiency; cryptography; data mining; Data
mining; Degradation; Interference; resource allocation;
resource flexibility; resource isolation primitives;
scheduling and task partitioning; security and privacy
protection; security vulnerabilities; Servers;
side-channel attacks; specialized contention signals;
Super (very large) computers; virtualized resources",
number-of-cited-references = "21",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Delimitrou:2016:SID",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Wang:2016:SMF,
author = "Zhenning Wang and Jun Yang and Rami Melhem and Bruce
Childers and Youtao Zhang and Minyi Guo",
title = "Simultaneous Multikernel: Fine-Grained Sharing of
{GPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "113--116",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2477405",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Studies show that non-graphics programs can be less
optimized for the GPU hardware, leading to significant
resource under-utilization. Sharing the GPU among
multiple programs can effectively improve utilization,
which is particularly attractive to systems (e.g.,
cloud computing) where many applications require access
to the GPU. However, current GPUs lack proper
architecture features to support sharing. Initial
attempts are very preliminary in that they either
provide only static sharing, which requires
recompilation or code transformation, or they do not
effectively improve GPU resource utilization. We
propose Simultaneous Multikernel (SMK), a fine-grained
dynamic sharing mechanism, that fully utilizes
resources within a streaming multiprocessor by
exploiting heterogeneity of different kernels. We
extend the GPU hardware to support SMK, and propose
several resource allocation strategies to improve
system throughput while maintaining fairness. Our
evaluation of 45 shared workloads shows that SMK
improves GPU throughput by 34 percent over non-shared
execution and 10 percent over a state-of-the-art
design.",
acknowledgement = ack-nhfb,
affiliation = "Wang, ZN (Reprint Author), Shanghai Jiao Tong Univ,
Dept Comp Sci, Shanghai, Peoples R China. Wang,
Zhenning; Guo, Minyi, Shanghai Jiao Tong Univ, Dept
Comp Sci, Shanghai, Peoples R China. Yang, Jun, Univ
Pittsburgh, Elect \& Comp Engn Dept, Pittsburgh, PA
15260 USA. Melhem, Rami; Childers, Bruce; Zhang,
Youtao, Univ Pittsburgh, Dept Comp Sci, Pittsburgh, PA
15260 USA.",
author-email = "znwang@sjtu.edu.cn juy9@pitt.edu melhem@cs.pitt.edu
childers@cs.pitt.edu zhangyt@cs.pitt.edu
guo-my@cs.sjtu.edu.cn",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Basic Research 973 Program of
China [2015CB352403]; National Natural Science
Foundation of China (NSFC) [61261160502, 61272099]; CSC
scholarship; US National Science Foundation (NSF)
[CNS-1012070, CNS-1305220, CCF-1422331]",
funding-text = "This work is supported in part by the National Basic
Research 973 Program of China (No. 2015CB352403), the
National Natural Science Foundation of China (NSFC)
(Nos. 61261160502, 61272099), the CSC scholarship, US
National Science Foundation (NSF) grants CNS-1012070,
CNS-1305220, and CCF-1422331.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Context switch; fine-grained dynamic sharing
mechanism; GPU; GPU hardware; GPU resource utilization
improvement; graphics processing units; Graphics
processing units; multiprocessing programs;
multiprocessor streaming; multitasking; Multitasking;
multitasking; nongraphic programs; resource allocation;
Resource management; resource under-utilization; SMK;
static sharing; Switches; Throughput",
number-of-cited-references = "17",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Wang:2016:SMF",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Zhang:2016:SIW,
author = "Chulian Zhang and Hamed Tabkhi and Gunar Schirner",
title = "Studying Inter-Warp Divergence Aware Execution on
{GPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "117--120",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2478778",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This letter quantitatively studies the benefits of
inter-warp divergence aware execution on GPUs. To that
end, the letter first proposes a novel approach to
quantify the inter-warp divergence by measuring the
temporal similarity in execution progress of concurrent
warps, which we call Warp Progression Similarity (WPS).
Based on the WPS metric, this letter proposes a
WPS-aware Scheduler (WPSaS) to optimize GPU throughput.
The aim is to manage inter-warp divergence to hide
memory access latency and minimize resource conflicts
and temporal under-utilization in compute units,
allowing GPUs to achieve their peak throughput. Our
results demonstrate that WPSaS improves throughput by
10 percent with a pronounced reduction in resource
conflicts and temporal under-utilization.",
acknowledgement = ack-nhfb,
affiliation = "Zhang, CL (Reprint Author), Northeastern Univ, Dept
Elect \& Comp Engn, Boston, MA 02115 USA. Zhang,
Chulian; Tabkhi, Hamed; Schirner, Gunar, Northeastern
Univ, Dept Elect \& Comp Engn, Boston, MA 02115 USA.",
author-email = "zhang.chul@husky.neu.edu tabkhi@ece.neu.edu
schirner@ece.neu.edu",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [1319501]",
funding-text = "This material is based upon work supported by the
National Science Foundation under Award No. 1319501.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Computer architecture; concurrent
warps; GPU scheduler; GPU throughput optimization;
Graphics processing units; graphics processing units;
Histograms; Inter-warp divergence; interwarp divergence
aware execution; interwarp divergence management;
Measurement; memory access latency hiding; Processor
scheduling; resource allocation; resource conflict
minimization; scheduling; temporal similarity
measurement; temporal underutilization; Throughput;
warp progression similarity; warp progression
similarity (WPS); WPS metric; WPS-aware scheduler;
WPSaS",
number-of-cited-references = "8",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Zhang:2016:SIW",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Tavakkol:2016:TTB,
author = "Arash Tavakkol and Pooyan Mehrvarzy and Hamid
Sarbazi-Azad",
title = "{TBM}: Twin Block Management Policy to Enhance the
Utilization of Plane-Level Parallelism in {SSDs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "121--124",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2461162",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The internal architecture of a SSD provides channel-,
chip-, die- and plane-level parallelism levels, to
concurrently perform multiple data accesses and
compensate for the performance gap between a single
flash chip and host interface. Although a good striping
strategy can effectively exploit the first three
levels, parallel I/O accesses at plane-level can be
performed only for operations of the same types and
page addresses. In this work, we propose the Twin Block
Management (TBM) policy that symmetrically conducts
usage and recycling of the flash block addresses on the
planes of a die, thus enhancing the utilization of
plane-level parallelism for reads, writes and erases.
Evaluation results show that TBM improves IOPS and
response time by up to 73 and 42 percent,
respectively.",
acknowledgement = ack-nhfb,
affiliation = "Tavakkol, A (Reprint Author), Sharif Univ Technol,
Dept Comp Engn, HPCAN Lab, Tehran, Iran. Tavakkol,
Arash; Sarbazi-Azad, Hamid, Sharif Univ Technol, Dept
Comp Engn, HPCAN Lab, Tehran, Iran. Mehrvarzy, Pooyan;
Sarbazi-Azad, Hamid, Inst Res Fundamental Sci IPM, Sch
Comp Sci, Tehran, Iran.",
author-email = "tavakkol@ce.sharif.edu p.mehrvarzy@ipm.ir
azad@ipm.ir",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "channel-level parallelism level; chip-level
parallelism level; die-level parallelism level; flash
block; flash chip; flash memories; Flash memory;
garbage collection; host interface; IOPS; memory
architecture; multiple data accesses; parallel
processing; Parallel processing; performance
evaluation; plane-level parallelism; plane-level
parallelism level; Recycling; Resource management;
response time; Solid state circuits; solid-state drive;
SSD internal architecture; TBM; Time factors; twin
block management",
number-of-cited-references = "11",
ORCID-numbers = "Tavakkol, Arash/0000-0003-3859-1259",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Tavakkol:2016:TTB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Jacob:2016:PPT,
author = "Bruce Jacob",
title = "The 2 {PetaFLOP}, 3 Petabyte, 9 {TB/s}, 90 {kW}
Cabinet: a System Architecture for Exascale and Big
Data",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "125--128",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2451652",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We present a system architecture that uses
high-efficiency processors as opposed to
high-performance processors, NAND flash as
byte-addressable main memory, and high-speed DRAM as a
cache front-end for the flash. The main memory system
is interconnected and presents a unified global address
space to the client microprocessors. A single cabinet
contains 2,550 nodes, networked in a highly redundant
modified Moore graph that yields a bisection bandwidth
of 9.1 TB/s and a worst-case latency of four hops from
any node to any other. At a per-cabinet level, the
system supports a minimum of 2.6 petabytes of main
memory, dissipates 90 kW, and achieves 2.2 PetaFLOPS.
The system architecture provides several features
desirable in today's large-scale systems, including a
global shared physical address space (and optional
support for a global shared virtual space as well), the
ability to partition the physical space unequally among
clients as in a unified cache architecture (e.g., so as
to support multiple VMs in a datacenter), pairwise
system-wide sequential consistency on user-specified
address sets, built-in checkpointing via journaled
non-volatile main memory, memory cost-per-bit
approaching that of NAND flash, and memory performance
approaching that of pure DRAM.",
acknowledgement = ack-nhfb,
affiliation = "Jacob, B (Reprint Author), Univ Maryland, Elect \&
Comp Engn, College Pk, MD 20742 USA. Jacob, Bruce, Univ
Maryland, Elect \& Comp Engn, College Pk, MD 20742
USA.",
author-email = "blj@umd.edu",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Big Data; Big data; Big Data; bisection
bandwidth; built-in checkpointing; byte-addressable
main memory; cache storage; checkpointing; DRAM chips;
exascale computing; extremely large; extremely large,
high radix network topologies; flash memories; High
performance computing; high-efficiency processors;
high-performance processors; High-performance
computing; high-radix network topologies; high-speed
DRAM; journaled main memory; memory architecture;
Memory management; memory performance; microprocessor
chips; microprocessors; NAND flash; Network topology;
nonvolatile main memory; pairwise system-wide
sequential consistency; parallel architectures;
PetaFLOP; Ports (Computers); Program processors; Random
access memory; redundant modified Moore graph; system
architecture; user-specified address sets",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Jacob:2016:PPT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Xiao:2016:TAC,
author = "He Xiao and Wen Yueh and Saibal Mukhopadhyay and
Sudhakar Yalamanchili",
title = "Thermally Adaptive Cache Access Mechanisms for {3D}
Many-Core Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "129--132",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2495125",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "A compelling confluence of technology and application
trends in which the cost, execution time, and energy of
applications are being dominated by the memory system
is driving the industry to 3D packages for future
microarchitectures. However, these packages result in
high heat fluxes and increased thermal coupling,
challenging current thermal solutions. Conventional
design approaches employ design margins that
correspond to worst-case temperatures and process
corners, leading to a significant impact on system-level
performance. This paper advocates a design approach
based on microarchitecture adaptation to device-level
temperature-dependent delay variations to realize
average-case performance superior to what can
be achieved by using worst-case design margins. We
demonstrate this approach with adaptation principles
for the last level cache (LLC) in a 3D many-core
architecture. We propose and evaluate two adaptation
mechanisms. In the first case, the access time to the
LLC from the L1 tracks the LLC's temperature-delay
variations. In the second case, the processor DVFS
state tracks the LLC temperature as a negative
feedback. Compared to a worst case design baseline, the
full system simulation results show that both
approaches increase the IPC by over 20 percent, and
improve the energy efficiency by up to 3 percent.",
acknowledgement = ack-nhfb,
affiliation = "Xiao, H (Reprint Author), Georgia Inst Technol, Sch
Elect \& Comp Engn, Atlanta, GA 30332 USA. Xiao, He;
Yueh, Wen; Mukhopadhyay, Saibal; Yalamanchili,
Sudhakar, Georgia Inst Technol, Sch Elect \& Comp Engn,
Atlanta, GA 30332 USA.",
author-email = "hxiao@gatech.edu wyueh3@gatech.edu
saibal.mukhopadhyay@ece.gatech.edu
sudha.yalamanchili@ece.gatech.edu",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Semiconductor Research Corporation under
SRC [2318.001]; National Science Foundation
[CNS-0855110]",
funding-text = "This research is supported and sponsored by the
Semiconductor Research Corporation under SRC task
2318.001, and the National Science Foundation under
grant CNS-0855110.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "3D IC; 3D IC, SRAM cache, adaptive architecture,
performance gain, energy efficiency; 3D many-core
architectures; Adaptation models; adaptive
architecture; Cache memory; cache storage; Computer
architecture; device-level temperature-dependent delay
variations; energy efficiency; integrated circuit
design; Integrated circuit modeling; last level cache;
LLC temperature; memory architecture;
Microarchitecture; microarchitecture adaptation;
microarchitectures; multiprocessing systems;
performance evaluation; performance gain; power aware
computing; processor DVFS state; Random access memory;
SRAM cache; system level performance; thermal coupling
challenging current thermal solutions; thermally
adaptive cache access mechanisms; Three-dimensional
displays",
number-of-cited-references = "13",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Xiao:2016:TAC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Hu:2016:TDM,
author = "Qi Hu and Peng Liu and Michael C. Huang",
title = "Threads and Data Mapping: Affinity Analysis for
Traffic Reduction",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "133--136",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2451172",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Modern processors spend significant amount of time and
energy moving data. With the increase in core count,
the relative importance of such latency and energy
expenditure will only increase with time. Inter-core
communication traffic when executing a multithreaded
application is one such source of latency and energy
expenditure. This traffic is influenced by the mapping
of threads and data onto multicore systems. This paper
investigates the impact of threads and data mapping on
traffic in a chip-multiprocessor, and exploits the
potential for traffic reduction through threads and
data mapping. Based on the analysis and estimation of
the lowest traffic, we propose a threads and data
mapping mechanism to approach the lowest traffic. The
mapping takes both the correlation among threads and
the affinity of data with individual threads into
account, and results in significant traffic reduction
and energy savings.",
acknowledgement = ack-nhfb,
affiliation = "Liu, P (Reprint Author), Zhejiang Univ, Coll Informat
Sci \& Elect Engn, Hangzhou 310027, Peoples R China.
Hu, Qi; Liu, Peng, Zhejiang Univ, Coll Informat Sci \&
Elect Engn, Hangzhou 310027, Peoples R China. Huang,
Michael C., Univ Rochester, Dept Elect \& Comp Engn,
601 Elmwood Ave, Rochester, NY 14627 USA.",
author-email = "huqi\_isee@zju.edu.cn liupeng@zju.edu.cn
michael.huang@rochester.edu",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSFC [61028004]; US National Science
Foundation (NSF) [1217662, 1255729]; Open Project
Program of the State Key Laboratory of Mathematical
Engineering and Advanced Computing [2014A08, 2015A09]",
funding-text = "This work was supported by NSFC under grant 61028004,
and also in part by US National Science Foundation
(NSF) under grants 1217662 and 1255729, and the Open
Project Program of the State Key Laboratory of
Mathematical Engineering and Advanced Computing under
grants 2014A08 and 2015A09. P. Liu is the corresponding
author.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "affinity analysis; chip-multiprocessor; Correlation;
data mapping; energy conservation; energy savings;
Instruction sets; intercore communication traffic;
Mapping; memory; Message systems; microprocessor chips;
modern processors; multi-threading; multicore;
Multicore processing; multicore systems;
multiprocessing systems; multithreaded application;
network-on-chip; Network-on-chip; network-on-chip;
Statistical analysis; thread mapping; traffic; traffic
reduction",
keywords-plus = "NETWORKS; CACHES; CHIP",
number-of-cited-references = "11",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Hu:2016:TDM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2016:TCb,
author = "Anonymous",
title = "Table of Contents",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "C1--C1",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2628298",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2016:Ce,
author = "Anonymous",
title = "Cover",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "C2--C2",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2628299",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2016:Cf,
author = "Anonymous",
title = "Cover",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "C3--C3",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2628301",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2016:TCBa,
author = "Anonymous",
title = "Table of contents [back cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "15",
number = "2",
pages = "C4--C4",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2628302",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Beckmann:2017:CCM,
author = "Nathan Beckmann and Daniel Sanchez",
title = "Cache Calculus: Modeling Caches through Differential
Equations",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "1--5",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2512873",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Caches are critical to performance, yet their behavior
is hard to understand and model. In particular, prior
work does not provide closed-form solutions of cache
performance, i.e., simple expressions for the miss rate
of a specific access pattern. Existing cache models
instead use numerical methods that, unlike closed-form
solutions, are computationally expensive and yield
limited insight. We present cache calculus, a technique
that models cache behavior as a system of ordinary
differential equations, letting standard calculus
techniques find simple and accurate solutions of cache
performance for common access patterns.",
acknowledgement = ack-nhfb,
affiliation = "Beckmann, N (Reprint Author), MIT CSAIL, Cambridge, MA
02139 USA. Beckmann, Nathan; Sanchez, Daniel, MIT
CSAIL, Cambridge, MA 02139 USA.",
author-email = "beckmann@csail.mit.edu sanchez@csail.mit.edu",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CCF-1318384]; Qatar Computing Research
Institute",
funding-text = "This work was supported in part by NSF grant
CCF-1318384 and a grant from the Qatar Computing
Research Institute.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Arrays; cache behavior models; cache calculus; cache
memory; cache storage; closed-form solutions;
Closed-form solutions; Computational modeling;
Computer architecture; computer architecture;
differential equations; Differential equations;
mathematical model; Mathematical model; miss
rate; Numerical models; ordinary differential
equations",
number-of-cited-references = "8",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Beckmann:2017:CCM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2017:IIC,
author = "Anonymous",
title = "2016 Index {{\booktitle{IEEE Computer Architecture
Letters}}} Vol. 15",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "1--6",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2653771",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Zhan:2017:CCS,
author = "Xin Zhan and Reza Azimi and Svilen Kanev and David
Brooks and Sherief Reda",
title = "{CARB}: a {C}-State Power Management Arbiter for
Latency-Critical Workloads",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "6--9",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2537802",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Latency-critical workloads in datacenters have tight
response time requirements to meet service-level
agreements (SLAs). Sleep states (c-states) enable
servers to reduce their power consumption during idle
times; however, entering and exiting c-states is not
instantaneous, leading to increased transaction
latency. In this paper we propose a c-state arbitration
technique, CARB, that minimizes response time, while
simultaneously realizing the power savings that could
be achieved from enabling c-states. CARB adapts to
incoming request rates and processing times and
activates the smallest number of cores for processing
the current load. CARB reshapes the distribution of
c-states and minimizes the latency cost of sleep by
avoiding going into deep sleeps too often. We quantify
the improvements from CARB with memcached running on an
8-core Haswell-based server.",
acknowledgement = ack-nhfb,
affiliation = "Zhan, X (Reprint Author), Brown Univ, Providence, RI
02906 USA. Zhan, Xin; Azimi, Reza; Reda, Sherief, Brown
Univ, Providence, RI 02906 USA. Kanev, Svilen; Brooks,
David, Harvard Univ, Cambridge, MA 02138 USA.",
author-email = "xin\_zhan@brown.edu reza\_azimi@brown.edu
skanev@eecs.harvard.edu dbrooks@eecs.harvard.edu
sherief\_reda@brown.edu",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [1305148, 1438958]",
funding-text = "The authors would like to thank the anonymous
reviewers for their comments. The research of X. Zhan,
R. Azimi, and S. Reda was supported by NSF under Grants
1305148 and 1438958.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "c-state; c-state arbitration technique; c-state
distribution; c-state power management arbiter; cache
storage; CARB; computer centres; contracts;
datacenters; Delays; energy-efficient; feedback
controller; Haswell-based server; idle times; latency
cost minimization; Latency-critical workloads;
latency-critical workloads; memcached; Monitoring;
Optimization; power aware computing; power consumption;
Power demand; power savings; processing times; request
rates; response time minimization; Servers;
service-level agreements; SLA; sleep states; Time
factors; workload consolidation",
number-of-cited-references = "10",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Zhan:2017:CCS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Jeon:2017:CCA,
author = "Dong-Ik Jeon and Ki-Seok Chung",
title = "{CasHMC}: a Cycle-Accurate Simulator for Hybrid Memory
Cube",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "10--13",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2600601",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "3D-stacked DRAM has been actively studied to overcome
the limits of conventional DRAM. The Hybrid Memory Cube
(HMC) is a type of 3D-stacked DRAM that has drawn great
attention because of its usability for server systems
and processing-in-memory (PIM) architecture. Since HMC
is not directly stacked on the processor die where the
central processing units (CPUs) and graphics processing
units (GPUs) are integrated, HMC has to be linked to
other processor components through high-speed serial
links. Therefore, the communication bandwidth and
latency should be carefully estimated to evaluate the
performance of HMC. However, most existing HMC
simulators employ only simple HMC modeling. In this
paper, we propose a cycle-accurate simulator for hybrid
memory cube called CasHMC. It provides a cycle-by-cycle
simulation of every module in an HMC and generates
analysis results including a bandwidth graph and
statistical data. Furthermore, CasHMC is implemented in
C++ as a single wrapped object that includes an HMC
controller, communication links, and HMC memory.
Instantiating this single wrapped object facilitates
simultaneous simulation in parallel with other
simulators that generate memory access patterns such as
a processor simulator or a memory trace generator.",
acknowledgement = ack-nhfb,
affiliation = "Jeon, DI (Reprint Author), Hanyang Univ, Dept Elect \&
Comp Engn, Seoul 04763, South Korea. Jeon, Dong-Ik;
Chung, Ki-Seok, Hanyang Univ, Dept Elect \& Comp Engn,
Seoul 04763, South Korea.",
author-email = "estwingz@naver.com kchung@hanyang.ac.kr",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Institute for Information \& communications
Technology Promotion (IITP) --- Korea government (MSIP)
[R7119-16-1009]",
funding-text = "This work was supported by Institute for Information
\& communications Technology Promotion (IITP) grant
funded by the Korea government (MSIP) (R7119-16-1009,
Development of Intelligent Semiconductor Core
Technologies for IoT Devices based on Harvest Energy).
Ki-Seok Chung is the corresponding author.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "3D-stacked DRAM; Analytical models; Bandwidth;
bandwidth graph; Benchmark testing; CasHMC; central
processing units; communication bandwidth;
communication links; Computational modeling; Computer
architecture; CPU; cycle-accurate simulator;
cycle-by-cycle simulation; C++; DRAM
chips; GPU; graph theory; graphic processing units;
high-speed serial links; HMC controller; HMC memory;
HMC simulators; hybrid memory cube; latency; memory
access patterns; memory architecture; Memory control
and access; memory design; memory trace generator;
modeling of computer architecture; performance
evaluation; PIM architecture; processing-in-memory
architecture; processor simulator; Random access
memory; server systems; simulation; Simulation;
single-wrapped object instantiation;
statistical analysis; statistical data",
number-of-cited-references = "10",
ORCID-numbers = "CHUNG, KI-SEOK/0000-0002-2908-8443 Jeon,
Dong-Ik/0000-0002-8572-4184",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Jeon:2017:CCA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Wu:2017:CSB,
author = "Hao Wu and Fangfei Liu and Ruby B. Lee",
title = "Cloud Server Benchmark Suite for Evaluating New
Hardware Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "14--17",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2597818",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Adding new hardware features to a cloud computing
server requires testing both the functionality and the
performance of the new hardware mechanisms. However,
commonly used cloud computing server workloads are not
well-represented by the SPEC integer and floating-point
benchmarks and Parsec suites typically used by the
computer architecture community. Existing cloud
benchmark suites for scale-out or scale-up computing
are not representative of the most common cloud usage,
and are very difficult to run on a cycle-accurate
simulator that can accurately model new hardware, like
gem5. In this paper, we present PALMScloud, a suite of
cloud computing benchmarks for performance evaluation
of cloud servers that is ready to run on the gem5
cycle-accurate simulator. We conduct a behavior
characterization and analysis of the benchmarks. We
hope that these cloud benchmarks, ready to run on a
dual-machine gem5 simulator or on real machines, can be
useful to other researchers interested in improving
hardware micro-architecture and cloud server
performance.",
acknowledgement = ack-nhfb,
affiliation = "Wu, H (Reprint Author), Princeton Univ, Princeton, NJ
08544 USA. Wu, Hao; Liu, Fangfei; Lee, Ruby B.,
Princeton Univ, Princeton, NJ 08544 USA.",
author-email = "haow.princeton@gmail.com fangfeil@princeton.edu
rblee@princeton.edu",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "DHS/AFRL [FA8750-12-2-0295]; National
Science Foundation [CNS-1218817]",
funding-text = "This work was supported in part by DHS/AFRL
FA8750-12-2-0295 and US National Science Foundation
CNS-1218817.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "behavior characterization; Benchmark testing;
benchmarks; Cloud Computing; Cloud computing; cloud
computing; cloud computing benchmarks; cloud computing
server workloads; cloud server benchmark; cloud
servers; cloud usage; computer architecture; computer
architecture community; cycle accurate simulator; dual
machine gem5 simulator; floating-point benchmark; gem5;
Hardware; new hardware architectures; new hardware
mechanisms; Parsec; performance evaluation; Performance
evaluation; scale-out computing; scale-up computing;
simulation; SPEC integer",
number-of-cited-references = "8",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Wu:2017:CSB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Seyedzadeh:2017:CBT,
author = "Seyed Mohammad Seyedzadeh and Alex K. Jones and Rami
Melhem",
title = "Counter-Based Tree Structure for Row Hammering
Mitigation in {DRAM}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "18--21",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2614497",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Scaling down DRAM technology degrades cell reliability
due to increased coupling between adjacent DRAM cells,
commonly referred to as crosstalk. Moreover, high
access frequency of certain cells (hot cells) may cause
data loss in neighboring cells in adjacent rows due to
crosstalk, which is known as row hammering. In this
work, the goal is to mitigate row hammering in DRAM
cells through a Counter-Based Tree (CBT) approach. This
approach uses a tree of counters to detect hot rows and
then refreshes neighboring cells. In contrast to
existing deterministic solutions, CBT utilizes fewer
counters that makes it practically feasible to be
implemented on-chip. Compared to existing probabilistic
approaches, CBT more precisely refreshes rows
vulnerable to row hammering based on their access
frequency. Experimental results on workloads from three
benchmark suites show that CBT can reduce the refresh
energy by more than 60 percent and nearly 70 percent in
comparison to leading probabilistic and deterministic
approaches, respectively. Furthermore, hardware
evaluation shows that CBT can be easily implemented
on-chip with only a nominal overhead.",
acknowledgement = ack-nhfb,
affiliation = "Seyedzadeh, SM (Reprint Author), Univ Pittsburgh, Dept
Comp Sci, Pittsburgh, PA 15260 USA. Seyedzadeh, Seyed
Mohammad; Melhem, Rami, Univ Pittsburgh, Dept Comp Sci,
Pittsburgh, PA 15260 USA. Jones, Alex K., Univ
Pittsburgh, Dept Elect \& Comp Engn, Pittsburgh, PA
15260 USA.",
author-email = "seyedzadeh@cs.pitt.edu akjones@pitt.edu
melhem@cs.pitt.edu",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CCF-1064976]; SGMI grant from Samsung
electronics",
funding-text = "This work is supported by NSF grants CCF-1064976 and
an SGMI grant from Samsung electronics. We thank the
anonymous reviewers for their feedback.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "CBT; Computer architecture; counter-based tree
structure; crosstalk; Crosstalk; DRAM; DRAM
chips; dynamic random-access memory; Microprocessors;
Radiation detectors; Random access memory; reliability;
Reliability; row hammering mitigation;
System-on-chip",
keywords-plus = "REFRESH; MEMORY",
number-of-cited-references = "17",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "3",
unique-id = "Seyedzadeh:2017:CBT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Naghibijouybari:2017:CCG,
author = "Hoda Naghibijouybari and Nael Abu-Ghazaleh",
title = "Covert Channels on {GPGPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "22--25",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2590549",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "GPUs are increasingly used to accelerate the
performance of not only graphics workloads, but also
data intensive applications. In this paper, we explore
the feasibility of covert channels in General Purpose
Graphics Processing Units (GPGPUs). We consider the
possibility of two colluding malicious applications
using the GPGPU as a covert channel to communicate, in
the absence of a direct channel between them. Such a
situation may arise in cloud environments, or in
environments employing containment mechanisms such as
dynamic information flow tracking. We reverse engineer
the block placement algorithm to understand
co-residency of blocks from different applications on
the same Streaming Multiprocessor (SM) core, or on
different SMs concurrently. In either mode, we identify
the shared resources that may be used to create
contention. We demonstrate the bandwidth of two example
channels: one that uses the L1 constant memory cache to
enable communication on the same SM, and another that
uses the L2 constant memory caches to enable
communication between different SMs. We also examine
the possibility of increasing the bandwidth of the
channel by using the available parallelism on the GPU,
achieving a bandwidth of over 400 Kbps. This study
demonstrates that GPGPUs are a feasible medium for
covert communication.",
acknowledgement = ack-nhfb,
affiliation = "Naghibijouybari, H (Reprint Author), Univ Calif
Riverside, Dept Comp Sci \& Engn, Riverside, CA 92521
USA. Naghibijouybari, Hoda; Abu-Ghazaleh, Nael, Univ
Calif Riverside, Dept Comp Sci \& Engn, Riverside, CA
92521 USA.",
author-email = "hnagh001@ucr.edu naelag@ucr.edu",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "US National Science Foundation
[CNS-1422401]",
funding-text = "This work is partially supported by US National
Science Foundation grant CNS-1422401.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Acceleration; Bandwidth; block placement algorithm;
cache storage; Computer architecture; covert channel;
general purpose graphics processing units; GPGPU;
Graphics processing units; graphics processing units;
Kernel; L1 constant memory cache; L2 constant memory
caches; malicious applications; multiprocessing
systems; Security; security of data; SM core; streaming
multiprocessor core; Trojan horses",
number-of-cited-references = "23",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Naghibijouybari:2017:CCG",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Song:2017:EPU,
author = "Wonjun Song and Hyung-Joon Jung and Jung Ho Ahn and
Jae W. Lee and John Kim",
title = "Evaluation of Performance Unfairness in {NUMA} System
Architecture",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "26--29",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2602876",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "NUMA (Non-uniform memory access) system architectures
are commonly used in high-performance computing and
datacenters. Within each architecture, a
processor-interconnect is used for communication
between the different sockets; examples of such
interconnects include Intel QPI and AMD HyperTransport.
In this work, we explore the impact of the
processor-interconnect on overall performance; in
particular, we explore the impact on performance
fairness from the processor-interconnect arbitration.
It is well known that locally-fair arbitration does not
guarantee globally-fair bandwidth sharing as closer
nodes receive more bandwidth in a multi-hop network.
However, this paper is the first to demonstrate the
opposite can occur in commodity NUMA servers, where
remote nodes receive higher bandwidth (and perform
better). This problem occurs because router
micro-architectures for processor-interconnects
commonly employ external concentration. While accessing
remote memory can occur in any NUMA system, performance
unfairness (or performance variation) is more critical
in cloud computing and virtual machines with shared
resources. We demonstrate how this unfairness creates
significant performance variation when executing
workloads on the Xen virtualization platform. We then
provide analysis using synthetic workloads to better
understand the source of unfairness.",
acknowledgement = ack-nhfb,
affiliation = "Song, W (Reprint Author), Korea Adv Inst Sci \&
Technol, Daejeon, South Korea. Song, Wonjun; Jung,
Hyung-Joon; Kim, John, Korea Adv Inst Sci \& Technol,
Daejeon, South Korea. Ahn, Jung Ho; Lee, Jae W., Seoul
Natl Univ, Seoul, South Korea.",
author-email = "iamwonjunsong@kaist.edu hans7taiji@kaist.edu
gajh@snu.ac.kr jaewlee@snu.ac.kr jjk12@kaist.edu",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Google Faculty Research Award, National
Research Foundation of Korea [NRF-2013R1A2A2A01069132,
NRF-2014R1A2A1A11052936, NRF-2015M3C4A7065647]; MSIP
under the ITRC [IITP-2016-H8501-16-1005]",
funding-text = "This work was supported in part by Google Faculty
Research Award, National Research Foundation of Korea
(NRF-2013R1A2A2A01069132, NRF-2014R1A2A1A11052936, and
NRF-2015M3C4A7065647), and in part by MSIP under the
ITRC (IITP-2016-H8501-16-1005).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "AMD HyperTransport; Bandwidth; cloud computing;
globally-fair bandwidth sharing; high-performance
computing; Intel QPI; locally-fair arbitration; memory
architecture; Micromechanical devices; multihop
network; Multiprocessor interconnection; nonuniform
memory access system architectures; NUMA; NUMA system
architecture; parallel processing; performance
unfairness evaluation; processor-interconnect;
processor-interconnect arbitration;
processor-interconnects; router microarchitectures;
Servers; shared resources; Sockets; System-on-chip;
unfairness; virtual machines; Virtual machining; Xen
virtualization platform",
number-of-cited-references = "8",
research-areas = "Computer Science",
researcherid-numbers = "Kim, John/C-1792-2011",
times-cited = "1",
unique-id = "Song:2017:EPU",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Verner:2017:EAL,
author = "Uri Verner and Avi Mendelson and Assaf Schuster",
title = "Extending {Amdahl's Law} for Multicores with Turbo
Boost",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "30--33",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2512982",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Rewriting sequential programs to make use of multiple
cores requires considerable effort. For many years,
Amdahl's law has served as a guideline to assess the
performance benefits of parallel programs over
sequential ones, but recent advances in multicore
design introduced variability in the performance of the
cores and motivated the reexamination of the underlying
model. This paper extends Amdahl's law for multicore
processors with built-in dynamic frequency scaling
mechanisms such as Intel's Turbo Boost. Using a model
that captures performance dependencies between cores,
we present tighter upper bounds for the speedup and
reduction in energy consumption of a parallel program
over a sequential one on a given multicore processor
and validate them on Haswell and Sandy Bridge Intel
CPUs. Previous studies have shown that from a processor
design perspective, Turbo Boost mitigates the speedup
limitations obtained under Amdahl's law by providing
higher performance for the same energy budget. However,
our new model and evaluation show that from a software
development perspective, Turbo Boost aggravates these
limitations by making parallelization of sequential
codes less profitable.",
acknowledgement = ack-nhfb,
affiliation = "Verner, U (Reprint Author), Technion, Dept Comp Sci,
Haifa, Israel. Verner, Uri; Mendelson, Avi; Schuster,
Assaf, Technion, Dept Comp Sci, Haifa, Israel.",
author-email = "uriv@cs.technion.ac.il avi.mendelson@cs.technion.ac.il
assaf@cs.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Amdahl law; Amdahl's law; Bridges; code
parallelization; Computational modeling; dynamic
frequency scaling mechanisms; energy consumption;
Energy consumption; Haswell;
multicore; multicore design; Multicore processing;
multicore processors; multiple cores; multiprocessing
systems; parallel programming; parallel programs;
Performance modeling; Power demand; Program processors;
Sandy Bridge Intel CPU; sequential code
parallelization; sequential program rewriting; software
development perspective; software engineering; Time
measurement; turbo boost; Turbo Boost",
number-of-cited-references = "12",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Verner:2017:EAL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Sasaki:2017:HTP,
author = "Hiroshi Sasaki and Fang-Hsiang Su and Teruo Tanimoto
and Simha Sethumadhavan",
title = "Heavy Tails in Program Structure",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "34--37",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2574350",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Designing and optimizing computer systems require deep
understanding of the underlying system behavior.
Historically many important observations that led to
the development of essential hardware and software
optimizations were driven by empirical observations
about program behavior. In this paper, we report an
interesting property of program structures by viewing
dynamic program execution as a changing network. By
analyzing the communication network created as a result
of dynamic program execution, we find that
communication patterns follow heavy-tailed
distributions. In other words, a few instructions have
orders of magnitude more consumers than most
instructions in a program. Surprisingly, these
heavy-tailed distributions follow the iconic power law
previously seen in man-made and natural networks. We
provide empirical measurements based on the SPEC
CPU2006 benchmarks to validate our findings as well as
perform semantic analysis of the source code to reveal
the causes of such behavior.",
acknowledgement = ack-nhfb,
affiliation = "Sasaki, H (Reprint Author), Columbia Univ, Dept Comp
Sci, New York, NY 10027 USA. Sasaki, Hiroshi; Su,
Fang-Hsiang; Sethumadhavan, Simha, Columbia Univ, Dept
Comp Sci, New York, NY 10027 USA. Tanimoto, Teruo,
Kyushu Univ, Grad Sch Informat Sci \& Elect Engn,
Fukuoka 8190395, Japan.",
author-email = "sasaki@cs.columbia.edu mikefhsu@cs.columbia.edu
teruo.tanimoto@cpc.ait.kyushu-u.ac.jp
simha@cs.columbia.edu",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "JSPS Postdoctoral Fellowships for Research
Abroad; US National Science Foundation [1302269];
Alfred P. Sloan Fellowship",
funding-text = "This work is sponsored in part by JSPS Postdoctoral
Fellowships for Research Abroad, US National Science
Foundation award number 1302269 and Alfred P. Sloan
Fellowship. This work was done while Teruo Tanimoto was
a visiting student at Columbia University.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Communication networks; computer
systems; Computers; dynamic program execution;
empirical studies; hardware optimization; heavy-tailed
distribution; Image edge detection; Optimization;
Program characterization; program diagnostics; program
structure; Registers; semantic analysis; Shape;
software optimization; SPEC CPU2006 benchmarks;
statistical distribution; statistical distributions;
system behavior",
number-of-cited-references = "9",
oa = "Bronze",
research-areas = "Computer Science",
researcherid-numbers = "Sasaki, Hiroshi/N-8579-2019",
times-cited = "1",
unique-id = "Sasaki:2017:HTP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Feng:2017:HHC,
author = "Liang Feng and Hao Liang and Sharad Sinha and Wei
Zhang",
title = "{HeteroSim}: a Heterogeneous {CPU--FPGA} Simulator",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "38--41",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2615617",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Heterogeneous Computing is a promising direction to
address the challenges of performance and power walls
in high-performance computing, where CPU-FPGA
architectures are particularly promising for
application acceleration. However, the development of
such architectures associated with optimal memory
hierarchies is challenging due to the absence of an
integrated simulator to support full system simulation
and architectural exploration. In this work, we present
HeteroSim, a full system simulator supporting x86
multi-cores integrated with an FPGA via bus connection.
It can support fast architectural exploration with
respect to the number of cores, the number of accelerated
kernels on the FPGA, and different memory hierarchies
between the CPU and FPGA. Various performance metrics are
returned for further performance analysis and
architectural configuration optimization.",
acknowledgement = ack-nhfb,
affiliation = "Feng, L (Reprint Author), Hong Kong Univ Sci \&
Technol, Kowloon, Hong Kong, Peoples R China. Feng,
Liang; Liang, Hao; Sinha, Sharad; Zhang, Wei, Hong Kong
Univ Sci \& Technol, Kowloon, Hong Kong, Peoples R
China.",
author-email = "lfengad@connect.ust.hk hliangac@connect.ust.hk
sharad\_sinha@ieee.org wei.zhang@ust.hk",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Acceleration; architectural configuration
optimization; bus connection; Computational modeling;
Computer architecture; CPU-FPGA architectures; digital
simulation; Field programmable gate arrays; field
programmable gate arrays; FPGA; full system simulator;
Hardware design languages; heterogeneous computing;
heterogeneous CPU-FPGA simulator; heterogeneous system;
HeteroSim; high-performance computing; Kernel;
microprocessor chips; multiprocessing systems; optimal
memory hierarchies; parallel architectures; performance
analysis; performance metrics; Registers; Simulator;
x86 multicores",
number-of-cited-references = "11",
ORCID-numbers = "SINHA, SHARAD/0000-0002-4532-2017",
research-areas = "Computer Science",
researcherid-numbers = "SINHA, SHARAD/J-6775-2019 SINHA,
SHARAD/R-2575-2017",
times-cited = "1",
unique-id = "Feng:2017:HHC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Zhao:2017:LIC,
author = "Xia Zhao and Yuxi Liu and Almutaz Adileh and Lieven
Eeckhout",
title = "{LA-LLC}: Inter-Core Locality-Aware Last-Level Cache
to Exploit Many-to-Many Traffic in {GPGPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "42--45",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2611663",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The reply network is a severe performance bottleneck
in General Purpose Graphics Processing Units (GPGPUs),
as the communication path from memory controllers (MC)
to cores is often congested. In this paper, we find
that instead of relying on the congested communication
path between MCs and cores, the unused core-to-core
communication path can be leveraged to transfer data
blocks between cores. We propose the inter-core
Locality-Aware Last-Level Cache (LA-LLC), which
requires only a few bits per cache block and enables a
core to fetch shared data from another core's private
cache instead of the LLC. Leveraging inter-core
communication, LA-LLC transforms few-to-many traffic to
many-to-many traffic, thereby mitigating the reply
network bottleneck. For a set of applications
exhibiting varying degrees of inter-core locality,
LA-LLC reduces memory access latency and increases
performance by 21.1 percent on average and up to 68
percent, with negligible hardware cost.",
acknowledgement = ack-nhfb,
affiliation = "Zhao, X (Reprint Author), Univ Ghent, Ghent, Belgium.
Zhao, Xia; Liu, Yuxi; Adileh, Almutaz; Eeckhout,
Lieven, Univ Ghent, Ghent, Belgium.",
author-email = "xia.zhao@ugent.be yuxi.liu@ugent.be
almutaz.adileh@ugent.be lieven.eeckhout@ugent.be",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Benchmark testing; cache storage; congested
communication path; core-to-core communication path;
few-to-many traffic; general purpose graphic processing
units; GPGPU; GPGPUs; Graphics processing units;
graphics processing units; inter-core locality;
intercore communication; intercore locality-aware
last-level cache; LA-LLC; LLC; many-to-many traffic;
memory access latency; memory controllers;
Multiprocessor interconnection; network-on-chip; NoC;
Ports (Computers); private cache; reply network; shared
data fetching; System recovery",
number-of-cited-references = "16",
oa = "Green Published",
ORCID-numbers = "Zhao, Xia/0000-0001-6479-9200",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Zhao:2017:LIC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Boroumand:2017:LEC,
author = "Amirali Boroumand and Saugata Ghose and Minesh Patel
and Hasan Hassan and Brandon Lucia and Kevin Hsieh and
Krishna T. Malladi and Hongzhong Zheng and Onur Mutlu",
title = "{LazyPIM}: an Efficient Cache Coherence Mechanism for
Processing-in-Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "46--50",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2577557",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Processing-in-memory (PIM) architectures cannot use
traditional approaches to cache coherence due to the
high off-chip traffic consumed by coherence messages.
We propose LazyPIM, a new hardware cache coherence
mechanism designed specifically for PIM. LazyPIM uses a
combination of speculative cache coherence and
compressed coherence signatures to greatly reduce the
overhead of keeping PIM coherent with the processor. We
find that LazyPIM improves average performance across a
range of PIM applications by 49.1 percent over the best
prior approach, coming within 5.5 percent of an ideal
PIM mechanism.",
acknowledgement = ack-nhfb,
affiliation = "Boroumand, A (Reprint Author), Carnegie Mellon Univ,
Pittsburgh, PA 15123 USA. Boroumand, Amirali; Ghose,
Saugata; Patel, Minesh; Hassan, Hasan; Lucia, Brandon;
Hsieh, Kevin; Mutlu, Onur, Carnegie Mellon Univ,
Pittsburgh, PA 15123 USA. Hassan, Hasan, TOBB ETU
Sogutozu, TR-06560 Ankara, Turkey. Malladi, Krishna T.;
Zheng, Hongzhong, Samsung Semicond Inc, Milpitas, CA
95035 USA. Mutlu, Onur, ETH, Ramistr, CH-8092 Zurich,
Switzerland.",
author-email = "amirali@cmu.edu ghose@cmu.edu mineshp@andrew.cmu.edu
hhasan@etu.edu.tr blucia@andrew.cmu.edu
tsuwangh@andrew.cmu.edu k.tej@ssi.samsung.com
hz.zheng@ssi.samsung.com omutlu@gmail.com",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; cache coherence mechanism; cache storage;
Coherence; coherence messages; compressed coherence;
Computer architecture; Kernel; LazyPIM mechanism;
Message systems; PIM architecture;
processing-in-memory; Programming; Random access
memory; speculative cache coherence",
keywords-plus = "CONSISTENCY",
number-of-cited-references = "30",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "8",
unique-id = "Boroumand:2017:LEC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Gottscho:2017:MIM,
author = "Mark Gottscho and Mohammed Shoaib and Sriram Govindan
and Bikash Sharma and Di Wang and Puneet Gupta",
title = "Measuring the Impact of Memory Errors on Application
Performance",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "51--55",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2599513",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Memory reliability is a key factor in the design of
warehouse-scale computers. Prior work has focused on
the performance overheads of memory fault-tolerance
schemes when errors do not occur at all, and when
detected but uncorrectable errors occur, which result
in machine downtime and loss of availability. We focus
on a common third scenario, namely, situations when
hard but correctable faults exist in memory; these may
cause an ``avalanche'' of errors to occur on affected
hardware. We expose how the hardware/software
mechanisms for managing and reporting memory errors can
cause severe performance degradation in systems
suffering from hardware faults. We inject faults in
DRAM on a real cloud server and quantify the
single-machine performance degradation for both batch
and interactive workloads. We observe that for SPEC
CPU2006 benchmarks, memory errors can slow down average
execution time by up to 2.5x. For an interactive
web-search workload, average query latency degrades by
up to 2.3x for a light traffic load, and up to an
extreme 3746x under peak load. Our analysis of the
memory error-reporting stack reveals architecture,
firmware, and software opportunities to improve
performance consistency by mitigating the worst-case
behavior on faulty hardware.",
acknowledgement = ack-nhfb,
affiliation = "Gottscho, M (Reprint Author), Univ Calif Los Angeles,
Dept Elect Engn, Los Angeles, CA 90095 USA. Gottscho,
Mark; Gupta, Puneet, Univ Calif Los Angeles, Dept Elect
Engn, Los Angeles, CA 90095 USA. Shoaib, Mohammed;
Wang, Di, Microsoft Res, Redmond, WA 98052 USA.
Govindan, Sriram; Sharma, Bikash, Microsoft, Redmond,
WA 98052 USA.",
author-email = "mgottscho@ucla.edu shoaib@microsoft.com
srgovin@microsoft.com bsharma@microsoft.com
wangdi@microsoft.com puneet@ee.ucla.edu",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF Variability Expedition Grant
[CCF-1029030]",
funding-text = "This work was conducted jointly between Microsoft
Corporation and the NanoCAD Lab of the Electrical
Engineering Department at the University of California,
Los Angeles (UCLA). The authors thank Dr. Jie Liu of
Microsoft Research, and Dr. Badriddine Khessib and Dr.
Kushagra Vaid of Microsoft for supporting this work
while Mr. Gottscho was an intern at Microsoft Research
in 2015. Funding came partly from the NSF Variability
Expedition Grant No. CCF-1029030.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "application performance; availability; cloud;
Degradation; DRAM; dynamic random-access storage;
error-handling; fault tolerant computing; Hardware;
hardware/software interface; hardware/software
mechanisms; Instruction sets; interactive web-search
workload; Main memory; memory errors; memory
fault-tolerance schemes; memory reliability;
performance consistency; Random access memory;
random-access storage; RAS; reliability; Reliability;
servers; Servers; warehouse-scale computer
design",
keywords-plus = "VARIABILITY; RELIABILITY; SYSTEMS",
number-of-cited-references = "32",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Gottscho:2017:MIM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Adileh:2017:MPH,
author = "Almutaz Adileh and Stijn Eyerman and Aamer Jaleel and
Lieven Eeckhout",
title = "Mind The Power Holes: Sifting Operating Points in
Power-Limited Heterogeneous Multicores",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "56--59",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2616339",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Heterogeneous chip multicore processors (HCMPs)
equipped with multiple voltage-frequency (V-F)
operating points provide a wide spectrum of
power-performance tradeoff opportunities. This work
targets the performance of HCMPs under a power cap. We
show that for any performance optimization technique to
work under power constraints, the default set of V-F
operating points in HCMPs must first be filtered based
on the application's power and performance
characteristics. Attempting to find operating points of
maximum performance by naively walking the default set
of operating points leads the application to
inefficient operating points, which drain power without
significant performance benefit. We call these points
Power Holes (PH). Contrary to intuition, we show that
even using a power-performance curve of Pareto-optimal
operating points still degrades performance
significantly for the same reason. We propose
PH-Sifter, a fast and scalable technique that sifts the
default set of operating points and eliminates power
holes. We show significant performance improvement of
PH-Sifter compared to Pareto sifting for three use
cases: (i) maximizing performance for a single
application, (ii) maximizing system throughput for
multi-programmed workloads, and (iii) maximizing
performance of a system in which a fraction of the
power budget is reserved for a high-priority
application. Our results show performance improvements
of 13, 27, and 28 percent on average that reach up to
52, 91 percent, and 2.3x, respectively, for the three
use cases.",
acknowledgement = ack-nhfb,
affiliation = "Adileh, A (Reprint Author), Univ Ghent, B-9052 Ghent,
East Flanders, Belgium. Adileh, Almutaz; Eeckhout,
Lieven, Univ Ghent, B-9052 Ghent, East Flanders,
Belgium. Eyerman, Stijn, Intel Belgium, B-2550 Leuven,
Kontich, Belgium. Jaleel, Aamer, Nvidia Res, Boston, MA
01886 USA.",
author-email = "almutaz.adileh@ugent.be stijn.eyerman@elis.ugent.be
ajaleel@nvidia.com lieven.eeckhout@elis.ugent.be",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "European Research Council under the
European Community's Seventh Framework Programme
(FP7)/ERC grant [259295]",
funding-text = "We thank the anonymous reviewers for their thoughtful
feedback. This research is supported in part through
the European Research Council under the European
Community's Seventh Framework Programme
(FP7/2007-2013)/ERC grant agreement no. 259295. This
work was done while Stijn Eyerman was at Ghent
University.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "HCMP performance; heterogeneous chip multicore
processors; Heterogeneous multicores; high-priority
application; Indexes; Legged locomotion; Multicore
processing; multiple voltage-frequency operating
points; multiprocessing systems; multiprogramming;
optimal operating points; Optimization; Pareto
optimisation; Pareto-optimal operating points;
performance evaluation; performance maximization;
performance optimization; PH-Sifter; power aware
computing; Power Holes; power management; power-limited
processors; power-performance curve; power-performance
tradeoff opportunities; Program processors; Schedules;
system throughput maximization; Throughput; V-F
operating points",
keywords-plus = "PERFORMANCE; DVFS",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Adileh:2017:MPH",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Sasaki:2017:MPC,
author = "Hiroshi Sasaki and Alper Buyuktosunoglu and Augusto
Vega and Pradip Bose",
title = "Mitigating Power Contention: a Scheduling Based
Approach",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "60--63",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2572080",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Shared resource contention has been a major
performance issue for CMPs. In this paper, we tackle
the power contention problem in power-constrained CMPs
by considering and treating power as a first-class
shared resource. Power contention occurs when multiple
processes compete for power, and leads to degraded
system performance. In order to solve this problem, we
develop a shared resource contention-aware scheduling
algorithm that mitigates the contention for power and
the shared memory subsystem at the same time. The
proposed scheduler improves system performance by
balancing the shared resource usage among scheduling
groups. Evaluation results across a variety of
multiprogrammed workloads show performance improvements
over a state-of-the-art scheduling policy which only
considers memory subsystem contention.",
acknowledgement = ack-nhfb,
affiliation = "Sasaki, H (Reprint Author), Columbia Univ, Dept Comp
Sci, New York, NY 10027 USA. Sasaki, Hiroshi, Columbia
Univ, Dept Comp Sci, New York, NY 10027 USA.
Buyuktosunoglu, Alper; Vega, Augusto; Bose, Pradip, IBM
TJ Watson Res Ctr, New York, NY 10598 USA.",
author-email = "sasaki@cs.columbia.edu alperb@us.ibm.com
ajvega@us.ibm.com pbose@us.ibm.com",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "JSPS Postdoctoral Fellowships for Research
Abroad; Defense Advanced Research Projects Agency
(DARPA), Microsystems Technology Office (MTO)
[HR0011-13-C-0022]",
funding-text = "This work is sponsored, in part, by JSPS Postdoctoral
Fellowships for Research Abroad, and Defense Advanced
Research Projects Agency (DARPA), Microsystems
Technology Office (MTO), under contract number
HR0011-13-C-0022. The views expressed are those of the
authors and do not reflect the official policy or
position of the Department of Defense or the U.S.
Government.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; chip multiprocessors;
energy-efficient systems; first-class shared resource;
Memory management; memory subsystem contention;
multi-core processors; multiprogrammed workloads;
performance evaluation; power aware computing; power
capping; power constrained CMP; Power contention; power
contention problem; Power demand; process scheduling;
processor scheduling; Processor scheduling; Random
access memory; resource allocation; Scheduling;
scheduling-based approach; shared memory systems;
shared resource contention-aware scheduling algorithm;
System performance",
keywords-plus = "PERFORMANCE",
number-of-cited-references = "15",
oa = "Bronze",
research-areas = "Computer Science",
researcherid-numbers = "Sasaki, Hiroshi/N-8579-2019",
times-cited = "1",
unique-id = "Sasaki:2017:MPC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Marquez:2017:MCH,
author = "David Gonzalez Marquez and Adrian Cristal Kestelman
and Esteban Mocskos",
title = "{Mth}: Codesigned Hardware\slash Software Support for
Fine Grain Threads",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "64--67",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2606383",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Multi-core processors are ubiquitous in all market
segments from embedded to high performance computing,
but only a few applications can efficiently utilize them.
Existing parallel frameworks aim to support
thread-level parallelism in applications, but the
imposed overhead prevents their usage for small problem
instances. This work presents Micro-threads (Mth), a
hardware-software proposal focused on a shared thread
management model that enables the use of parallel resources
in applications with small chunks of parallel code or
small problem inputs through a combination of software
and hardware: delegation of resource control to the
application, an improved mechanism to store and fill the
processor's context, and an efficient synchronization
system. Four sample applications are used to test our
proposal: HSL filter (trivially parallel), FFT Radix2
(recursive algorithm), LU decomposition (barrier every
cycle) and Dantzig algorithm (graph based, matrix
manipulation). The results encourage the use of Mth and
could smooth the use of multiple cores for applications
that currently cannot take advantage of the
proliferation of the available parallel resources in
each chip.",
acknowledgement = ack-nhfb,
affiliation = "Marquez, DG (Reprint Author), Univ Buenos Aires, Fac
Ciencias Exactas \& Nat, Dept Comp Sci, C1428EGA,
RA-1053 Buenos Aires, DF, Argentina. Marquez, David
Gonzalez; Mocskos, Esteban, Univ Buenos Aires, Fac
Ciencias Exactas \& Nat, Dept Comp Sci, C1428EGA,
RA-1053 Buenos Aires, DF, Argentina. Mocskos, Esteban,
CSC CONICET, C1425FQD, RA-2390 Buenos Aires, DF,
Argentina. Kestelman, Adrian Cristal, CSIC, IIIA,
Barcelona Supercomp Ctr, ES-08034 Barcelona, Spain.
Kestelman, Adrian Cristal, Univ Politecn Cataluna, Dept
Comp Architecture, ES-08034 Barcelona, Spain.",
author-email = "dmarquez@dc.uba.ar adrian.cristal@bsc.es
emocskos@dc.uba.ar",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Universidad de Buenos Aires [UBACyT
20020130200096BA]; CONICET [PIP 11220110100379]",
funding-text = "This work was partially funded by grants from
Universidad de Buenos Aires (UBACyT 20020130200096BA)
and CONICET (PIP 11220110100379). The authors thank
specially Osman Unsal for reading this article with
fruitful criticism.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "barrier every cycle; codesigned hardware-software
support; Dantzig algorithm; digital arithmetic;
embedded processors; fast Fourier transforms; FFT
Radix2 algorithm; fine grain threads; graph based
algorithm; graph theory; hardware-software codesign;
high performance computing; HSL filter; LU
decomposition; matrix decomposition; matrix
manipulation; Message systems; microthreads; Mirrors;
Mth hardware/software support; multi-threading;
multicore processing; multicore processors;
multithreading; Parallel architectures; parallel
architectures; parallel code;
parallel frameworks; Parallel processing; parallel
programming; parallel resources; Program processors;
Proposals; recursive algorithm; Registers; resource
control; shared memory systems; shared thread
management model; Synchronization; synchronization
system; thread-level parallelism support; trivially
parallel filter",
keywords-plus = "PARALLELISM",
number-of-cited-references = "11",
ORCID-numbers = "Mocskos, Esteban/0000-0002-6473-7672",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Marquez:2017:MCH",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Morad:2017:ORO,
author = "Tomer Y. Morad and Gil Shomron and Mattan Erez and
Avinoam Kolodny and Uri C. Weiser",
title = "Optimizing Read-Once Data Flow in Big-Data
Applications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "68--71",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2520927",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Memory hierarchies in modern computing systems work
well for workloads that exhibit temporal data locality.
Data that is accessed frequently is brought closer to
the computing cores, allowing faster access times,
higher bandwidth, and reduced transmission energy. Many
applications that work on big data, however, read data
only once. When running these applications on modern
computing systems, data that is not reused is
nevertheless transmitted and copied into all memory
hierarchy levels, leading to energy and bandwidth
waste. In this paper we evaluate workloads dealing with
read-once data and measure their energy consumption. We
then modify the workloads so that data that is known to
be used only once is transferred directly from storage
into the CPU's last level cache, effectively bypassing
DRAM and avoiding keeping unnecessary copies of the
data. Our measurements on a real system show savings of
up to 5 Watts in server power and up to 3.9 percent
reduction in server energy when 160 GB of read-once
data bypasses DRAM.",
acknowledgement = ack-nhfb,
affiliation = "Morad, TY (Reprint Author), Cornell Tech, Jacobs
Technion Cornell Inst, 111 8th Ave, New York, NY 10011
USA. Morad, Tomer Y.; Shomron, Gil; Kolodny, Avinoam;
Weiser, Uri C., Technion Israel Inst Technol, Dept
Elect Engn, IL-32000 Haifa, Israel. Morad, Tomer Y.,
Cornell Tech, Jacobs Technion Cornell Inst, 111 8th
Ave, New York, NY 10011 USA. Erez, Mattan, Univ Texas
Austin, Dept Elect \& Comp Engn, 201 E 24th St, C0803,
POB 6-248, Austin, TX 78712 USA.",
author-email = "tomerm@tx.technion.ac.il gilsho@tx.technion.ac.il
mattan.erez@utexas.edu kolodny@ee.technion.ac.il
uri.weiser@ee.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Intel Collaborative Research Institute for
Computational Intelligence (ICRI-CI)",
funding-text = "This research was supported by the Intel Collaborative
Research Institute for Computational Intelligence
(ICRI-CI).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; bandwidth wastage; Big Data; Big-Data
applications; cache storage; computing cores; CPU
last-level cache; data access time; data flow
computing; DRAM; energy consumption measure; Energy
efficiency; Energy measurement; energy wastage; memory
architecture; memory hierarchy levels; Memory
management; Performance evaluation; Prefetching; Random
access memory; read-once data flow optimization;
reduced transmission energy; server energy reduction;
Servers; temporal data locality",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Morad:2017:ORO",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
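
The bypass path described above streams read-once data directly into the
last-level cache; a purely software approximation of the read-once idea is
simply not to keep page-cache copies of data that will not be reused. The
following is a minimal sketch (Unix-only Python, our illustration rather than
the authors' mechanism), using posix_fadvise to drop cached pages behind a
streaming read:

    import os

    def stream_read_once(path, chunk=1 << 20):
        # Read a file once, dropping page-cache copies behind us so that
        # read-once data does not linger in memory (software analogue only;
        # it does not bypass DRAM as the hardware-assisted path does).
        fd = os.open(path, os.O_RDONLY)
        try:
            offset = 0
            while True:
                buf = os.pread(fd, chunk, offset)
                if not buf:
                    break
                # ... process buf here ...
                os.posix_fadvise(fd, offset, len(buf), os.POSIX_FADV_DONTNEED)
                offset += len(buf)
        finally:
            os.close(fd)
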
@Article{Yasoubi:2017:PEA,
author = "Ali Yasoubi and Reza Hojabr and Mehdi Modarressi",
title = "Power-Efficient Accelerator Design for Neural Networks
Using Computation Reuse",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "72--75",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2521654",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Applications of neural networks in various fields of
research and technology have expanded widely in recent
years. In particular, applications with inherent
tolerance to accuracy loss, such as signal processing
and multimedia applications, are highly suited to the
approximation property of neural networks. This
approximation property has been exploited in many
existing neural network accelerators to trade off
accuracy for power-efficiency and speed. In addition to
the power saving obtained by approximation, we observed
that a considerable number of arithmetic operations in
neural networks are repetitive and can be eliminated to
further decrease power consumption. Given this
observation, we propose CORN, a COmputation Reuse-aware
Neural network accelerator that allows neurons to share
their computation results, effectively eliminating the
power usage of redundant computations. We will show
that CORN lowers power consumption by 26 percent on
average over low-power neural network accelerators.",
acknowledgement = ack-nhfb,
affiliation = "Yasoubi, A (Reprint Author), Univ Tehran, Dept Elect
\& Comp Engn, Coll Engn, Tehran, Iran. Yasoubi, Ali;
Hojabr, Reza; Modarressi, Mehdi, Univ Tehran, Dept
Elect \& Comp Engn, Coll Engn, Tehran, Iran.",
author-email = "a.yosoubi@ut.ac.ir r.hojabr@ut.ac.ir
modarressi@uti.ac.ir",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "approximation; arithmetic operations; Biological
neural networks; Buffer storage; computation reuse;
computation reuse-aware neural network accelerator;
Computer architecture; CORN; energy conservation;
hardware accelerator; low-power neural network
accelerators; neural nets; Neural network; Neurons;
power aware computing; Power demand; power usage
elimination; power-efficiency; power-efficient
accelerator design; Redundancy; redundant
computations",
keywords-plus = "RECOGNITION",
number-of-cited-references = "14",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Yasoubi:2017:PEA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
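
As a rough illustration of the computation-reuse observation behind CORN (a
software toy, not the accelerator's datapath; the function name reuse_mac is
ours), repeated weight/input pairs can be served from a small product cache
instead of being re-multiplied:

    # Illustrative only: computation reuse for repeated multiplications.
    def reuse_mac(weights, inputs):
        product_cache = {}          # (weight, input) -> product
        acc, reused = 0.0, 0
        for w, x in zip(weights, inputs):
            key = (w, x)
            if key in product_cache:
                reused += 1         # redundant multiplication eliminated
            else:
                product_cache[key] = w * x
            acc += product_cache[key]
        return acc, reused

    acc, reused = reuse_mac([0.5, 0.5, -1.0, 0.5], [2.0, 2.0, 3.0, 2.0])
    assert acc == 0.5*2.0 + 0.5*2.0 + (-1.0)*3.0 + 0.5*2.0
    print(acc, reused)              # 0.0 2: two multiplications were reused
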
@Article{Son:2017:SAS,
author = "Young Hoon Son and Hyunyoon Cho and Yuhwan Ro and Jae
W. Lee and Jung Ho Ahn",
title = "{SALAD}: Achieving Symmetric Access Latency with
Asymmetric {DRAM} Architecture",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "76--79",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2525760",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Memory access latency has significant impact on
application performance. Unfortunately, the random
access latency of DRAM has been scaling relatively
slowly, and often directly affects the critical path of
execution, especially for applications with
insufficient locality or memory-level parallelism. The
existing low-latency DRAM organizations either incur
significant area overhead or burden the software stack
with non-uniform access latency. This paper proposes
SALAD, a new DRAM device architecture that provides
symmetric access latency with asymmetric DRAM bank
organizations. Since local banks have lower data
transfer time due to their proximity to the I/O pads,
SALAD applies high aspect-ratio (i.e., low-latency)
mats only to remote banks to offset the difference in
data transfer time, thus providing uniformly low access
time (tAC) over the whole device. Our evaluation
demonstrates that SALAD improves the IPC by 13 percent
(10 percent) without any software modifications, while
incurring only 6 percent (3 percent) area overhead.",
acknowledgement = ack-nhfb,
affiliation = "Son, YH (Reprint Author), Seoul Natl Univ, Seoul,
South Korea. Son, Young Hoon; Cho, Hyunyoon; Ro,
Yuhwan; Ahn, Jung Ho, Seoul Natl Univ, Seoul, South
Korea. Lee, Jae W., Sungkyunkwan Univ, Seoul, South
Korea.",
author-email = "yhson96@snu.ac.kr sumk40@snu.ac.kr yuhwanro@snu.ac.kr
jaewlee@skku.edu gajh@snu.ac.kr",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Research Foundation of Korea -
Korea government [NRF-2015M3C4A7065647]; ICT R\&D
program of MSIP/IITP [KI001810041244]",
funding-text = "This work was partially supported by the National
Research Foundation of Korea grant funded by the Korea
government (NRF-2015M3C4A7065647) and ICT R\&D program
of MSIP/IITP (KI001810041244).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "asymmetric bank organizations; asymmetric DRAM bank
organizations; Data transfer; data transfer time; DRAM;
DRAM chips; DRAM device architecture; I/O pads; memory
architecture; Memory management; microarchitecture;
Organizations; Parallel processing; Random access
memory; SALAD; Software; symmetric access latency with
asymmetric DRAM; uniformly low access time",
number-of-cited-references = "20",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Son:2017:SAS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Judd:2017:SBS,
author = "Patrick Judd and Jorge Albericio and Andreas
Moshovos",
title = "{Stripes}: Bit-Serial Deep Neural Network Computing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "80--83",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2597140",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The numerical representation precision required by the
computations performed by Deep Neural Networks (DNNs)
varies across networks and between layers of the same
network. This observation motivates a precision-based
approach to acceleration which takes into account both
the computational structure and the required numerical
precision representation. This work presents Stripes
(STR), a hardware accelerator that uses bit-serial
computations to improve energy efficiency and
performance. Experimental measurements over a set of
state-of-the-art DNNs for image classification show
that STR improves performance over a state-of-the-art
accelerator from 1.35x to 5.33x and by 2.24x on
average. STR's area and power overhead are estimated at
5 percent and 12 percent respectively. STR is 2.00x
more energy efficient than the baseline.",
acknowledgement = ack-nhfb,
affiliation = "Judd, P (Reprint Author), Univ Toronto, Edward S
Rogers Sr Dept Elect \& Comp Engn, Toronto, ON M5S 3H7,
Canada. Judd, Patrick; Albericio, Jorge; Moshovos,
Andreas, Univ Toronto, Edward S Rogers Sr Dept Elect \&
Comp Engn, Toronto, ON M5S 3H7, Canada.",
author-email = "patrick.judd@mail.utoronto.ca jorge@ece.utoronto.ca
moshovos@eecg.toronto.edu",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Artificial neural networks; bit-serial computations;
bit-serial deep neural network computing; convolution;
deep learning; deep neural networks; energy efficiency;
Graphics processing units; Hardware acceleration; image
classification; learning (artificial intelligence);
neural nets; Neurons; Nickel; numerical representation;
Parallel processing; precision-based approach; serial
computing; STR; Stripes; Three-dimensional displays",
number-of-cited-references = "19",
research-areas = "Computer Science",
times-cited = "5",
unique-id = "Judd:2017:SBS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
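
The bit-serial arithmetic that Stripes exploits can be sketched in a few lines
(an illustrative identity, not the STR hardware, assuming unsigned fixed-point
activations): process one activation bit-plane per step and shift-accumulate
the partial sums.

    # Bit-serial inner product: one activation bit-plane per step.
    def bit_serial_dot(weights, acts, bits=8):
        acc = 0
        for b in range(bits):
            plane = sum(w * ((a >> b) & 1) for w, a in zip(weights, acts))
            acc += plane << b       # weight the partial sum by 2^b
        return acc

    w = [3, -2, 5]
    a = [17, 4, 200]                # unsigned 8-bit activations
    assert bit_serial_dot(w, a) == sum(x * y for x, y in zip(w, a))
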
@Article{Ravi:2017:TSM,
author = "Gokul Subramanian Ravi and Mikko Lipasti",
title = "Timing Speculation in Multi-Cycle Data Paths",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "1",
pages = "84--87",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2580501",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Modern processors set timing margins conservatively at
design time to support extreme variations in workload
and environment, in order to operate reliably and
produce expected outputs. Unfortunately, the
conservative guard bands set to achieve this
reliability are detrimental to processor performance
and energy efficiency. In this paper, we propose the
use of processors with internal transparent pipelines,
which allow data to flow between stages without
latching, to maximize timing speculation efficiency as
they are inherently suited to slack conservation. We
design a synchronous tracking mechanism which runs in
parallel with the multi-cycle data path to estimate the
accumulated slack across instructions/pipeline stages
and then appropriately clock synchronous boundaries
early to minimize wasted slack and achieve maximum
clock cycle savings. Preliminary evaluations atop the
CRIB processor show performance improvements of greater
than 10\% on average and as high as 30\% for an assumed
25\% slack per clock cycle.",
acknowledgement = ack-nhfb,
affiliation = "Ravi, GS (Reprint Author), Univ Wisconsin, Dept Elect
\& Comp Engn, 1415 Johnson Dr, Madison, WI 53706 USA.
Ravi, Gokul Subramanian; Lipasti, Mikko, Univ
Wisconsin, Dept Elect \& Comp Engn, 1415 Johnson Dr,
Madison, WI 53706 USA.",
author-email = "gravi@wisc.edu mikko@engr.wisc.edu",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "clock cycle savings; clocks; Clocks; CRIB; CRIB
processor; internal transparent pipelines;
microprocessor chips; multi-cycle datapath; multicycle
data paths; parallel processing; parallel synchronous
tracking mechanism; pipeline processing; Pipelines;
Program processors; Proposals; Registers; Reliability;
slack; Timing; Timing speculation; timing speculation",
number-of-cited-references = "8",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Ravi:2017:TSM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Khan:2017:CMC,
author = "Samira Khan and Chris Wilkerson and Donghyuk Lee and
Alaa R. Alameldeen and Onur Mutlu",
title = "A Case for Memory Content-Based Detection and
Mitigation of Data-Dependent Failures in {DRAM}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "88--93",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2624298",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "DRAM cells in close proximity can fail depending on
the data content in neighboring cells. These failures
are called data-dependent failures. Detecting and
mitigating these failures online while the system is
running in the field enables optimizations that improve
reliability, latency, and energy efficiency of the
system. All these optimizations depend on accurately
detecting every possible data-dependent failure that
could occur with any content in DRAM. Unfortunately,
detecting all data-dependent failures requires the
knowledge of DRAM internals specific to each DRAM chip.
As internal DRAM architecture is not exposed to the
system, detecting data-dependent failures at the
system-level is a major challenge. Our goal in this
work is to decouple the detection and mitigation of
data-dependent failures from physical DRAM organization
such that it is possible to detect failures without
knowledge of DRAM internals. To this end, we propose
MEMCON, a memory content-based detection and mitigation
mechanism for data-dependent failures in DRAM. MEMCON
does not detect every possible data-dependent failure.
Instead, it detects and mitigates failures that occur
with the current content in memory while the programs
are running in the system. Using experimental data from
real machines, we demonstrate that MEMCON is an
effective and low-overhead system-level detection and
mitigation technique for data-dependent failures in
DRAM.",
acknowledgement = ack-nhfb,
affiliation = "Khan, S (Reprint Author), Univ Virginia,
Charlottesville, VA 22903 USA. Khan, Samira, Univ
Virginia, Charlottesville, VA 22903 USA. Wilkerson,
Chris; Alameldeen, Alaa R., Intel Labs, Santa Clara, CA
95054 USA. Lee, Donghyuk; Mutlu, Onur, Carnegie Mellon
Univ, Pittsburgh, PA 15213 USA. Mutlu, Onur, ETH,
CH-8092 Zurich, Switzerland.",
author-email = "samirakhan@virginia.edu chris.wilkerson@intel.com
donghyu1@cmu.edu alaa.r.alameldeen@intel.com
onur@cmu.edu",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "ISTC-CC, an US National Science Foundation
[CCF-0953246]; US National Science Foundation
[CCF-1212962, CNS-1320531, CCF-1566483]",
funding-text = "We thank anonymous reviewers and SAFARI group members
for feedback. We acknowledge the support of Google,
Intel, Nvidia, Seagate, and Samsung. This research was
supported in part by the ISTC-CC, a US National
Science Foundation CAREER Award (CCF-0953246), and US
National Science Foundation grants (CCF-1212962,
CNS-1320531, and CCF-1566483).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Content management; data content; data dependent
failure; data-dependent failures; DRAM; DRAM cells;
DRAM chips; DRAM internals; DRAM, data dependent
failure, system-level testing; failure analysis;
Failure analysis; integrated circuit reliability;
Interference; low-overhead system-level detection
technique; low-overhead system-level migration
technique; MEMCON; memory content-based detection;
memory content-based migration; neighboring cells;
optimisation; physical DRAM organization; System-level
design; system-level testing; Testing",
keywords-plus = "NOISE",
number-of-cited-references = "42",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Khan:2017:CMC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Mittal:2017:ARD,
author = "Sparsh Mittal and Jeffrey S. Vetter and Lei Jiang",
title = "Addressing Read-Disturbance Issue in {STT--RAM} by
Data Compression and Selective Duplication",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "94--98",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2645207",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In deep sub-micron region, spin transfer torque RAM
(STT-RAM) shows read-disturbance error (RDE) which
presents a crucial reliability challenge. We present
SHIELD, a technique to mitigate RDE in STT-RAM last
level caches (LLCs). SHIELD uses data compression to
reduce cache-write traffic and restore requirement.
Also, SHIELD keeps two copies of data blocks that compress
to less than half the block size; since several LLC
blocks are accessed only once, this approach avoids
many restore operations. SHIELD consumes less
energy than two previous RDE-mitigation techniques,
namely high-current restore required read (HCRR, also
called restore-after-read) and low-current long latency
read (LCLL), and even an ideal RDE-free STT-RAM cache.",
acknowledgement = ack-nhfb,
affiliation = "Mittal, S (Reprint Author), IIT Hyderabad, Sangareddy
502285, Telangana, India. Mittal, Sparsh, IIT
Hyderabad, Sangareddy 502285, Telangana, India. Vetter,
Jeffrey S., Oak Ridge Natl Lab, Oak Ridge, TN 37830
USA. Jiang, Lei, Indiana Univ, Bloomington, IN 47405
USA.",
author-email = "sparsh0mittal@gmail.com vetter@ornl.gov
jiang60@iu.edu",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "U.S. Department of Energy, Office of
Science, Advanced Scientific Computing Research",
funding-text = "Support for this work was provided by the U.S.
Department of Energy, Office of Science, Advanced
Scientific Computing Research.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache storage; data blocks; data compression; Data
compression; data compression; deep sub-micron region;
duplication; Encoding; Error analysis; Error correction
codes; HCRR; ideal RDE-free STT-RAM cache; integrated
circuit reliability; last level cache; last level
caches; LCLL; LLC; low-current long latency read;
Magnetic tunneling; Non-volatile memory; Nonvolatile
memory; Random access memory; random-access storage;
read disturbance error; read-disturbance error;
restore-after-read; selective duplication; SHIELD; spin
transfer torque RAM; STT-RAM; transfer torque RAM",
number-of-cited-references = "14",
ORCID-numbers = "Mittal, Sparsh/0000-0002-2908-993X",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Mittal:2017:ARD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
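
A toy model of SHIELD's compress-and-duplicate decision (assumptions: a
64-byte block and zlib as a stand-in compressor; the paper's compression
scheme and restore bookkeeping are not modeled):

    import zlib

    BLOCK = 64   # cache block size in bytes (assumption for this sketch)

    def shield_layout(block: bytes):
        # If a block compresses to at most half its size, keep two copies in
        # one physical block so a read-disturbed copy can be restored locally.
        comp = zlib.compress(block)
        if len(comp) * 2 <= BLOCK:
            return ("duplicated", comp + comp)
        return ("single", block)

    print(shield_layout(bytes(64)))           # all-zero block -> duplicated
    print(shield_layout(bytes(range(64))))    # less compressible block typically stays single
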
@Article{Bakhshalipour:2017:ETD,
author = "Mohammad Bakhshalipour and Pejman Lotfi-Kamran and
Hamid Sarbazi-Azad",
title = "An Efficient Temporal Data Prefetcher for {L1}
Caches",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "99--102",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2654347",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Server workloads frequently encounter L1-D cache
misses, and hence, lose significant performance
potential. One way to reduce the number of L1-D misses
or their effect is data prefetching. As L1-D access
sequences have high temporal correlations, temporal
prefetching techniques are promising for L1 caches.
State-of-the-art temporal prefetching techniques are
effective at reducing the number of L1-D misses, but we
observe that there is a significant gap between what
they offer and the opportunity. This work aims to
improve the effectiveness of temporal prefetching
techniques. To overcome the deficiencies of existing
temporal prefetchers, we introduce Domino prefetching.
Domino prefetcher is a temporal prefetching technique
that looks up the history to find the last occurrence
of the last one or two L1-D miss addresses for
prefetching. We show that Domino prefetcher captures
more than 87 percent of the temporal opportunity at
L1-D. Through evaluation of a 16-core processor on a
set of server workloads, we show that Domino prefetcher
improves system performance by 26 percent (up to 56
percent).",
acknowledgement = ack-nhfb,
affiliation = "Bakhshalipour, M (Reprint Author), Sharif Univ
Technol, Dept Comp Engn, Tehran 1458889694, Iran.
Bakhshalipour, Mohammad; Sarbazi-Azad, Hamid, Sharif
Univ Technol, Dept Comp Engn, Tehran 1458889694, Iran.
Sarbazi-Azad, Hamid, Inst Res Fundamental Sci IPM, Sch
Comp Sci, Tehran 1956836681, Iran.",
author-email = "bakhshalipour@ce.sharif.edu plotfi@ipm.ir
azad@ipm.ir",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache storage; Correlation; data prefetching; Domino
prefetcher captures; efficient temporal data
prefetcher; high temporal correlations; L1-D access
sequences; L1-D cache misses; L1-D miss addresses; L1-D
misses; multiprocessing systems; Prefetching; Server
workloads; Servers; storage management; Streaming
media; temporal correlation; temporal opportunity;
temporal prefetching technique; Web search",
number-of-cited-references = "10",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Bakhshalipour:2017:ETD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
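
The lookup structure Domino relies on can be approximated in software as a
table indexed by the last one or two miss addresses (a simplified sketch; the
class name DominoLikePrefetcher and the unbounded tables are our assumptions,
not the paper's hardware budget):

    class DominoLikePrefetcher:
        def __init__(self):
            self.next_after_pair = {}   # (a, b) -> address that followed last time
            self.next_after_one = {}    # a -> address that followed last time
            self.history = []

        def on_miss(self, addr):
            h = self.history
            if len(h) >= 2:
                self.next_after_pair[(h[-2], h[-1])] = addr
            if h:
                self.next_after_one[h[-1]] = addr
            h.append(addr)
            # predict the next miss address for prefetching
            if len(h) >= 2 and (h[-2], h[-1]) in self.next_after_pair:
                return self.next_after_pair[(h[-2], h[-1])]
            return self.next_after_one.get(addr)

    p = DominoLikePrefetcher()
    for a in [1, 2, 3, 1, 2]:
        pred = p.on_miss(a)
    print(pred)   # 3: the pair (1, 2) was last followed by 3
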
@Article{Martinez:2017:SII,
author = "Jorge A. Mart{\'\i}nez and Juan Antonio Maestro and
Pedro Reviriego",
title = "A Scheme to Improve the Intrinsic Error Detection of
the Instruction Set Architecture",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "103--106",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2623628",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The Instruction Set Architecture (ISA) determines the
effect that a soft error on an instruction can have on
the processor. Previous works have shown that the ISA
has some intrinsic capability of detecting errors, for
example, errors that change a valid instruction into an
invalid instruction encoding or into an instruction
that causes an exception. The percentage of detectable
errors varies widely for each bit in the ISA. For
example, errors on bits that are used for immediate or
register values are unlikely to be detected while those
that are used for the opcode are more likely to lead to
an exception. In this paper, this is exploited by
introducing a simple encoding of the instructions that
does not require additional bits. The idea is that the
decoding propagates the error so that it affects the
most sensitive bit of the ISA and therefore it is more
likely to be detected. As no additional bits are
required, no changes or overheads are needed in the
memory. The proposed scheme is useful when the memory
is not protected with parity or Error Correction Codes.
The only cost of implementing the technique is simple
encoder and decoder circuits that are similar to a
parity computation. This technique is applicable to any
ISA, no matter the length of the opcodes or their
location in the instruction encoding. The effectiveness
of the proposed scheme has been evaluated on the ARM
Cortex M0 ISA resulting in an increase in the error
detection capability of up to 1.64x.",
acknowledgement = ack-nhfb,
affiliation = "Martinez, JA (Reprint Author), Univ Antonio Nebrija, C
Pirineos 55, Madrid 28040, Spain. Martinez, Jorge A.;
Antonio Maestro, Juan; Reviriego, Pedro, Univ Antonio
Nebrija, C Pirineos 55, Madrid 28040, Spain.",
author-email = "jmartine@nebrija.es jmaestro@nebrija.es
previrie@nebrija.es",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ARM Cortex M0 ISA; Circuit faults; Computer
architecture; decoder circuits; detectable errors;
detecting errors; Encoding; Error analysis; error
correction codes; Error Correction Codes; error
detection; error detection capability; instruction set
architecture; Instruction sets; instruction sets;
intrinsic capability; intrinsic error detection;
invalid instruction encoding; microprocessor chips;
simple encoder; simple encoding; Soft error; soft
error",
number-of-cited-references = "10",
ORCID-numbers = "Maestro, Juan Antonio/0000-0001-7133-9026",
research-areas = "Computer Science",
researcherid-numbers = "Maestro, Juan Antonio/L-6091-2014",
times-cited = "3",
unique-id = "Martinez:2017:SII",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
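
A toy version of the error-propagating encoding idea (our own simplification
with an assumed 4-bit opcode / 12-bit operand layout, not the exact scheme
evaluated on the ARM Cortex M0 ISA): the stored opcode is XORed with the
parity of the operand bits, so a single-bit flip anywhere in the operands also
perturbs the decoded opcode, where it is more likely to raise an exception.

    def parity(x):
        return bin(x).count("1") & 1

    def encode(opcode, operands):          # 4-bit opcode, 12-bit operands (assumed layout)
        return ((opcode ^ parity(operands)) << 12) | operands

    def decode(word):
        operands = word & 0xFFF
        opcode = (word >> 12) ^ parity(operands)
        return opcode, operands

    word = encode(0b1010, 0x3C5)
    assert decode(word) == (0b1010, 0x3C5)  # error-free round trip

    flipped = word ^ (1 << 7)               # soft error in an operand bit
    opcode, _ = decode(flipped)
    print(opcode != 0b1010)                 # True: the error now reaches the opcode
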
@Article{Wang:2017:DAS,
author = "Rujia Wang and Sparsh Mittal and Youtao Zhang and Jun
Yang",
title = "{Decongest}: Accelerating Super-Dense {PCM} Under
Write Disturbance by Hot Page Remapping",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "107--110",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2675883",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "At small feature sizes, phase change memory (PCM)
shows write disturbance (WD) errors (WDEs), and this issue
can eclipse the density and energy-efficiency advantages
of PCM. We propose `Decongest', a technique to address
WD errors in main memory designed with super-dense
(4F$^2$ cell size) PCM. Decongest works by identifying
and remapping write-intensive hot pages to a WD-free
spare area, which avoids WD to nearby pages due to
writing these hot pages, and WD to these hot pages from
writing nearby pages. Compared to a WD-affected
super-dense PCM baseline, Decongest improves the
performance by 14.0 percent, and saves 21.8 percent
energy.",
acknowledgement = ack-nhfb,
affiliation = "Wang, RJ (Reprint Author), Univ Pittsburgh,
Pittsburgh, PA 15260 USA. Wang, Rujia; Zhang, Youtao;
Yang, Jun, Univ Pittsburgh, Pittsburgh, PA 15260 USA.
Mittal, Sparsh, IIT Hyderabad, Kandi 502285, Telangana,
India.",
author-email = "rujia.w@pitt.edu sparsh0mittal@gmail.com
youtao@pitt.edu juy9@pitt.edu",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "US NSF CCF [1617071]; IIT, Hyderabad,
India",
funding-text = "This work is partially supported by US NSF
CCF\#1617071 and a seed-grant from IIT, Hyderabad,
India.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; Diseases; Energy management;
energy saving; main memory; Microprocessors; page
remapping; Phase change materials; Phase change memory;
Radiation detectors; reliability; write disturbance",
number-of-cited-references = "13",
ORCID-numbers = "Mittal, Sparsh/0000-0002-2908-993X",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Wang:2017:DAS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
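
The hot-page remapping policy can be sketched as a write counter plus a small
spare-area map (the threshold, spare-area size, and class name are
illustrative assumptions, not the paper's parameters):

    from collections import Counter

    class DecongestLikeRemapper:
        # Pages whose write count crosses a threshold are moved to a WD-free
        # spare area so their writes no longer disturb dense neighbours.
        def __init__(self, threshold=1000, spare_slots=16):
            self.writes = Counter()
            self.remapped = {}              # page -> spare slot
            self.threshold = threshold
            self.spare_slots = spare_slots

        def on_write(self, page):
            self.writes[page] += 1
            if (page not in self.remapped
                    and self.writes[page] >= self.threshold
                    and len(self.remapped) < self.spare_slots):
                self.remapped[page] = len(self.remapped)   # allocate next spare slot
            return self.remapped.get(page)                 # None: still in dense PCM

    r = DecongestLikeRemapper(threshold=3)
    for _ in range(3):
        slot = r.on_write(0x42)
    print(slot)   # 0: page 0x42 became hot and now lives in the spare area
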
@Article{Tanimoto:2017:EDG,
author = "Teruo Tanimoto and Takatsugu Ono and Koji Inoue and
Hiroshi Sasaki",
title = "Enhanced Dependence Graph Model for Critical Path
Analysis on Modern Out-of-Order Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "111--114",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2684813",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The dependence graph model of out-of-order (OoO)
instruction execution is a powerful representation used
for critical path analysis. However, most, if not
all, of the previous models are out-of-date and lack
enough detail to model modern OoO processors, or are
too specific and complicated, which limits their
generality and applicability. In this paper, we propose
an enhanced dependence graph model which remains simple
but greatly improves the accuracy over prior models.
The evaluation results using the gem5 simulator show
that the proposed enhanced model achieves CPI error of
2.1 percent which is a 90.3 percent improvement against
the state-of-the-art model.",
acknowledgement = ack-nhfb,
affiliation = "Tanimoto, T (Reprint Author), Kyushu Univ, Fukuoka
8190395, Japan. Tanimoto, Teruo; Ono, Takatsugu; Inoue,
Koji, Kyushu Univ, Fukuoka 8190395, Japan. Sasaki,
Hiroshi, Columbia Univ, New York, NY 10027 USA.",
author-email = "teruo.tanimoto@cpc.ait.kyushu-u.ac.jp
takatsugu.ono@cpc.ait.kyushu-u.ac.jp
inoue@ait.kyushu-u.ac.jp sasaki@cs.columbia.edu",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "CREST, JST",
funding-text = "This work was supported in part by CREST, JST. We
would like to express our thanks to RIIT of Kyushu
University for providing us the resource to conduct the
experiments in this paper.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Analytical models; Benchmark testing; computer
architecture; critical path analysis; Delays;
Dependence graph model; enhanced dependence graph
model; graph theory; Hidden Markov models;
Microarchitecture; modern OoO processors; out-of-order
instruction execution; out-of-order processors;
parallel architectures; Path planning; pipeline
processing; Program processors",
number-of-cited-references = "14",
research-areas = "Computer Science",
researcherid-numbers = "Sasaki, Hiroshi/N-8579-2019",
times-cited = "0",
unique-id = "Tanimoto:2017:EDG",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
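
Once a dependence graph is built, the critical path itself is the standard
longest path in a DAG; the sketch below shows only that generic step (the
paper's contribution is the enhanced graph model, which is not reproduced
here; node names and latencies are made up):

    def critical_path(nodes, edges):
        # nodes: node ids in topological order.
        # edges: dict node -> list of (successor, latency).
        dist = {n: 0 for n in nodes}
        for n in nodes:                      # relax edges in topological order
            for succ, lat in edges.get(n, []):
                dist[succ] = max(dist[succ], dist[n] + lat)
        return max(dist.values())

    # fetch -> dispatch -> execute -> commit for two dependent instructions
    edges = {"F1": [("D1", 1)], "D1": [("E1", 1)], "E1": [("C1", 3), ("E2", 3)],
             "F2": [("D2", 1)], "D2": [("E2", 1)], "E2": [("C2", 2)]}
    nodes = ["F1", "D1", "E1", "C1", "F2", "D2", "E2", "C2"]
    print(critical_path(nodes, edges))       # 7 cycles along F1-D1-E1-E2-C2
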
@Article{Lee:2017:FFE,
author = "Junghee Lee and Kalidas Ganesh and Hyuk-Jun Lee and
Youngjae Kim",
title = "{FESSD}: a Fast Encrypted {SSD} Employing On-Chip
Access-Control Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "115--118",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2667639",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Cryptography is one of the most popular methods for
protecting data stored in storage devices such as
solid-state drives (SSDs). To maintain integrity of
data, one popular technique is to encrypt all
incoming data before they are stored; however, the
encryption overhead of this approach is non-negligible
and can increase I/O service time. In
order to mitigate the negative performance impact
caused by the data encryption, a write buffer can be
used to hide the long latency by encryption. Using the
write buffer, writes of incoming unencrypted data can
complete as soon as the data are written to the
buffer; the data are then encrypted and synchronized with
flash memory. However, if the write buffer itself is
not encrypted, unencrypted secret data might leak
through this insecure write buffer. On the other hand,
if the entire write buffer is fully encrypted, it
incurs significant performance overhead. To address
this problem, we propose an on-chip access control
memory (ACM) and present a fast encrypted SSD, called
FESSD, that implements a secure write buffering
mechanism using the ACM. The ACM does not require a
memory-level full encryption mechanism, thus not only
solving the unencrypted data leaking problem, but also
offering relatively fast I/O service. Our simulation
results show that the I/O response time of FESSD can be
improved by up to 56 percent over a baseline where
encrypted data are stored in the normal write buffer.",
acknowledgement = ack-nhfb,
affiliation = "Lee, J (Reprint Author), Univ Texas San Antonio, San
Antonio, TX 78249 USA. Lee, Junghee; Ganesh, Kalidas,
Univ Texas San Antonio, San Antonio, TX 78249 USA. Lee,
Hyuk-Jun; Kim, Youngjae, Sogang Univ, Seoul 121742,
South Korea.",
author-email = "junghee.lee@my.utsa.edu dyk567@my.utsa.edu
hyukjunl@sogang.ac.kr youkim@sogang.ac.kr",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Research Foundation of Korea (NRF)
--- Korea Government (MISP) [2015R1C1A1A0152105]",
funding-text = "This work was supported by the National Research
Foundation of Korea (NRF) grant funded by the Korea
Government (MISP) (No. 2015R1C1A1A0152105). This
research also used resources of The University of Texas
at San Antonio, San Antonio, TX. Youngjae Kim is the
corresponding author.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ACM; authorisation; cryptography; data encryption;
encrypted data; encryption; Encryption;
encryption overhead; fast encrypted SSD; FeSSD; flash
memories; flash memory; Hardware; negative performance
impact; Nonvolatile memory; normal write buffer;
on-chip access control memory; on-chip access-control
memory; on-chip memory; Registers; security;
Solid-state drive (SSD); solid-state drives; storage
devices; storage management; System-on-chip;
unencrypted data leaking problem; unencrypted secret
data",
keywords-plus = "SECURITY",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Lee:2017:FFE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Badawy:2017:GLO,
author = "Abdel-Hameed A. Badawy and Donald Yeung",
title = "Guiding Locality Optimizations for Graph Computations
via Reuse Distance Analysis",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "119--122",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2695178",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This work addresses the problem of optimizing
graph-based programs for multicore processors. We use
three graph benchmarks and three input data sets to
characterize the importance of properly partitioning
graphs among cores at multiple levels of the cache
hierarchy. We also exhaustively explore a large design
space comprised of different parallelization schemes
and graph partitionings via detailed simulation to show
how much gain we can obtain over a baseline legacy
scheme that partitions for the L1 cache only. Our
results demonstrate the legacy approach is not the best
choice, and that our proposed parallelization /
locality techniques can perform better (by up to 20
percent). We then use a performance prediction model
based on multicore reuse distance (RD) profiles to rank
order the different parallelization / locality schemes
in the design space. We compare the best configuration
as predicted by our model against the actual best
identified by our exhaustive simulations. For one
benchmark and data input, we show our model can achieve
79.5 percent of the performance gain achieved by the
actual best. Across all benchmarks and data inputs, our
model achieves 48 percent of the maximum performance
gain. Our work demonstrates a new use case for
multicore RD profiles --- i.e., as a tool for helping
program developers and compilers to optimize
graph-based programs.",
acknowledgement = ack-nhfb,
affiliation = "Badawy, AHA (Reprint Author), New Mexico State Univ,
Klipsch Sch Elect \& Comp Engn, Las Cruces, NM 88003
USA. Badawy, Abdel-Hameed A., New Mexico State Univ,
Klipsch Sch Elect \& Comp Engn, Las Cruces, NM 88003
USA. Yeung, Donald, Univ Maryland, Dept Elect \& Comp
Engn, College Pk, MD 20742 USA.",
author-email = "badawy@nmsu.edu yeung@umd.edu",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "baseline legacy scheme; Benchmark testing; cache
hierarchy; cache storage; Computational modeling; graph
benchmarks; graph computations; graph partitionings;
graph theory; legacy approach; locality optimization;
memory system; Multicore processing; multicore
processors; multicore RD profiles; multicore reuse
distance profiles; multiprocessing systems;
Optimization; partitioning; performance prediction
model; prediction; Predictive models; profiling;
program developers; Program processors; reuse distance;
reuse distance analysis; Runtime",
keywords-plus = "BIOMOLECULAR SIMULATION",
number-of-cited-references = "11",
ORCID-numbers = "Badawy, Abdel-Hameed/0000-0001-8027-1449",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Badawy:2017:GLO",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
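
The per-thread building block behind reuse-distance profiles is the classic
LRU stack distance, sketched below (the multicore RD extension and the
ranking model from the paper are not modeled):

    def reuse_distances(trace):
        stack, dists = [], []             # stack[-1] is the most recently used address
        for addr in trace:
            if addr in stack:
                depth = len(stack) - 1 - stack.index(addr)
                dists.append(depth)       # distinct addresses touched since last use
                stack.remove(addr)
            else:
                dists.append(float("inf"))  # cold (first) access
            stack.append(addr)
        return dists

    print(reuse_distances(["a", "b", "c", "a", "a"]))   # [inf, inf, inf, 2, 0]
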
@Article{Zha:2017:IFM,
author = "Yue Zha and Jing Li",
title = "{IMEC}: a Fully Morphable In-Memory Computing Fabric
Enabled by Resistive Crossbar",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "123--126",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2672558",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/pvm.bib",
abstract = "In this paper, we propose a fully morphable In-MEmory
Computing (IMEC) fabric to better implement the concept
of processing inside memory (PIM). Enabled by emerging
nonvolatile memory, i.e., RRAM and its monolithic 3D
integration, IMEC can be configured into one or a
combination of four distinct functions, (1) logic, (2)
ternary content addressable memory, (3) memory, and (4)
interconnect. Thus, IMEC exploits a continuum of PIM
capabilities across the whole spectrum, ranging from 0
percent (pure data storage) to 100 percent (pure
compute engine), or intermediate states in between.
IMEC can be modularly integrated into the DDRx memory
subsystem, communicating with processors via
ordinary DRAM commands. Additionally, to reduce the
programming burden, we provide a complete framework to
compile applications written in high-level programming
language (e.g., OpenCL) onto IMEC. This framework also
enables code portability across different platforms for
heterogeneous computing. By using this framework,
several benchmarks are mapped onto IMEC for evaluating
its performance, energy and resource utilization. The
simulation results show that, IMEC reduces the energy
consumption by 99.6 percent, and achieves 644x speedup,
compared to a baseline CPU system. We further compare
IMEC with FPGA architecture, and demonstrate that the
performance improvement is not simply obtained by
replacing SRAM cells with denser RRAM cells.",
acknowledgement = ack-nhfb,
affiliation = "Zha, Y (Reprint Author), Univ Wisconsin, Elect \& Comp
Engn Dept, Madison, WI 53706 USA. Zha, Yue; Li, Jing,
Univ Wisconsin, Elect \& Comp Engn Dept, Madison, WI
53706 USA.",
author-email = "yzha3@wisc.edu jli587@wisc.edu",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Decoding; Energy efficiency; energy-efficiency
computing; Field programmable gate arrays; Non-volatile
memory; Nonvolatile memory; processing-in-memory;
Program processors; TCAM",
keywords-plus = "ARCHITECTURE",
number-of-cited-references = "20",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Zha:2017:IFM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Chen:2017:IGP,
author = "Li-Jhan Chen and Hsiang-Yun Cheng and Po-Han Wang and
Chia-Lin Yang",
title = "Improving {GPGPU} Performance via Cache Locality Aware
Thread Block Scheduling",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "127--131",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2693371",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Modern GPGPUs support the concurrent execution of
thousands of threads to provide an energy-efficient
platform. However, the massive multi-threading of
GPGPUs incurs serious cache contention, as the cache
lines brought by one thread can easily be evicted by
other threads in the small shared cache. In this paper,
we propose a software-hardware cooperative approach
that exploits the spatial locality among different
thread blocks to better utilize the precious cache
capacity. Through dynamic locality estimation and
thread block scheduling, we can capture more
performance improvement opportunities than prior work
that only explores the spatial locality between
consecutive thread blocks. Evaluations across diverse
GPGPU applications show that, on average, our
locality-aware scheduler provides 25 and 9 percent
performance improvement over the commonly-employed
round-robin scheduler and the state-of-the-art
scheduler, respectively.",
acknowledgement = ack-nhfb,
affiliation = "Chen, LJ (Reprint Author), Natl Taiwan Univ, Taipei
10617, Taiwan. Chen, Li-Jhan; Wang, Po-Han; Yang,
Chia-Lin, Natl Taiwan Univ, Taipei 10617, Taiwan.
Cheng, Hsiang-Yun, Acad Sinica, Taipei 11529, Taiwan.",
author-email = "r03922026@csie.ntu.edu.tw hycheng@citi.sinica.edu.tw
f96922002@csie.ntu.edu.tw yangc@csie.ntu.edu.tw",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Ministry of Science and Technology of
Taiwan [MOST-105-2221-E-002-156-MY2,
MOST-105-2622-8-002-002, MOST-105-2218-E-002-025];
MediaTek Inc., Hsin-chu, Taiwan",
funding-text = "This work is supported in part by research grants from
the Ministry of Science and Technology of Taiwan
(MOST-105-2221-E-002-156-MY2, MOST-105-2622-8-002-002,
and MOST-105-2218-E-002-025), and sponsored by MediaTek
Inc., Hsin-chu, Taiwan.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache lines; cache locality; cache locality aware
thread block scheduling; Cache memory; cache storage;
consecutive thread blocks; Dispatching; dynamic
locality estimation; energy-efficient platform; GPGPU;
GPGPU performance; graphics processing units; Graphics
processing units; Instruction sets; locality-aware
scheduler; multi-threading; performance improvement
opportunities; precious cache capacity; processor
scheduling; serious cache contention; shared cache;
thread block scheduling; Two dimensional displays",
number-of-cited-references = "18",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Chen:2017:IGP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Garland:2017:LCM,
author = "James Garland and David Gregg",
title = "Low Complexity Multiply Accumulate Unit for
Weight-Sharing Convolutional Neural Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "132--135",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2656880",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Convolutional Neural Networks (CNNs) are one of the
most successful deep machine learning technologies for
processing image, voice and video data. CNNs require
large amounts of processing capacity and memory, which
can exceed the resources of low power mobile and
embedded systems. Several designs for hardware
accelerators have been proposed for CNNs which
typically contain large numbers of Multiply Accumulate
(MAC) units. One approach to reducing data sizes and
memory traffic in CNN accelerators is ``weight
sharing'', where the full range of values in a trained
CNN are put in bins and the bin index is stored instead
of the original weight value. In this paper we propose
a novel MAC circuit that exploits binning in
weight-sharing CNNs. Rather than computing the MAC
directly, we instead count the frequency of each weight
and place it in a bin. We then compute the accumulated
value in a subsequent multiply phase. This allows
hardware multipliers in the MAC circuit to be replaced
with adders and selection logic. Experiments show that
for the same clock speed our approach results in fewer
gates, smaller logic, and reduced power.",
acknowledgement = ack-nhfb,
affiliation = "Garland, J (Reprint Author), Trinity Coll Dublin,
Dublin 2, Ireland. Garland, James; Gregg, David,
Trinity Coll Dublin, Dublin 2, Ireland.",
author-email = "jgarland@tcd.ie david.gregg@cs.tcd.ie",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Science Foundation Ireland [12/IA/1381]",
funding-text = "This research is supported by Science Foundation
Ireland, Project 12/IA/1381.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "adders; arithmetic hardware circuits; bin index; CNN
accelerators; convolution; Convolutional neural
network; Convolutional neural networks; deep machine
learning technologies; embedded systems; Energy
efficiency; feedforward neural nets; hardware
accelerators; hardware multipliers; learning
(artificial intelligence); Logic gates; MAC circuit;
Machine learning; memory traffic; multiply accumulate;
multiply accumulate units; multiplying circuits; Neural
networks; original weight value; power efficiency;
subsequent multiply phase; video data; weight-sharing
CNN; weight-sharing convolutional neural networks",
number-of-cited-references = "9",
ORCID-numbers = "Garland, James/0000-0002-8688-9407",
research-areas = "Computer Science",
researcherid-numbers = "Garland, James/L-1294-2019",
times-cited = "2",
unique-id = "Garland:2017:LCM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
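
The count-then-multiply trick for weight-sharing CNNs can be checked against
a direct MAC in a few lines (a numerical sketch with made-up bins and a tiny
weight table; the hardware binning and selection logic are not modeled):

    def binned_mac(bin_indices, inputs, bin_values):
        per_bin_sum = [0.0] * len(bin_values)
        for b, x in zip(bin_indices, inputs):
            per_bin_sum[b] += x             # accumulation phase: adders only
        return sum(v * s for v, s in zip(bin_values, per_bin_sum))  # one multiply per bin

    bins   = [0, 1, 0, 2, 1]                # stored 2-bit indices instead of weights
    inputs = [1.0, 2.0, 3.0, 4.0, 5.0]
    values = [0.5, -1.0, 2.0]               # the shared weight table
    direct = sum(values[b] * x for b, x in zip(bins, inputs))
    assert abs(binned_mac(bins, inputs, values) - direct) < 1e-9
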
@Article{Jung:2017:NIP,
author = "Myoungsoo Jung",
title = "{NearZero}: an Integration of Phase Change Memory with
Multi-Core Coprocessor",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "136--140",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2694828",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Multi-core based coprocessors have become powerful
research vehicles for analyzing large amounts of data.
Even though they can accelerate data processing by
using a hundred cores, the data unfortunately exist on
an external storage device. The separation of
computation and storage introduces redundant memory
copies and unnecessary data transfers over different
physical device boundaries, which limit the benefits of
coprocessor-accelerated data processing. In addition,
the coprocessors need assistance from host-side
resources to access the external storage, which can
require additional system context switches. To address
these challenges, we propose NearZero, a novel
DRAM-less coprocessor architecture that precisely
integrates a state-of-the-art phase change memory into
its multi-core accelerator. In this work, we implement
an FPGA-based memory controller that extracts important
device parameters from real phase change memory chips,
and apply them to a commercially available hardware
platform that employs multiple processing elements over
a PCIe fabric. The evaluation results reveal that
NearZero achieves on average 47 percent better
performance than advanced coprocessor approaches that
use direct I/Os (between storage and coprocessors),
while consuming only 19 percent of the total energy of
such advanced coprocessors.",
acknowledgement = ack-nhfb,
affiliation = "Jung, M (Reprint Author), Yonsei Univ, Seoul 03722,
South Korea. Jung, Myoungsoo, Yonsei Univ, Seoul 03722,
South Korea.",
author-email = "m.jung@yonsei.ac.kr",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NRF [2016R1C1B2015312, DE-AC02-05CH 11231];
MSIP [IITP-2017-2017-0-01015]; [MemRay 2015-11-1731]",
funding-text = "The author thanks MemRay Corporation, Samsung, TI for
their research sample donation and technical support.
The author also thanks J. Zhang, H. Jeong and G. Park
who helped him set up the preliminary evaluation
environment. This research is supported by MemRay
2015-11-1731. This work is also supported in part by
NRF 2016R1C1B2015312, DE-AC02-05CH 11231 and MSIP
IITP-2017-2017-0-01015.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "accelerators; additional system context; advanced
coprocessor approaches; Computer architecture;
coprocessors; Coprocessors; data processing; Data
storage; Data transfer; DRAM chips; DRAM-less
coprocessor architecture; external storage device;
Field programmable gate arrays; field programmable gate
arrays; hardware architecture; host-side resources;
hybrid systems; important device parameters; mass
storage; memory structures; multicore accelerator;
multicore-based coprocessors; multiple processing
elements; multiprocessing systems; multiprocessors;
NearZero; Network architecture; non-volatile memory;
Nonvolatile memory; parallel architectures; phase
change memories; phase change memory chips; Phase
change random access memory; powerful research
vehicles; redundant memory copies; Storage devices;
storage management; unnecessary data transfers",
number-of-cited-references = "12",
research-areas = "Computer Science",
researcherid-numbers = "Jung, Myoungsoo/F-4565-2019",
times-cited = "2",
unique-id = "Jung:2017:NIP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Yavits:2017:RAD,
author = "Leonid Yavits and Uri Weiser and Ran Ginosar",
title = "Resistive Address Decoder",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "141--144",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2670539",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Hardwired dynamic NAND address decoders are widely
used in random access memories to decode parts of the
address. Replacing wires by resistive elements allows
storing and reprogramming the addresses and matching
them to an input address. The resistive address decoder
thus becomes a content addressable memory, while the
read latency and dynamic energy remain almost identical
to those of a hardwired address decoder. One
application of the resistive address decoder is a fully
associative TLB with read latency and energy
consumption similar to those of a one-way associative
TLB. Another application is a many-way associative
cache with read latency and energy consumption similar
to those of a direct mapped one. A third application is
elimination of physical addressing and using virtual
addresses throughout the entire memory hierarchy by
introducing the resistive address decoder into the main
memory.",
acknowledgement = ack-nhfb,
affiliation = "Yavits, L (Reprint Author), Technion Israel Inst
Technol, Dept Elect Engn, IL-3200000 Haifa, Israel.
Yavits, Leonid; Weiser, Uri; Ginosar, Ran, Technion
Israel Inst Technol, Dept Elect Engn, IL-3200000 Haifa,
Israel.",
author-email = "yavits@tx.technion.ac.il uri.weiser@ee.technion.ac.il
ran@ee.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Address decoder; cache; cache storage; CAM; content
addressable memory; content-addressable storage;
Decoding; decoding; dynamic energy; energy consumption;
Energy consumption; fully associative TLB; hardwired
address decoder; hardwired dynamic NAND address
decoders; Logic gates; many-way associative cache;
memory hierarchy; memristors; Memristors;
NAND circuits; Network address translation; one-way
associative TLB; physical address; physical addressing
using virtual addresses; Programming; RAM; random
access memories; Random access memory; random-access
storage; read latency; resistive address decoder;
resistive memory; TLB; virtual address; virtual
addresses",
number-of-cited-references = "8",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Yavits:2017:RAD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Manivannan:2017:RAG,
author = "Madhavan Manivannan and Miquel Peric{\`a}s and
Vassilis Papaefstathiou and Per Stenstr{\"o}m",
title = "Runtime-Assisted Global Cache Management for
Task-Based Parallel Programs",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "145--148",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2606593",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Dead blocks are handled inefficiently in multi-level
cache hierarchies because the decision as to whether a
block is dead has to be taken locally at each cache
level. This paper introduces runtime-assisted global
cache management to quickly deem blocks dead across
cache levels in the context of task-based parallel
programs. The scheme is based on a cooperative
hardware/software approach that leverages static and
dynamic information about future data region reuse(s)
available to runtime systems for task-based parallel
programming models. We show that our proposed
runtime-assisted global cache management approach
outperforms previously proposed local dead-block
management schemes for task-based parallel programs.",
acknowledgement = ack-nhfb,
affiliation = "Manivannan, M (Reprint Author), Chalmers Univ Technol,
Dept Comp Sci \& Engn, S-41258 Gothenburg, Sweden.
Manivannan, Madhavan; Pericas, Miquel; Papaefstathiou,
Vassilis; Stenstrom, Per, Chalmers Univ Technol, Dept
Comp Sci \& Engn, S-41258 Gothenburg, Sweden.",
author-email = "madhavan@chalmers.se miquelp@chalmers.se
vaspap@chalmers.se per.stenstrom@chalmers.se",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Swedish Foundation for Strategic Research
(SSF) under SCHEME project [RIT10-0033]; European
Research Council (ERC) under MECCA project [340328]",
funding-text = "This research is supported by grants from the Swedish
Foundation for Strategic Research (SSF) under the
SCHEME project (RIT10-0033) and the European Research
Council (ERC) under the MECCA project (contract
340328). The simulations were run on the resources
provided by the Swedish National Infrastructure for
Computing (SNIC) at C3SE.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache level; Cache memory; cache storage; Data models;
dead blocks; dead-block management schemes; Multi-level
cache hierarchies; multilevel cache hierarchies;
Optimization; parallel programming; Parallel
programming; parallel programming models; parallel
programs; prediction; Predictive models; run-time
system; Runtime; runtime systems; runtime-assisted
global cache management; Semantics; storage
management",
keywords-plus = "REPLACEMENT; PREDICTION",
number-of-cited-references = "20",
oa = "Bronze",
ORCID-numbers = "Stenstrom, Per/0000-0002-4280-3843",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Manivannan:2017:RAG",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Perais:2017:SFM,
author = "Arthur Perais and Andre Seznec",
title = "Storage-Free Memory Dependency Prediction",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "149--152",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2628379",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Memory Dependency Prediction (MDP) is paramount to
                 good out-of-order performance, but decidedly not
                 trivial, as all instances of a given static load may
not necessarily depend on all instances of a given
static store. As a result, for a given load, MDP should
predict the exact store instruction the load depends
on, and not only whether it depends on an inflight
store or not, i.e., ideally, prediction should not be
binary. However, we first argue that given the high
degree of sophistication of modern branch predictors,
the fact that a given dynamic load depends on an
inflight store can be captured using the binary
prediction capabilities of the branch predictor,
providing coarse MDP at zero storage overhead. Second,
by leveraging hysteresis counters, we show that the
precise producer store can in fact be identified. This
embodiment of MDP yields performance levels that are on
par with state-of-the-art, and requires less than 70
additional bits of storage over a baseline without MDP
at all.",
acknowledgement = ack-nhfb,
affiliation = "Perais, A (Reprint Author), INRIA IRISA, F-35000
Rennes, France. Perais, Arthur; Seznec, Andre, INRIA
IRISA, F-35000 Rennes, France.",
author-email = "arthur.perais@inria.fr andre.seznec@inria.fr",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "binary prediction capabilities; branch prediction
space-efficiency; branch predictor; cache storage;
coarse MDP; instruction sets; MDP yields performance
levels; Memory dependency prediction; memory dependency
prediction; Memory management; modern branch
predictors; Out of order; out-of-order performance;
precise producer store; Predictive models; storage
management; storage-free memory dependency prediction;
zero storage overhead",
keywords-plus = "COMMUNICATION; QUEUE",
number-of-cited-references = "14",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Perais:2017:SFM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
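
The hysteresis-counter idea summarized in the abstract above can be
illustrated with a small sketch. The Python snippet below models only the
binary question the letter discusses (does this load depend on an in-flight
store?) using a table of 2-bit saturating counters; the table size, the
PC-based indexing, and the training policy are illustrative assumptions, not
the letter's actual design.

# Minimal sketch of a hysteresis (saturating) counter predictor for the
# binary question "does this load depend on an in-flight store?".
# Table size, indexing, and update policy are illustrative assumptions.

class HysteresisPredictor:
    def __init__(self, entries=1024, bits=2):
        self.entries = entries
        self.max_count = (1 << bits) - 1
        self.table = [0] * entries          # 0 = predict "no dependence"

    def _index(self, load_pc):
        return load_pc % self.entries       # simple PC-based hash (assumption)

    def predict(self, load_pc):
        # Predict a dependence when the counter is in the upper half.
        return self.table[self._index(load_pc)] > self.max_count // 2

    def train(self, load_pc, did_depend):
        i = self._index(load_pc)
        if did_depend:
            self.table[i] = min(self.table[i] + 1, self.max_count)
        else:
            self.table[i] = max(self.table[i] - 1, 0)

if __name__ == "__main__":
    p = HysteresisPredictor()
    pc = 0x400A10                           # hypothetical load PC
    for _ in range(3):
        p.train(pc, did_depend=True)        # repeated memory-order violations
    print(p.predict(pc))                    # True: delay this load next time
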
@Article{Mirhosseini:2017:SPB,
author = "Amirhossein Mirhosseini and Aditya Agrawal and Josep
Torrellas",
title = "{Survive}: Pointer-Based In-{DRAM} Incremental
Checkpointing for Low-Cost Data Persistence and
Rollback-Recovery",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "153--157",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2646340",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper introduces the Survive DRAM architecture
for effective in-memory micro-checkpointing. Survive
implements low-cost incremental checkpointing, enabling
fast rollback that can be used in various architectural
techniques such as speculation, approximation, or low
voltage operation. Survive also provides crash
consistency when used as the frontend of a hybrid
DRAM-NVM memory system. This is accomplished by
carefully copying the incremental checkpoints generated
in the DRAM frontend to the NVM backend. Simulations
show that Survive only imposes an average 3.5 percent
execution time overhead over an unmodified DRAM
main-memory system with no checkpointing, while
reducing the number of NVM writes by 89 percent over an
NVM-only main-memory system.",
acknowledgement = ack-nhfb,
affiliation = "Mirhosseini, A (Reprint Author), Univ Michigan, Ann
Arbor, MI 48109 USA. Mirhosseini, Amirhossein, Univ
Michigan, Ann Arbor, MI 48109 USA. Agrawal, Aditya,
NVIDIA Corp, Santa Clara, CA 95050 USA. Torrellas,
Josep, Univ Illinois, Champaign, IL 61801 USA.",
author-email = "miramir@umich.edu adityaa@nvidia.com
torrella@illinois.edu",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architectural techniques; checkpointing;
Checkpointing; checkpointing; Computer architecture;
Computer crashes; DRAM chips; hybrid DRAM-NVM memory
system; In-DRAM incremental checkpointing; in-memory
microcheckpointing; incremental checkpoints; low
voltage operation; low-cost data persistence; low-cost
incremental checkpointing; memory architecture;
Non-volatile memory; Nonvolatile memory; NVM-only
main-memory system; Random access memory; random-access
storage; reliability; rollback-recovery; software fault
tolerance; survive DRAM architecture; system recovery;
Transistors; unmodified DRAM main-memory system",
keywords-plus = "PHASE-CHANGE MEMORY",
number-of-cited-references = "21",
ORCID-numbers = "Mirhosseini, Amirhossein/0000-0001-6501-6087",
research-areas = "Computer Science",
times-cited = "5",
unique-id = "Mirhosseini:2017:SPB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Pinto:2017:TTA,
author = "Sandro Pinto and Jorge Pereira and Tiago Gomes and
Mongkol Ekpanyapong and Adriano Tavares",
title = "Towards a {TrustZone}-Assisted Hypervisor for
Real-Time Embedded Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "158--161",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2617308",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Virtualization technology is becoming more and more
                 widespread in the embedded space. The penalties
                 incurred by standard software-based virtualization are
                 pushing research towards hardware-assisted solutions.
Among the existing commercial off-the-shelf
technologies for secure virtualization, ARM TrustZone
is attracting particular attention. However, it is
often seen with some scepticism due to the dual-OS
limitation of existing state-of-the-art solutions. This
letter presents the implementation of a TrustZone-based
hypervisor for real-time embedded systems, which allows
multiple RTOS partitions on the same hardware platform.
The results demonstrate that virtualization overhead is
less than 2 percent for a 10 milliseconds
guest-switching rate, and the system remains
deterministic. This work goes beyond related work by
implementing a TrustZone-assisted solution that allows
the execution of an arbitrary number of guest OSes
                 while providing the foundation to drive the next
                 generation of secure virtualization solutions for
resource-constrained embedded devices.",
acknowledgement = ack-nhfb,
affiliation = "Pinto, S (Reprint Author), Univ Minho, Dept Ctr
Algoritmi, P-4704553 Braga, Portugal. Pinto, Sandro;
Pereira, Jorge; Gomes, Tiago; Tavares, Adriano, Univ
Minho, Dept Ctr Algoritmi, P-4704553 Braga, Portugal.
Ekpanyapong, Mongkol, Asian Inst Technol, Pathum Thani
12120, Thailand.",
author-email = "sandro.pinto@algoritmi.uminho.pt
jorge.m.pereira@algoritmi.uminho.pt
tiago.m.gomes@algoritmi.uminho.pt mongkol@ait.ac.th
adriano.tavares@algoritmi.uminho.pt",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "COMPETE [POCI-01-0145-FEDER-007043]; FCT -
Fundacao para a Ciencia e Tecnologia
[SFRH/BD/91530/2012, UID/CEC/00319/2013]",
funding-text = "This work has been supported by COMPETE:
POCI-01-0145-FEDER-007043 and FCT --- Fundacao para a
Ciencia e Tecnologia (grant SFRH/BD/91530/2012 and
UID/CEC/00319/2013).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ARM; ARM TrustZone; dual-OS limitation; embedded
                 space; embedded systems; Embedded systems; hardware
                 platform; hardware-assisted
solutions; monitor; Monitoring; multiple RTOS
partitions; operating systems (computers); Program
processors; real-time; real-time embedded systems;
Real-time systems; RODOS; secure virtualization
solutions; security of data; standard software; trusted
computing; TrustZone; TrustZone-assisted solution;
Virtual machine monitors; virtualisation;
Virtualization; virtualization overhead; virtualization
technology",
number-of-cited-references = "12",
ORCID-numbers = "Gomes, Tiago/0000-0002-4071-9015 Salgado Pinto,
Sandro Emanuel/0000-0003-4580-7484 Tavares,
Adriano/0000-0001-8316-6927",
research-areas = "Computer Science",
researcherid-numbers = "Gomes, Tiago/A-4751-2016 Salgado Pinto, Sandro
Emanuel/D-6725-2015 Tavares, Adriano/M-5257-2013",
times-cited = "3",
unique-id = "Pinto:2017:TTA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Carlson:2017:THL,
author = "Trevor E. Carlson and Kim-Anh Tran and Alexandra
Jimborean and Konstantinos Koukos and Magnus
Sj{\"a}lander and Stefanos Kaxiras",
title = "Transcending Hardware Limits with Software
Out-of-Order Processing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "162--165",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2672559",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Building high-performance, next-generation processors
                 requires novel techniques to enable improved
                 performance given today's power- and energy-efficiency
                 requirements.
Additionally, a widening gap between processor and
memory performance makes it even more difficult to
improve efficiency with conventional techniques. While
out-of-order architectures attempt to hide this memory
latency with dynamically reordered instructions, they
lack the energy efficiency seen in in-order processors.
Thus, our goal is to reorder the instruction stream to
avoid stalls and improve utilization for energy
efficiency and performance. To accomplish this goal, we
propose an enhanced stall-on-use in-order core that
improves energy efficiency (and therefore performance
                 in these power-limited designs) through
out-of-program-order execution. During long latency
loads, the Software Out-of-Order Processing (SWOOP)
                 core exposes additional memory- and instruction-level
parallelism to perform useful, non-speculative work.
The resulting instruction lookahead of the SWOOP core
reaches beyond the conventional fixed-sized processor
structures with the help of transparent hardware
register contexts. Our results show that SWOOP
demonstrates a 34 percent performance improvement on
average compared with an in-order, stall-on-use core,
with an energy reduction of 23 percent.",
acknowledgement = ack-nhfb,
affiliation = "Carlson, TE (Reprint Author), Uppsala Univ, S-75236
Uppsala, Sweden. Carlson, Trevor E.; Tran, Kim-Anh;
Jimborean, Alexandra; Koukos, Konstantinos; Sjalander,
Magnus; Kaxiras, Stefanos, Uppsala Univ, S-75236
Uppsala, Sweden. Sjalander, Magnus, Norwegian Univ Sci
\& Technol NTNU, N-7491 Trondheim, Norway.",
author-email = "trevor.carlson@it.uu.se kim-anh.tran@it.uu.se
alexandra.jimborean@it.uu.se
konstantinos.koukos@it.uu.se
magnus.sjalander@idi.ntnu.no
stefanos.kaxiras@it.uu.se",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Compilation; Context awareness; decoupled
access-execute; dynamically reordered instructions;
energy; energy conservation; energy efficiency; Energy
management; energy reduction; energy-efficiency
requirements; enhanced stall-on-use; fixed-sized
processor structures; hardware limits; in-order core;
in-order processors; instruction stream;
instruction-level parallelism; memory level
parallelism; microprocessor chips; next-generation
processors; Out of order; out-of-program-order
execution; parallel architectures; power-limited
                 designs; Prefetching; resulting instruction lookahead;
software out-of-order processing; stall-on-use core;
SWOOP",
number-of-cited-references = "9",
ORCID-numbers = "Sjalander, Magnus/0000-0003-4232-6976 Jimborean,
Alexandra/0000-0001-8642-2447",
research-areas = "Computer Science",
researcherid-numbers = "Sjalander, Magnus/N-5995-2019",
times-cited = "0",
unique-id = "Carlson:2017:THL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Ahmadvand:2017:UDV,
author = "Hossein Ahmadvand and Maziar Goudarzi",
title = "Using Data Variety for Efficient Progressive Big Data
Processing in Warehouse-Scale Computers",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "166--169",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2636293",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Warehouse Scale Computers (WSC) are often used for
various big data jobs where the big data under
processing comes from a variety of sources. We show
that different data portions, from the same or
different sources, have different significances in
determining the final outcome of the computation, and
hence, by prioritizing them and assigning more
resources to processing of more important data, the WSC
can be used more efficiently in terms of time as well
as cost. We provide a simple low-overhead mechanism to
quickly assess the significance of each data portion,
and show its effectiveness in finding the best ranking
of data portions. We continue by demonstrating how this
ranking is used in resource allocation to improve time
and cost by up to 24 and 9 percent respectively, and
also discuss other uses of this ranking information,
                 e.g., in faster progressive approximation of the final
                 outcome of a big data job without processing the
                 entire data set,
and in more effective use of renewable energies in
WSCs.",
acknowledgement = ack-nhfb,
affiliation = "Ahmadvand, H (Reprint Author), Sharif Univ Technol,
Dept Comp Engn, Azadi Ave, Tehran 1136511155, Iran.
Ahmadvand, Hossein; Goudarzi, Maziar, Sharif Univ
Technol, Dept Comp Engn, Azadi Ave, Tehran 1136511155,
Iran.",
author-email = "ahmadvand@ce.sharif.edu goudarzi@sharif.edu",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Sharif University of Technology [G930826]",
funding-text = "This research is supported by grant number G930826
from Sharif University of Technology. We are grateful
for their support.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Big data; Big Data;
Computers; data warehouses; Distributed databases;
efficiency; efficient progressive Big Data processing;
order of processing; resource allocation; Resource
management; sampling; warehouse-scale computers; WSC",
number-of-cited-references = "16",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Ahmadvand:2017:UDV",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Zhang:2017:WDP,
author = "Dan Zhang and Xiaoyu Ma and Derek Chiou",
title = "Worklist-Directed Prefetching",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "16",
number = "2",
pages = "170--173",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2627571",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Researchers have demonstrated the benefits of hardware
worklist accelerators, which offload scheduling and
load balancing operations in parallel graph
applications. However, many of these applications are
still heavily memory latency-bound due to the irregular
nature of graph data structure access patterns. We
utilize the fact that the accelerator has knowledge of
upcoming work items to accurately issue prefetch
requests, a technique we call worklist-directed
prefetching. A credit-based system to improve prefetch
timeliness and prevent cache thrashing is proposed. The
proposed prefetching scheme is simulated on a 64-core
CMP with a hardware worklist accelerator on several
graph algorithms and inputs. Enabling worklist-directed
prefetching into the L2 cache results in an average
speedup of 1.99, and up to 2.35 on Breadth-First
Search.",
acknowledgement = ack-nhfb,
affiliation = "Zhang, D (Reprint Author), Univ Texas Austin, Dept
Elect \& Comp Engn, Austin, TX 78712 USA. Zhang, Dan;
Ma, Xiaoyu; Chiou, Derek, Univ Texas Austin, Dept Elect
\& Comp Engn, Austin, TX 78712 USA.",
author-email = "dan.zhang@utexas.edu xma@utexas.edu
derek@ece.utexas.edu",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "accelerators; cache storage; data structures; graph
algorithms; graph data structure access patterns; graph
problems; graph theory; Hardware; hardware worklist
accelerator; load-balancing operations; microprocessor
chips; parallel graph applications; parallel
processors; Prefetching; prefetching researchers;
prefetching scheme; Processor scheduling; resource
allocation; scheduling; Software algorithms; storage
management",
keywords-plus = "ARCHITECTURAL SUPPORT; ALGORITHM",
number-of-cited-references = "23",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Zhang:2017:WDP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Scionti:2018:EMM,
author = "Alberto Scionti and Somnath Mazumdar and Stephane
Zuckerman",
title = "Enabling Massive Multi-Threading with Fast Hashing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2697863",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The next generation of high-performance computers is
                 expected to execute orders of magnitude more threads
                 than today's systems. Improper management of such a
                 huge number of threads can create resource
                 contention, leading to overall degraded system
performance. By leveraging more practical approaches to
distribute threads on the available resources,
execution models and manycore chips are expected to
overcome limitations of current systems. Here, we
present DELTA --- a Data-Enabled muLti-Threaded
                 Architecture, where a producer-consumer scheme is used
                 to execute threads via a completely distributed thread
                 management mechanism. We consider a manycore tiled-chip
architecture where Network-on-Chip (NoC) routers are
extended to support our execution model. The proposed
extension is analysed, while simulation results confirm
that DELTA can manage a large number of simultaneous
threads, relying on a simple hardware structure.",
acknowledgement = ack-nhfb,
affiliation = "Scionti, A (Reprint Author), ISMB, I-10138 Turin,
Italy. Scionti, Alberto, ISMB, I-10138 Turin, Italy.
Mazumdar, Somnath, Univ Siena, Siena, SI, Italy.
Zuckerman, Stephane, Michigan Technol Univ, Houghton,
MI 49931 USA.",
author-email = "scionti@ismb.it mazumdar@dii.unisi.it
szuckerm@mtu.edu",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "complete distributed thread management mechanism;
Computational modeling; Computer architecture;
data-enabled multithreaded architecture; Dataflow;
degraded system performance; DELTA; execution model;
fast hashing; Hardware; hashing; high-performance
computers; Instruction sets; manycore chips; manycore
tiled-chip architecture; massive multihreading;
microprocessor chips; multi-threading; multiprocessing
systems; network-on-chip; network-on-chip routers;
Organizations; producer-consumer scheme; Programming;
resource contention; Scheduling; simultaneous threads;
thread-scheduling",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Scionti:2018:EMM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2018:IIC,
author = "Anonymous",
title = "2017 Index {{\booktitle{IEEE Computer Architecture
Letters}}} Vol. 16",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "1--6",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2799560",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Jeon:2018:HMP,
author = "Dong-Ik Jeon and Kyeong-Bin Park and Ki-Seok Chung",
title = "{HMC-MAC}: Processing-in Memory Architecture for
Multiply--Accumulate Operations with Hybrid Memory
Cube",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2700298",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Many studies focus on implementing
                 processing-in-memory (PIM) on the logic die of the
                 hybrid memory cube
(HMC) architecture. The multiply-accumulate (MAC)
operation is heavily used in digital signal processing
(DSP) systems. In this paper, a novel PIM architecture
called HMC-MAC that implements the MAC operation in the
                 HMC is proposed. The vault controllers of the
                 conventional HMC work independently to maximize
                 parallelism, and HMC-MAC builds on the conventional
                 HMC without significantly modifying the architecture.
                 Therefore, a large number of MAC operations can
be processed in parallel. In HMC-MAC, the MAC operation
can be carried out simultaneously with as much as 128
                 KB of data. The correctness of HMC-MAC is verified by
                 simulations, and its performance is better than that
                 of the conventional CPU-based MAC operation when the
                 MAC operation is executed consecutively at least six
                 times.",
acknowledgement = ack-nhfb,
affiliation = "Chung, KS (Reprint Author), Hanyang Univ, Dept Elect
\& Comp Engn, Seoul 04763, South Korea. Jeon, Dong-Ik;
Park, Kyeong-Bin; Chung, Ki-Seok, Hanyang Univ, Dept
Elect \& Comp Engn, Seoul 04763, South Korea.",
author-email = "estwingz@naver.com lay1523@naver.com
kchung@hanyang.ac.kr",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Basic Science Research Program through the
National Research Foundation of Korea(NRF) --- Ministry
of Education [NRF-2015R1D1A1A09061079]",
funding-text = "This research was supported by Basic Science Research
Program through the National Research Foundation of
Korea(NRF) funded by the Ministry of Education
(NRF-2015R1D1A1A09061079).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computers; CPU-based MAC operation; digital signal
processing; digital signal processing systems; DRAM
chips; DSP systems; Electronic mail; HMC-MAC; hybrid
memory cube architecture; logic circuits; logic die;
memory architecture; Memory architecture; Memory
management; memory size 128.0 KByte; Memory structures;
memory used as logic; multiple data stream
architectures; multiply-accumulate operation; parallel
processing; processing-in memory architecture; Random
access memory; Registers; vault controllers",
number-of-cited-references = "11",
ORCID-numbers = "Jeon, Dong-Ik/0000-0002-8572-4184",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Jeon:2018:HMP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
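
The multiply-accumulate operation that HMC-MAC distributes across vault
controllers can be sketched in software. The Python fragment below splits the
operand vectors into vault-sized slices and performs acc + a * b on each
slice independently; the vault count and the partitioning are assumptions
made only to illustrate the parallelism, not the hardware's actual layout.

# Illustrative software model of a MAC spread over independent vault-local
# slices; vault count and partitioning are assumptions for illustration.

def mac(acc, a, b):
    """Element-wise acc + a * b for one vault-local slice."""
    return [x + y * z for x, y, z in zip(acc, a, b)]

def vault_parallel_mac(acc, a, b, vaults=32):
    """Split the vectors into vault-sized slices and MAC each slice
    independently, as separate vault controllers could."""
    n = len(acc)
    step = max(1, (n + vaults - 1) // vaults)
    out = []
    for start in range(0, n, step):
        sl = slice(start, start + step)
        out.extend(mac(acc[sl], a[sl], b[sl]))
    return out

if __name__ == "__main__":
    acc = [0.0] * 8
    a = [1, 2, 3, 4, 5, 6, 7, 8]
    b = [2] * 8
    print(vault_parallel_mac(acc, a, b, vaults=4))   # [2.0, 4.0, ..., 16.0]
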
@Article{VandenSteen:2018:MSP,
author = "Sam {Van den Steen} and Lieven Eeckhout",
title = "Modeling Superscalar Processor Memory-Level
Parallelism",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2701370",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper proposes an analytical model to predict
Memory-Level Parallelism (MLP) in a superscalar
processor. We profile the workload once and measure a
set of distributions to characterize the workload's
inherent memory behavior. We subsequently generate a
virtual instruction stream, over which we then process
an abstract MLP model to predict MLP for a particular
micro-architecture with a given ROB size, LLC size,
MSHR size and stride-based prefetcher. Experimental
evaluation reports an improvement in modeling error
from 16.9 percent for previous work to 3.6 percent on
average for the proposed model.",
acknowledgement = ack-nhfb,
affiliation = "Van den Steen, S (Reprint Author), Univ Ghent, Ghent,
Belgium. Van den Steen, Sam; Eeckhout, Lieven, Univ
Ghent, Ghent, Belgium.",
author-email = "sam.vandensteen@ugent.be lieven.eeckhout@ugent.be",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Agency for Innovation by Science and
Technology (IWT)",
funding-text = "We thank the anonymous reviewers for their
constructive and insightful feedback. Sam Van den Steen
is supported through a doctoral fellowship by the
Agency for Innovation by Science and Technology
(IWT).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Analytical models; Computational modeling; Computer
architecture; Hardware; LLC size; Load modeling; memory
architecture; memory level parallelism (MLP);
micro-architecture; MLP model; Modeling; MSHR size;
Predictive models; Prefetching; ROB size; superscalar
processor memory-level parallelism modeling; virtual
instruction stream",
number-of-cited-references = "11",
ORCID-numbers = "Van den Steen, Sam/0000-0003-3630-2214",
research-areas = "Computer Science",
times-cited = "0",
  unique-id =    "VandenSteen:2018:MSP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Durkovic:2018:BNS,
author = "Srdjan Durkovic and Zoran Cica",
title = "{Birkhoff--von Neumann} Switch Based on Greedy
Scheduling",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "13--16",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2707082",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "It is important to develop high performance packet
switches that are highly scalable. Among the popular
solutions are input queued (IQ) switches and load
balanced Birkhoff-von Neumann (LB-BvN) switches.
                 However, both solutions have their drawbacks. The
                 switch configuration pattern in IQ switches is random,
                 which can limit the supported port speed. On the other
                 hand,
LB-BvN switches require two switching stages which
increase the overall cost. Also, some LB-BvN solutions
suffer from the packet out of sequence problem. In this
paper, we propose a novel packet switch architecture
that combines the best properties of the IQ and LB-BvN
switches and eliminates their drawbacks.",
acknowledgement = ack-nhfb,
affiliation = "Cica, Z (Reprint Author), Univ Belgrade, Sch Elect
Engn, Belgrade 11120, Serbia. Durkovic, Srdjan; Cica,
Zoran, Univ Belgrade, Sch Elect Engn, Belgrade 11120,
Serbia.",
author-email = "srdjad6@gmail.com zoran.cica@etf.rs",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Architecture; data communications; Delays; greedy
scheduling; high performance packet; Internet; IP
networks; IQ switches; LB-BvN solutions; LB-BvN
switches; load balanced Birkhoff-von Neumann switches;
packet switch architecture; packet switching;
packet-switching networks; Ports (Computers); queueing
theory; Random access memory; resource allocation;
routers; Scheduling; switch configuration pattern;
Switches; switching stages; telecommunication
scheduling",
keywords-plus = "2-STAGE SWITCHES; DESIGN; ALGORITHM",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Durkovic:2018:BNS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Pham:2018:TSM,
author = "Binh Pham and Derek Hower and Abhishek Bhattacharjee
and Trey Cain",
title = "{TLB} Shootdown Mitigation for Low-Power Many-Core
Servers with {L1} Virtual Caches",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2712140",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Power efficiency has become one of the most important
design constraints for high-performance systems. In
this paper, we revisit the design of low-power
virtually-addressed caches. While virtually-addressed
caches enable significant power savings by obviating
the need for Translation Lookaside Buffer (TLB)
lookups, they suffer from several challenging design
issues that curtail their widespread commercial
                 adoption. We focus on one of these challenges: cache
flushes due to virtual page remappings. We use detailed
studies on an ARM many-core server to show that this
problem degrades performance by up to 25 percent for a
mix of multi-programmed and multi-threaded workloads.
Interestingly, we observe that many of these flushes
are spurious, and caused by an indiscriminate
invalidation broadcast on ARM architecture. In
response, we propose a low-overhead and readily
implementable hardware mechanism using bloom filters to
reduce spurious invalidations and mitigate their ill
effects.",
acknowledgement = ack-nhfb,
affiliation = "Pham, B (Reprint Author), Rutgers State Univ, Dept
Comp Sci, Piscataway, NJ 08854 USA. Binh Pham;
Bhattacharjee, Abhishek, Rutgers State Univ, Dept Comp
Sci, Piscataway, NJ 08854 USA. Hower, Derek, Qualcomm
Technol Inc, Piscataway, NJ 08854 USA. Cain, Trey,
Qualcomm Datactr Technol Inc, Piscataway, NJ 08854
USA.",
author-email = "binhpham@rutgers.edu dhower@qti.qualcomm.com
abhib@rutgers.edu tcain@qti.qualcomm.com",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ARM many-core server; Benchmark testing; bloom
filters; cache flushes; cache storage; Coherence;
Computer architecture; design constraints; Hardware;
high-performance systems; Indexes; L1 virtual caches;
low-overhead; low-power many-core servers; low-power
virtually-addressed caches; microprocessor chips;
multi-threading; multicores; multiprocessing systems;
multiprogrammed workloads; multiprogramming;
multithreaded workloads; multithreading; power
efficiency; power savings; Registers; Servers; TLB; TLB
shootdown mitigation; Virtual Cache; virtual memory;
virtual page remappings",
number-of-cited-references = "21",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Pham:2018:TSM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
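
The bloom-filter mechanism mentioned in the abstract above is easy to sketch.
The Python snippet below tracks which virtual pages a core's
virtually-addressed cache may hold and lets a broadcast invalidation be
dropped when the filter proves the page absent; the filter size, the number
of hash functions, and the SHA-256-based hashing are assumptions chosen for
illustration, not the proposed hardware.

# Minimal Bloom-filter sketch: record pages whose lines were cached, and
# filter invalidations for pages the filter proves were never cached.
# Sizing and hash choice are illustrative assumptions.

import hashlib

class PageBloomFilter:
    def __init__(self, bits=4096, hashes=2):
        self.bits = bits
        self.hashes = hashes
        self.array = bytearray(bits)

    def _positions(self, page):
        digest = hashlib.sha256(str(page).encode()).digest()
        for k in range(self.hashes):
            chunk = digest[4 * k:4 * k + 4]      # 4 bytes per hash function
            yield int.from_bytes(chunk, "little") % self.bits

    def record_fill(self, page):
        for pos in self._positions(page):
            self.array[pos] = 1

    def may_hold(self, page):
        return all(self.array[pos] for pos in self._positions(page))

if __name__ == "__main__":
    f = PageBloomFilter()
    f.record_fill(0x7F3A2)          # a line from this virtual page was cached
    print(f.may_hold(0x7F3A2))      # True: the invalidation must be honoured
    print(f.may_hold(0x12345))      # almost surely False: filter it as spurious
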
@Article{Yavits:2018:ASM,
author = "Leonid Yavits and Ran Ginosar",
title = "Accelerator for Sparse Machine Learning",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "21--24",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2714667",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Sparse matrix by vector multiplication (SpMV) plays a
pivotal role in machine learning and data mining. We
propose and investigate an SpMV accelerator,
specifically designed to accelerate the sparse matrix
by sparse vector multiplication (SpMSpV), and to be
integrated in a CPU core. We show that our accelerator
outperforms a similar solution by 70x while achieving
8x higher power efficiency, which yields an estimated
29x energy reduction for SpMSpV based applications.",
acknowledgement = ack-nhfb,
affiliation = "Yavits, L (Reprint Author), Technion Israel Inst
Technol, Dept Elect Engn, IL-3200000 Haifa, Israel.
Yavits, Leonid; Ginosar, Ran, Technion Israel Inst
Technol, Dept Elect Engn, IL-3200000 Haifa, Israel.",
author-email = "yavits@technion.ac.il ran@ee.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Acceleration; accelerator; Algorithm design and
analysis; CPU core; data mining; Indexes; learning
(artificial intelligence); matrix multiplication;
Memory management; microprocessor chips; power aware
computing; power efficiency; Random access memory;
regression analysis; sparse machine learning; sparse
matrices; Sparse matrices; sparse matrix; sparse matrix
by sparse vector multiplication; Sparse matrix
multiplication; sparse vector multiplication; SpMSpV
based applications; SpMV; SpMV accelerator; tree
searching; vectors",
keywords-plus = "MATRIX-VECTOR MULTIPLICATION",
number-of-cited-references = "14",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Yavits:2018:ASM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
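
Sparse-matrix by sparse-vector multiplication, the kernel the accelerator
above targets, touches only the matrix columns selected by the vector's
nonzeros. The short Python sketch below makes that access pattern explicit;
the column-wise dictionary layout is an illustrative choice, not the
accelerator's internal format.

# Minimal SpMSpV sketch: y = A * x where both A and x are sparse.

def spmspv(columns, x):
    """columns: {col: [(row, value), ...]}  sparse matrix, column-major
    x:       {col: value}                   sparse input vector
    returns  {row: value}                   sparse result y = A * x"""
    y = {}
    for j, xj in x.items():             # only columns with a nonzero in x
        for i, aij in columns.get(j, []):
            y[i] = y.get(i, 0.0) + aij * xj
    return y

if __name__ == "__main__":
    A = {0: [(0, 2.0), (2, 1.0)], 3: [(1, 5.0)]}   # nonzeros of a 3 x 4 matrix
    x = {0: 1.0, 3: 2.0}                           # sparse input vector
    print(spmspv(A, x))                            # {0: 2.0, 2: 1.0, 1: 10.0}
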
@Article{Christoforidis:2018:CTC,
author = "Eleftherios-Iordanis Christoforidis and Sotirios Xydis
and Dimitrios Soudris",
title = "{CF-TUNE}: Collaborative Filtering Auto-Tuning for
Energy Efficient Many-Core Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "25--28",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2716919",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Energy efficiency is today considered a first-class
                 design principle of modern many-core computing systems
                 in the effort to overcome the limited power envelope.
                 However, many-core processors are characterised by
                 high micro-architectural complexity, which is
                 propagated up to the application level, affecting both
                 performance and energy consumption. In this paper, we
                 present CF-TUNE,
an online and scalable auto-tuning framework for energy
aware applications mapping on emerging many-core
architectures. CF-TUNE enables the extraction of an
energy-efficient tuning configuration point with
minimal application characterisation on the whole
tuning configuration space. Instead of analyzing every
application against every tuning configuration, it
adopts a collaborative filtering technique that quickly
and with high accuracy configures the application's
tuning parameters by identifying similarities with
previously optimized applications. We evaluate
CF-TUNE's efficiency against a set of demanding and
diverse applications mapped on Intel Many Integrated
Core processor and we show that with minimal
characterization, e.g., only either two or four
evaluations, CF-TUNE recommends a tuning configuration
that performs at least at the 94 percent level of the
optimal one.",
acknowledgement = ack-nhfb,
affiliation = "Xydis, S (Reprint Author), Natl Tech Univ Athens, Sch
Elect \& Comp Engn, Zografos 15780, Greece.
Christoforidis, Eleftherios-Iordanis; Xydis, Sotirios;
Soudris, Dimitrios, Natl Tech Univ Athens, Sch Elect \&
Comp Engn, Zografos 15780, Greece.",
author-email = "eleftherios.christoforidis@gmail.com
sxydis@microlab.ntua.gr dsoudris@microlab.ntua.gr",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "application level; application tuning parameters;
Auto-tuning; CF-TUNE efficiency; Collaboration;
collaborative filtering auto-tuning; Computer
architecture; design space exploration; energy aware
application mapping; energy conservation; energy
consumption; energy efficient computing; energy
efficient many-core processors; energy-efficient tuning
configuration point; Instruction sets; Intel many
integrated core processor; Intel MIC; machine learning;
many-core architectures; manycore architectures;
microarchitectural complexity; microprocessor chips;
Microwave integrated circuits; minimal application
characterisation; modern many-core computing systems;
multiprocessing systems; online auto-tuning framework;
Optimization; power aware computing; power envelope;
scalable auto-tuning framework; Tuning; tuning
configuration space",
number-of-cited-references = "15",
ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847",
research-areas = "Computer Science",
researcherid-numbers = "Soudris, Dimitrios/O-8843-2019",
times-cited = "0",
unique-id = "Christoforidis:2018:CTC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
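
The collaborative-filtering step described in the abstract above can be
pictured with a toy example: measure a new application on only a few tuning
configurations, find the most similar previously-profiled application, and
reuse its best-known configuration. In the Python sketch below, the cosine
similarity metric, the sample data, and the convention that a higher score is
better are all assumptions made for illustration; they are not CF-TUNE's
actual model.

# Toy similarity-based recommender in the spirit of collaborative filtering
# for auto-tuning; metric, data, and scoring convention are assumptions.

from math import sqrt

def cosine(u, v):
    num = sum(a * b for a, b in zip(u, v))
    den = sqrt(sum(a * a for a in u)) * sqrt(sum(b * b for b in v))
    return num / den if den else 0.0

def recommend(profiles, partial):
    """profiles: {app: [score per configuration]}  fully profiled apps
    partial:  {config_index: score}                few probes of the new app"""
    idx = sorted(partial)
    probe = [partial[i] for i in idx]
    best_app = max(profiles,
                   key=lambda a: cosine([profiles[a][i] for i in idx], probe))
    scores = profiles[best_app]
    return scores.index(max(scores))    # configuration predicted to be best

if __name__ == "__main__":
    profiles = {"appA": [1.0, 3.0, 2.0, 0.5],
                "appB": [2.0, 1.0, 0.5, 3.0]}
    print(recommend(profiles, {0: 2.1, 1: 0.9}))   # 3: behaves like appB
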
@Article{Almatrood:2018:DGP,
author = "Amjad F. Almatrood and Harpreet Singh",
title = "Design of Generalized Pipeline Cellular Array in
Quantum-Dot Cellular Automata",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "29--32",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2719021",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Cellular arrays have been the topic of interest in
computer arithmetic and architecture for the last four
decades. In this letter, an overall quantum-dot
cellular automata (QCA) design for a generalized
pipeline cellular array is presented. QCA is one of the
promising emerging nanotechnologies that are being
considered as possible alternatives to complementary
metal-oxide semiconductor technology due to the
physical limitations of CMOS. The QCA designs for
arithmetic cell and control cell used in the pipeline
array are discussed in detail. The equivalent majority
logic networks to these cells are generated using the
best existing majority logic synthesis method in order
to obtain the optimal majority networks which require
fewer QCA cells and clock zones compared to other
synthesis methods. The proposed array can perform all
the basic arithmetic operations such as squaring,
square rooting, multiplication, division, etc., which
could be quite valuable in considering future
large-scale QCA designs.",
acknowledgement = ack-nhfb,
affiliation = "Almatrood, AF (Reprint Author), Wayne State Univ, Dept
Elect \& Comp Engn, Detroit, MI 48202 USA. Almatrood,
Amjad F.; Singh, Harpreet, Wayne State Univ, Dept Elect
\& Comp Engn, Detroit, MI 48202 USA.",
author-email = "amjad.almatrood@wayne.edu hsingh@eng.wayne.edu",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "arithmetic cell; Arithmetic processor; cellular
arrays; cellular automata; clock zones; Clocks; clocks;
CMOS logic circuits; CMOS technology; complementary
metal-oxide semiconductor technology; computer
architecture; Computer architecture; computer
arithmetic; control cell; Delays; equivalent majority
logic networks; generalized pipeline cellular array
design; large-scale QCA designs; Logic arrays; logic
design; Logic gates; majority logic; majority logic
synthesis method; Microprocessors; nanoelectronics;
nanotechnologies; pipeline array; Pipelines;
quantum-dot cellular automata (QCA); quantum-dot
cellular automata design",
number-of-cited-references = "16",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Almatrood:2018:DGP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Zha:2018:CRC,
author = "Yue Zha and Jing Li",
title = "{CMA}: a Reconfigurable Complex Matching Accelerator
for Wire-Speed Network Intrusion Detection",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "33--36",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2719023",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The rapid growth in network bandwidth and the ever
more sophisticated network attack techniques pose
challenges to current network intrusion detection
systems (NIDS). While software-based solutions are
incapable of performing wire-speed network traffic
monitoring, many hardware-based pattern matching
solutions also suffer from capacity limitation and high
power consumption. To effectively address these
challenges, we propose a reconfigurable complex
matching accelerator (CMA) enabled by the emerging
nonvolatile memory technology (resistive random access
memory) to speed up intrusion detection systems with
better energy efficiency. Beyond common equality
matching in current NIDS, CMA can be configured to
provide a comprehensive set of arithmetic matching
functions (e.g., less than), resulting in improved
utilization and higher energy efficiency. We evaluate
CMA using real-world network security benchmarks. On
average, it achieves 84.9 percent area reduction, 97.3
percent energy consumption reduction, and 20 percent
improvement in searching speed compared to the
SRAM-based Ternary Content Addressable Memory (TCAM)
design in state-of-the-art NIDS. It also outperforms
emerging RRAM-based TCAM (2.5T1R) design in area,
energy and search delay, on the set of evaluated
workloads.",
acknowledgement = ack-nhfb,
affiliation = "Zha, Y (Reprint Author), Univ Wisconsin, Elect \& Comp
Engn, Madison, WI 53706 USA. Zha, Yue; Li, Jing, Univ
Wisconsin, Elect \& Comp Engn, Madison, WI 53706 USA.",
author-email = "yzha3@wisc.edu jli587@wisc.edu",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "accelerator; arithmetic matching functions; CMA;
Computer architecture; computer network security;
computer networks; content-addressable storage;
Coprocessors; emerging nonvolatile memory technology;
Encoding; energy consumption reduction; higher energy
efficiency; intrusion detection; Intrusion detection;
IP networks; network bandwidth; network intrusion
detection systems; Network security; NIDS; pattern
matching; pattern matching solutions; Ports
(Computers); random-access storage; real-world network
security benchmarks; reconfigurable complex matching
accelerator; ReRAM; resistive random access memory;
security of data; sophisticated network attack
techniques; SRAM chips; TCAM; telecommunication
traffic; ternary content addressable memory design;
wire-speed network intrusion detection; wire-speed
network traffic monitoring",
keywords-plus = "PACKET CLASSIFICATION; MODEL",
number-of-cited-references = "15",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Zha:2018:CRC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Jung:2018:SMS,
author = "Myoungsoo Jung and Jie Zhang and Ahmed Abulila and
Miryeong Kwon and Narges Shahidi and John Shalf and Nam
Sung Kim and Mahmut Kandemir",
title = "{SimpleSSD}: Modeling Solid State Drives for Holistic
System Simulation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "37--41",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2750658",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Existing solid state drive (SSD) simulators
unfortunately lack hardware and/or software
architecture models. Consequently, they are far from
capturing the critical features of contemporary SSD
devices. More importantly, while the performance of
modern systems that adopt SSDs can vary based on their
numerous internal design parameters and storage-level
configurations, a full system simulation with
traditional SSD models often requires unreasonably long
runtimes and excessive computational resources. In this
work, we propose SimpleSSD, a high-fidelity simulator
that models all detailed characteristics of hardware
and software, while simplifying the nondescript
features of storage internals. In contrast to existing
SSD simulators, SimpleSSD can easily be integrated into
publicly-available full system simulators. In addition,
it can accommodate a complete storage stack and
evaluate the performance of SSDs along with diverse
memory technologies and microarchitectures. Thus, it
facilitates simulations that explore the full design
space at different levels of system abstraction.",
acknowledgement = ack-nhfb,
affiliation = "Jung, M (Reprint Author), Yonsei Univ, Comp
Architecture \& Memory Syst Lab, Seoul 03722, South
Korea. Jung, Myoungsoo; Zhang, Jie; Kwon, Miryeong,
Yonsei Univ, Comp Architecture \& Memory Syst Lab,
Seoul 03722, South Korea. Abulila, Ahmed; Kim, Nam
Sung, Univ Illinois, Champaign, IL 61820 USA. Shahidi,
Narges; Kandemir, Mahmut, Penn State Univ, State Coll,
PA 16801 USA. Shalf, John, Lawrence Berkeley Natl Lab,
Berkeley, CA 94720 USA.",
author-email = "m.jung@yonsei.ac.kr jie@yonsei.ac.kr
abulila2@illinois.edu mkwon@camelab.org nxs314@psu.edu
jshalf@lbl.gov nskim@illinois.edu
kandemir@cse.psu.edu",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NRF [2016R1C1B2015312]; Mem-Ray grant
[2015-11-1731]; US National Science Foundation
[1640196, 1439021, 1439057, 1409095, 1626251, 1629915,
1629129, 1526750]; SRC/NRC NERC [2016-NE-2697-A];
[IITP-2017-2017-0-01015]; [NRF-2015M3C4A7065645]; [DOE
DE-AC02-05CH 11231]",
funding-text = "This research is mainly supported by NRF
2016R1C1B2015312. This work is also supported in part
by IITP-2017-2017-0-01015, NRF-2015M3C4A7065645, DOE
DE-AC02-05CH 11231, and Mem-Ray grant (2015-11-1731).
Dr. Kim is supported in part by US National Science
Foundation 1640196 and SRC/NRC NERC 2016-NE-2697-A. Dr.
Kandemir is supported in part by US National Science
Foundation grants 1439021, 1439057, 1409095, 1626251,
1629915, 1629129 and 1526750.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "complete storage stack; computational modeling;
                 Computational modeling; computer architecture;
                 Computer architecture; contemporary SSD devices; flash
                 memories; Hardware; high-fidelity simulator; internal
                 design parameters; microprocessors; Microprocessors;
                 nondescript features; parallel processing; Parallel
                 processing; publicly-available full system simulators;
                 SimpleSSD; software; Software; solid state drive
                 simulators; SSD simulators; storage-level
                 configurations; system abstraction; system simulation;
                 systems simulation; Systems simulation",
number-of-cited-references = "14",
research-areas = "Computer Science",
researcherid-numbers = "Jung, Myoungsoo/F-4565-2019",
times-cited = "2",
unique-id = "Jung:2018:SMS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Chowdhury:2018:EMP,
author = "Zamshed Chowdhury and Jonathan D. Harms and S. Karen
Khatamifard and Masoud Zabihi and Yang Lv and Andrew P.
Lyle and Sachin S. Sapatnekar and Ulya R. Karpuzcu and
Jian-Ping Wang",
title = "Efficient In-Memory Processing Using Spintronics",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "42--46",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2751042",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "As the overhead of data retrieval becomes forbidding,
bringing processor logic to the memory where the data
reside becomes more energy-efficient. While traditional
CMOS structures are unsuited to the tight integration
of logic and memory, emerging spintronic technologies
show remarkable versatility. This paper introduces a
novel spintronics-based processing-in-memory (PIM)
framework called computational RAM (CRAM) to solve
data-intensive computing problems.",
acknowledgement = ack-nhfb,
affiliation = "Chowdhury, Z (Reprint Author), Univ Minnesota, Dept
Elect \& Comp Engn, Minneapolis, MN 55455 USA.
Chowdhury, Zamshed; Harms, Jonathan D.; Khatamifard, S.
Karen; Zabihi, Masoud; Lv, Yang; Lyle, Andrew P.;
Sapatnekar, Sachin S.; Karpuzcu, Ulya R.; Wang,
Jian-Ping, Univ Minnesota, Dept Elect \& Comp Engn,
Minneapolis, MN 55455 USA.",
author-email = "chowh005@umn.edu harms074@umn.edu khatami@umn.edu
zabih003@umn.edu lvxxx057@umn.edu
czamshediqbal@gmail.com sachin@umn.edu ukarpuzc@umn.edu
jpwang@umn.edu",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "DARPA Non-Volatile Logic program; NSF SPX
[1725420]; by C-SPIN, one of the six SRC STARnet
Centers; MARCO; DARPA",
funding-text = "This work is supported by DARPA Non-Volatile Logic
program, NSF SPX grant no. 1725420, and by C-SPIN, one
of the six SRC STARnet Centers, sponsored by MARCO and
DARPA. Chowdhury and Harms equally contributed to this
work.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Adders; computational RAM; CRAM; data retrieval;
data-intensive computing problems; Efficient In-Memory
Processing; energy-efficiency; Logic arrays; Logic
gates; Magnetic tunneling; magnetoelectronics; Memory
management; MRAM devices; MTJ; PIM framework;
processing-in-memory; processing-in-memory framework;
processor logic; Random access memory; spintronic
technologies; spintronics; STT-MRAM; traditional CMOS
structures",
keywords-plus = "UNIVERSAL MEMORY; LOGIC",
number-of-cited-references = "25",
ORCID-numbers = "Sapatnekar, Sachin/0000-0002-5353-2364",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Chowdhury:2018:EMP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Ajdari:2018:SHB,
author = "Mohammadamin Ajdari and Pyeongsu Park and Dongup Kwon
and Joonsung Kim and Jangwoo Kim",
title = "A Scalable {HW}-Based Inline Deduplication for {SSD}
Arrays",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "47--50",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2753258",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "SSD arrays are becoming popular in modern storage
servers as primary storage, and they aim to reduce
the high cost of the devices by performing inline
deduplications. Unfortunately, existing software-based
inline deduplications cannot achieve the devices'
maximum throughput due to their high CPU utilization
and power overhead. A recently proposed approach to
perform device-wide deduplications inside each SSD can
distribute the CPU overhead among multiple SSDs, but it
also suffers from severely decreasing deduplication
opportunities with the increasing number of SSDs
deployed per node. Therefore, we propose a node-wide
deduplication engine that relies on specialized
hardware to perform two key steps of deduplication:
data signature generation and table management. Our
FPGA-based prototype detects all duplicates, and
compared to software-based inline deduplication, it
reduces the overall CPU utilization and power
consumption by 93.6 percent and approximately 20 percent,
respectively, for a slow baseline, and by more for faster
baselines.",
acknowledgement = ack-nhfb,
affiliation = "Kim, J (Reprint Author), Seoul Natl Univ, Dept Elect
\& Comp Engn, Seoul 08826, South Korea. Ajdari,
Mohammadamin, POSTECH, Dept Comp Sci \& Engn, Pohang
37673, South Korea. Park, Pyeongsu; Kwon, Dongup; Kim,
Joonsung; Kim, Jangwoo, Seoul Natl Univ, Dept Elect \&
Comp Engn, Seoul 08826, South Korea.",
author-email = "majdari@postech.ac.kr pyeongsu@snu.ac.kr
dongup@snu.ac.kr joonsung90@snu.ac.kr
jangwoo@snu.ac.kr",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Basic Science Research Program through the
National Research Foundation of Korea (NRF) ---
Ministry of Science, ICT \& Future Planning
[NRF-2015M3C4A7065647, NRF-2017R1A2B3011038]; Institute
for Information \& communications Technology Promotion
(IITP) grant --- Korea government (MSIT)
[R0190-15-2012]",
funding-text = "This work was partly supported by Basic Science
Research Program through the National Research
Foundation of Korea (NRF) funded by the Ministry of
Science, ICT \& Future Planning (NRF-2015M3C4A7065647,
NRF-2017R1A2B3011038), and Institute for Information \&
communications Technology Promotion (IITP) grant funded
by the Korea government (MSIT) (No. R0190-15-2012).
Mohammadamin Ajdari and Pyeongsu Park contributed
equally to this work.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "CPU overhead; CPU utilization; data handling; data
integrity; deduplication; deduplication opportunities;
device-wide deduplications; Engines; field programmable
gate arrays; file servers; flash memories; FPGA;
FPGA-based prototype; Hardware; inline deduplication;
modern storage servers; node-wide deduplication engine;
Performance evaluation; power consumption; Power
demand; power overhead; primary storage; Random access
memory; Servers; software-based inline deduplications;
SSD; SSD arrays; storage management; Storage server;
Throughput",
number-of-cited-references = "10",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Ajdari:2018:SHB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Hoseinzadeh:2018:FBS,
author = "Morteza Hoseinzadeh",
title = "Flow-Based Simulation Methodology",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "51--54",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2756051",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper presents flow-based simulation, a new
methodology for evaluating novel and intricate computer
system designs. The main idea of flow-based simulation
is to keep the history of every simulated memory
element, instead of its latest value, to make it time
bonded so that sliding the time forward and backward
changes the state of the system accordingly. Having
this opportunity, new architectural designs can be
evaluated in terms of timing and energy by implementing
only a functional simulation. Due to serial execution,
the process of the design in a flow-based simulation is
traceable and easy to understand. As a result,
compared with cycle-driven and event-driven
techniques, complicated algorithms can be evaluated
much more easily. Flow-based simulation eases the
burden of timing simulation, and consequently leads
to shorter development and simulation times.",
acknowledgement = ack-nhfb,
affiliation = "Hoseinzadeh, M (Reprint Author), Univ Calif San Diego,
Dept Comp Sci \& Engn, La Jolla, CA 92093 USA.
Hoseinzadeh, Morteza, Univ Calif San Diego, Dept Comp
Sci \& Engn, La Jolla, CA 92093 USA.",
author-email = "mhoseinzadeh@cs.ucsd.edu",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computational modeling; Computer architectural
simulator; Concurrent computing; cycle-driven
techniques; digital simulation; event-driven
techniques; flow-based simulation; flow-based
simulation methodology; functional simulation; History;
Integrated circuit modeling; Interference; intricate
computer system designs; simulated memory element;
simulation methodologies; Timing; timing simulation;
Tools",
keywords-plus = "FULL-SYSTEM",
number-of-cited-references = "12",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Hoseinzadeh:2018:FBS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Eyerman:2018:MSC,
author = "Stijn Eyerman and Wim Heirman and Kristof {Du Bois}
and Ibrahim Hur",
title = "Multi-Stage {CPI} Stacks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "55--58",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2761751",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "CPI stacks are an intuitive way to visualize processor
core performance bottlenecks. However, they often do
not provide a full view on all bottlenecks, because
stall events can occur concurrently. Typically one of
the events is selected, which means information about
the non-chosen stall events is lost. Furthermore, we
show that there is no single correct CPI stack: stall
penalties can be hidden, can overlap or can cause
second-order effects, making total CPI more complex
than just a sum of components. Instead of showing a
single CPI stack, we propose to measure multiple CPI
stacks during program execution: a CPI stack at each
stage of the processor pipeline. This representation
reveals all performance bottlenecks and provides a more
complete view on the performance of an application.
Multi-stage CPI stacks are easy to collect, which means
that they can be included in a simulator with
negligible slowdown, and that they can be included in
the core hardware with limited overhead.",
acknowledgement = ack-nhfb,
affiliation = "Eyerman, S (Reprint Author), Intel Corp, Santa Clara,
CA 95054 USA. Eyerman, Stijn; Heirman, Wim; Du Bois,
Kristof; Hur, Ibrahim, Intel Corp, Santa Clara, CA
95054 USA.",
author-email = "Stijn.Eyerman@intel.com Wim.Heirman@intel.com
Kristof.Du.Bois@intel.com Ibrahim.Hur@intel.com",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "1/f noise; Additives; CPI stacks; Hardware;
microprocessor chips; multiple CPI stacks; multistage
CPI stacks; Performance analysis; performance counters;
performance evaluation; Performance gain; pipeline
processing; Pipelines; processor core performance
bottlenecks; processor pipeline; program execution;
Proposals; Radiation detectors; single correct CPI
stack; stall events; stall penalties; total CPI",
number-of-cited-references = "7",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Eyerman:2018:MSC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Zhang:2018:LHC,
author = "Guowei Zhang and Daniel Sanchez",
title = "Leveraging Hardware Caches for Memoization",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "59--63",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2762308",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Memoization improves performance and saves energy by
caching and reusing the outputs of repetitive
computations. Prior work has proposed software and
hardware memoization techniques, but both have
significant drawbacks. Software memoization suffers
from high runtime overheads, and is thus limited to
long computations. Conventional hardware memoization
techniques achieve low overheads and can memoize short
functions, but they rely on large, special-purpose
memoization caches that waste significant area and
energy. We propose MCACHE, a hardware technique that
leverages data caches for memoization. MCACHE stores
memoization tables in memory, and allows them to share
cache capacity with normal program data. MCACHE
introduces ISA and pipeline extensions to accelerate
memoization operations, bridging the gap between
software and conventional hardware techniques.
Simulation results show that MCACHE improves
performance by up to 21x, outperforms software
memoization by up to 2.2x, and achieves similar or
superior performance over conventional hardware
techniques without any dedicated storage.",
acknowledgement = ack-nhfb,
affiliation = "Sanchez, D (Reprint Author), MIT CSAIL, Cambridge, MA
02139 USA. Zhang, Guowei; Sanchez, Daniel, MIT CSAIL,
Cambridge, MA 02139 USA.",
author-email = "zhanggw@csail.mit.edu sanchez@csail.mit.edu",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "C-FAR, one of six SRC STAR-net centers by
MARCO; C-FAR, one of six SRC STAR-net centers by DARPA;
NSF [CAREER-1452994]",
funding-text = "This work was supported in part by C-FAR, one of six
SRC STAR-net centers by MARCO and DARPA, and by NSF
grant CAREER-1452994.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Acceleration; Benchmark testing; cache capacity; cache
storage; caches; Computer architecture; data caches;
energy by caching; Hardware; hardware caches; Indexes;
MCACHE; memoization; memoization operations;
memoization tables; memory systems; power aware
computing; Registers; runtime overheads; Semantics;
Software; software memoization suffers; special-purpose
memoization caches",
number-of-cited-references = "17",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Zhang:2018:LHC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Vakil-Ghahani:2018:CRP,
author = "Armin Vakil-Ghahani and Sara Mahdizadeh-Shahri and
Mohammad-Reza Lotfi-Namin and Mohammad Bakhshalipour
and Pejman Lotfi-Kamran and Hamid Sarbazi-Azad",
title = "Cache Replacement Policy Based on Expected Hit Count",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "64--67",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2762660",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Memory-intensive workloads operate on massive amounts
of data that cannot be captured by last-level caches
(LLCs) of modern processors. Consequently, processors
encounter frequent off-chip misses, and hence, lose
significant performance potential. One of the
components of a modern processor that has a prominent
influence on the off-chip miss traffic is LLC's
replacement policy. Existing processors employ a
variation of least recently used (LRU) policy to
determine the victim for replacement. Unfortunately,
there is a large gap between what LRU offers and what
Belady's MIN, the optimal replacement policy, can
achieve. Belady's MIN requires selecting the victim with
the longest reuse distance, and hence, is infeasible
because it requires knowledge of the future. In this work,
we observe that there exists a strong correlation
between the expected number of hits of a cache block
and the reciprocal of its reuse distance. Taking
advantage of this observation, we improve the
efficiency of last-level caches through a
low-cost-yet-effective replacement policy. We suggest a
hit-count based victim-selection procedure on top of
existing low-cost replacement policies to significantly
improve the quality of victim selection in last-level
caches without commensurate area overhead. Our proposal
offers 12.2 percent performance improvement over the
baseline LRU in a multi-core processor and outperforms
EVA, which is the state-of-the-art replacement
policy.",
acknowledgement = ack-nhfb,
affiliation = "Bakhshalipour, M (Reprint Author), Sharif Univ
Technol, Dept Comp Engn, Tehran 1115511365, Iran.
Vakil-Ghahani, Armin; Mahdizadeh-Shahri, Sara;
Lotfi-Namin, Mohammad-Reza; Bakhshalipour, Mohammad;
Sarbazi-Azad, Hamid, Sharif Univ Technol, Dept Comp
Engn, Tehran 1115511365, Iran. Lotfi-Kamran, Pejman;
Sarbazi-Azad, Hamid, Inst Res Fundamental Sci IPM, Sch
Comp Sci, Tehran 1953833511, Iran.",
author-email = "vakil@ce.sharif.edu smahdizadeh@ce.sharif.edu
mrlotfi@ce.sharif.edu bakhshalipour@ce.sharif.edu
plotfi@ipm.ir azad@sharif.edu",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Belady's MIN; cache block; cache replacement policy;
cache storage; Correlation; expected hit count;
History; hit-count based victim-selection procedure;
last-level cache; last-level caches; longest reuse
distance; low-cost replacement policies;
low-cost-yet-effective replacement policy; Memory
system; memory-intensive workload; memory-intensive
workloads; Multicore processing; multicore processor;
multiprocessing systems; off-chip miss traffic;
off-chip misses; optimal replacement policy;
performance evaluation; performance improvement;
Prefetching; Proposals; Radiation detectors;
replacement policy; victim selection",
keywords-plus = "PREDICTION",
number-of-cited-references = "16",
ORCID-numbers = "Vakil Ghahani, Seyed Armin/0000-0002-4365-8932",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Vakil-Ghahani:2018:CRP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Hadjilambrou:2018:SCV,
author = "Zacharias Hadjilambrou and Shidhartha Das and Marco A.
Antoniades and Yiannakis Sazeides",
title = "Sensing {CPU} Voltage Noise Through Electromagnetic
Emanations",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "68--71",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2766221",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This work proposes sensing CPU voltage noise through
wireless electromagnetic (EM) emanations from the CPU.
Compared to previous voltage monitoring methodologies,
this approach is not intrusive as it does not require
direct physical access to the monitored CPU. To prove
the effectiveness of this approach, we use EM signal
feedback to find the resonant frequency of the CPU
power delivery network, and to generate a CPU voltage
noise (dI/dt) virus. This study is performed on a
modern out-of-order CPU that supports on-chip fine
grain voltage monitoring. This on-chip voltage
monitoring capability is used to validate the proposed
EM methodology.",
acknowledgement = ack-nhfb,
affiliation = "Hadjilambrou, Z (Reprint Author), Univ Cyprus, CY-1678
Nicosia, Cyprus. Hadjilambrou, Zacharias; Antoniades,
Marco A.; Sazeides, Yiannakis, Univ Cyprus, CY-1678
Nicosia, Cyprus. Das, Shidhartha, ARM, Cambridge CB1
9NJ, England.",
author-email = "zhadji01@cs.ucy.ac.cy Shidhartha.Das@arm.com
mantonia@ucy.ac.cy yanos@cs.ucy.ac.cy",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "European Union Horizon 2020 project
Uniserver [688540]; University of Cyprus",
funding-text = "This work is partially supported by European Union
Horizon 2020 project Uniserver grant no. 688540 and the
University of Cyprus.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "circuit resonance; CPU power delivery network; CPU
voltage noise virus; electromagnetic emanations; EM
signal feedback; Frequency measurement; Genetic
algorithms; Hardware reliability; microprocessor chips;
Monitoring; on-chip fine grain voltage monitoring;
on-chip voltage monitoring capability; Resonant
frequency; RLC circuits; Stress; stress tests;
System-on-chip; voltage noise; voltage regulators;
wireless electromagnetic emanations",
number-of-cited-references = "19",
ORCID-numbers = "Antoniades, Marco/0000-0002-9699-2387",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Hadjilambrou:2018:SCV",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Jung:2018:PCU,
author = "Daejin Jung and Sunjung Lee and Wonjong Rhee and Jung
Ho Ahn",
title = "Partitioning Compute Units in {CNN} Acceleration for
Statistical Memory Traffic Shaping",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "72--75",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2773055",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Convolutional Neural Networks (CNNs) have become the
default choice for processing visual information, and
the design complexity of CNNs has been steadily
increasing to improve accuracy. To cope with the
massive amount of computation needed for such complex
CNNs, the latest solutions utilize blocking of an image
over the available dimensions (e.g., horizontal,
vertical, channel, and kernel) and batching of multiple
input images to improve data reuse in the memory
hierarchy. While there has been a large collection of
works on maximizing data reuse, only a few studies have
focused on the memory bottleneck problem caused by
limited bandwidth. A bandwidth bottleneck can easily
occur in CNN acceleration as CNN layers have different
sizes with varying computation needs and as batching is
typically performed over each layer of CNN for an ideal
data reuse. In this case, the data transfer demand for
a layer can be relatively low or high compared to the
computation requirement of the layer, and therefore
temporal fluctuations in memory access can be induced,
eventually causing bandwidth problems. In this paper,
we first show that there exists a high degree of
fluctuation in memory access to computation ratio
depending on CNN layers and functions in the layer
being processed by the compute units (cores), where the
compute units are tightly synchronized to maximize data
reuse. Then we propose a strategy of partitioning the
compute units where the cores within each partition
process a batch of input data in a synchronous manner
to maximize data reuse but different partitions run
asynchronously. Because the partitions stay
asynchronous and typically process different CNN layers
at any given moment, the memory access traffic sizes of
the partitions become statistically shuffled. Thus, the
partitioning of compute units and asynchronous use of
them make the total memory access traffic size be
smoothened over time, and the degree of partitioning
determines a tradeoff between data reuse efficiency and
memory bandwidth utilization efficiency. We call this
smoothing statistical memory traffic shaping, and we
show that it can lead to an 8.0 percent performance
gain on a commercial 64-core processor when running
ResNet-50.",
acknowledgement = ack-nhfb,
affiliation = "Rhee, W; Ahn, JH (Reprint Author), Seoul Natl Univ,
Dept Transdisciplinary Studies, Seoul 151742, South
Korea. Jung, Daejin; Lee, Sunjung; Rhee, Wonjong; Ahn,
Jung Ho, Seoul Natl Univ, Dept Transdisciplinary
Studies, Seoul 151742, South Korea.",
author-email = "haijd@snu.ac.kr shiish@snu.ac.kr wrhee@snu.ac.kr
gajh@snu.ac.kr",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Research Foundation of Korea grant
--- Korea government [NRF-2017R1A2B2005416,
NRF-2017R1E1A1A03070560]",
funding-text = "This work was partially supported by the National
Research Foundation of Korea grant funded by the Korea
government (NRF-2017R1A2B2005416 and
NRF-2017R1E1A1A03070560).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Acceleration; Bandwidth; bandwidth bottleneck;
bandwidth problems; CNN; CNN acceleration; CNN layers;
complex CNNs; compute units; computation requirement;
Computational modeling; Computer architecture;
Convolution; convolutional neural networks; data
transfer demand; horizontal channel; ideal data reuse;
image processing; Kernel; maximize data reuse; memory
access traffic sizes; memory bandwidth utilization
efficiency; memory bottleneck; memory bottleneck
problem; memory hierarchy; microprocessor chips;
multiprocessing systems; neural nets; Neural networks;
parallel processing; partitioning; partitioning compute
units; smoothing statistical memory traffic shaping;
traffic shaping; vertical channel",
number-of-cited-references = "16",
oa = "Bronze",
ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394 Rhee,
Wonjong/0000-0002-2590-8774",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Jung:2018:PCU",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{SanMiguel:2018:EMA,
author = "Joshua {San Miguel} and Karthik Ganesan and Mario Badr
and Natalie {Enright Jerger}",
title = "The {EH} Model: Analytical Exploration of
Energy-Harvesting Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "76--79",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2777834",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Energy-harvesting devices-which operate solely on
energy collected from their environment-have brought
forth a new paradigm of intermittent computing. These
devices succumb to frequent power outages that would
cause conventional systems to be stuck in a perpetual
loop of restarting computation and never making
progress. Ensuring forward progress in an intermittent
execution model is difficult and requires saving state
in non-volatile memory. In this work, we propose the EH
model to explore the trade-offs associated with backing
up data to maximize forward progress. In particular, we
focus on the relationship between energy and forward
progress and how they are impacted by backups/restores
to derive insights for programmers and architects.",
acknowledgement = ack-nhfb,
affiliation = "San Miguel, J (Reprint Author), Univ Toronto, Edward S
Rogers Sr Dept Elect \& Comp Engn, Toronto, ON M5S,
Canada. San Miguel, Joshua; Ganesan, Karthik; Badr,
Mario; Jerger, Natalie Enright, Univ Toronto, Edward S
Rogers Sr Dept Elect \& Comp Engn, Toronto, ON M5S,
Canada.",
author-email = "joshua.sanmiguel@mail.utoronto.ca
karthik.ganesan@mail.utoronto.ca
mario.badr@mail.utoronto.ca enright@ece.utoronto.ca",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "analytical exploration; analytical model; Analytical
models; Computational modeling; Computer architecture;
conventional systems; EH model; energy harvesting;
Energy-harvesting; energy-harvesting architectures;
energy-harvesting devices; forward progress; frequent
power outages; intermittent computing; intermittent
execution model; Mathematical model; Nonvolatile
memory; nonvolatile memory; perpetual loop; power aware
computing; Power system reliability; random-access
storage",
number-of-cited-references = "11",
ORCID-numbers = "Ganesan, Karthik/0000-0002-2541-1549",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Miguel:2018:EMA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kim:2018:SPM,
author = "Jihun Kim and Joonsung Kim and Pyeongsu Park and Jong
Kim and Jangwoo Kim",
title = "{SSD} Performance Modeling Using Bottleneck Analysis",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "80--83",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2779122",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Solid-State Drives (SSDs) are widely deployed for high
throughput and low latency. However, the unpredictable
access latency of SSDs makes it difficult to satisfy
quality-of-service requirements and fully achieve the
performance potential. In fact, it has been a
fundamental challenge to accurately predict the access
latency of modern SSDs performing many non-disclosed,
device-specific intra-SSD optimizations. In this paper,
we propose SSDcheck, a novel SSD performance model
which accurately predicts the latency of future SSD
accesses. After first identifying write buffer (WB) and
garbage collection (GC) as the key components in
modeling the access latency, we develop diagnosis
snippets to identify the target SSD's critical intra-SSD
parameters (e.g., WB size). Finally, we construct the
SSD's access-latency model with the identified
parameters. Our system-level evaluations using five
commodity SSDs show that SSDcheck achieves up to 93
percent prediction accuracy. Our real-world prototype
applying SSDcheck-aware system-level request
scheduling can significantly improve both throughput
and tail latency by up to 2.1x and 1.46x,
respectively.",
acknowledgement = ack-nhfb,
affiliation = "Kim, J (Reprint Author), Seoul Natl Univ, Dept Elect
\& Comp Engn, Seoul 151742, South Korea. Kim, Jihun;
Kim, Jong, POSTECH, Dept Comp Sci \& Engn, Pohang
37673, Gyeongbuk, South Korea. Kim, Joonsung; Park,
Pyeongsu; Kim, Jangwoo, Seoul Natl Univ, Dept Elect \&
Comp Engn, Seoul 151742, South Korea.",
author-email = "jihun735@postech.ac.kr jkim@postech.ac.kr
pyeongsu@snu.ac.kr joonsung90@snu.ac.kr
jangwoo@snu.ac.kr",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Basic Science Research Program through the
National Research Foundation of Korea (NRF) ---
Ministry of Science, ICT \& Future Planning
[NRF-2015M3C4A7065647, NRF-2017R1A2B3011038]; Institute
for Information \& communications Technology Promotion
(IITP) grant --- Korea government (MSIT)
[R0190-15-2012]",
funding-text = "This work was partly supported by Basic Science
Research Program through the National Research
Foundation of Korea (NRF) funded by the Ministry of
Science, ICT \& Future Planning (NRF-2015M3C4A7065647,
NRF-2017R1A2B3011038), and Institute for Information \&
communications Technology Promotion (IITP) grant funded
by the Korea government (MSIT) (No. R0190-15-2012).
Jihun Kim and Joonsung Kim contributed equally to
this work.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "bottleneck analysis; cache storage; commodity SSD;
critical intra-SSD parameters; device-specific intra-SSD
optimizations; Engines; Feature extraction; flash
memories; future SSD accesses; garbage collection;
identified parameters; Interference; Monitoring;
Predictive models; quality-of-service requirements;
Resource management; scheduling; solid-state drives;
SSD access-latency model; SSDcheck-aware system-level
request scheduling; SSD performance model; SSD
performance modeling; storage management; Throughput;
unpredictable access latency",
number-of-cited-references = "10",
ORCID-numbers = "Kim, Jihun/0000-0001-8893-8447",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Kim:2018:SPM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Angstadt:2018:MOS,
author = "Kevin Angstadt and Jack Wadden and Vinh Dang and Ted
Xie and Dan Kramp and Westley Weimer and Mircea Stan
and Kevin Skadron",
title = "{MNCaRT}: an Open-Source, Multi-Architecture
Automata-Processing Research and Execution Ecosystem",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "84--87",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2780105",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/gnu.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We present MNCaRT, a comprehensive software ecosystem
for the study and use of automata processing across
hardware platforms. Tool support includes manipulation
of automata, execution of complex machines, high-speed
processing of NFAs and DFAs, and compilation of regular
expressions. We provide engines to execute automata on
CPUs (with VASim and Intel Hyperscan), GPUs (with
custom DFA and NFA engines), and FPGAs (with an HDL
translator). We also introduce MNRL, an open-source,
general-purpose and extensible state machine
representation language developed to support MNCaRT.
The representation is flexible enough to support
traditional finite automata (NFAs, DFAs) while also
supporting more complex machines, such as those which
propagate multi-bit signals between processing
elements. We hope that our ecosystem and representation
language stimulate new efforts to develop efficient
and specialized automata processing applications.",
acknowledgement = ack-nhfb,
affiliation = "Angstadt, K (Reprint Author), Univ Michigan, Comp Sci
\& Engn Div, Dept Elect Engn \& Comp Sci, Ann Arbor, MI
48109 USA. Angstadt, Kevin; Weimer, Westley, Univ
Michigan, Comp Sci \& Engn Div, Dept Elect Engn \& Comp
Sci, Ann Arbor, MI 48109 USA. Wadden, Jack; Dang, Vinh;
Xie, Ted; Kramp, Dan; Stan, Mircea; Skadron, Kevin,
Univ Virginia, Dept Comp Sci, Charlottesville, VA 22904
USA.",
author-email = "angstadt@umich.edu wadden@virginia.edu
vqd8a@virginia.edu ted.xie@virginia.edu
dankramp@virginia.edu weimerw@umich.edu
mircea@virginia.edu skadron@virginia.edu",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "US National Science Foundation
[CCF-1116673, CCF-1629450, CCF-1619123, CNS-1619098];
AFRL [FA8750-15-2-0075]; Jefferson Scholars Foundation;
Achievement Rewards for College Scientists (ARCS)
Foundation; Xilinx; C-FAR, one of six centers of
STARnet; Semiconductor Research Corporation program -
MARCO; DARPA",
funding-text = "This work was supported in part by grants from the US
National Science Foundation (CCF-1116673, CCF-1629450,
CCF-1619123, CNS-1619098), AFRL (FA8750-15-2-0075),
Jefferson Scholars Foundation, Achievement Rewards for
College Scientists (ARCS) Foundation, a grant from
Xilinx, and support from C-FAR, one of six centers of
STARnet, a Semiconductor Research Corporation program
sponsored by MARCO and DARPA. Any opinions, findings
and conclusions or recommendations expressed in this
material are those of the authors and do not
necessarily reflect the views of AFRL.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "accelerator architectures; Automata; Benchmark
testing; complex machines; comprehensive software
ecosystem; DFA; Ecosystems; efficient automata
processing applications; Engines; extensible state
machine representation language; Field programmable
gate arrays; field programmable gate arrays; finite
automata; finite state machines; formal languages;
hardware platforms; high-speed processing; Intel
Hyperscan; MNCaRT; NFA engines; open source software;
Open source software; open source software;
open-source, multi-architecture automata-processing
research; software tools; specialized automata
processing applications; Tools; traditional finite
automata",
number-of-cited-references = "21",
ORCID-numbers = "Angstadt, Kevin/0000-0002-0104-5257",
research-areas = "Computer Science",
researcherid-numbers = "Stan, Mircea/L-6219-2019",
times-cited = "2",
unique-id = "Angstadt:2018:MOS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Zheng:2018:EPE,
author = "Hao Zheng and Ahmed Louri",
title = "{EZ-Pass}: an Energy \& Performance-Efficient
Power-Gating Router Architecture for Scalable {NoCs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "88--91",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2783918",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "With technology scaling into nanometer regime, static
power is becoming the dominant factor in the overall
power consumption of Network-on-Chips (NoCs). Static
power can be reduced by powering off routers during
consecutive idle time through power-gating techniques.
However, power-gating techniques suffer from a large
wake-up latency to wake up the powered-off routers.
Recent research aims to improve the wake-up latency
penalty by hiding it through early wake-up techniques.
However, these techniques do not exploit the full
advantage of power-gating due to the early wake-up.
Consequently, they do not achieve significant power
savings. In this paper, we propose an architecture
called Easy Pass (EZ-Pass) router that remedies the
large wake-up latency overheads while providing
significant static power savings. The proposed
architecture takes advantage of idle resources in the
network interface to transmit packets without waking up
the router. Additionally, the technique hides the
wake-up latency by continuing to provide packet
transmission during the wake-up phase. We use full
system simulation to evaluate our EZ-Pass router on a
64-core NoC with a mesh topology using PARSEC benchmark
suites. Our results show that the proposed router
reduces static power by up to 31 percent and overall
network latency by up to 32 percent as compared to
early-wakeup optimized power-gating techniques.",
acknowledgement = ack-nhfb,
affiliation = "Zheng, H (Reprint Author), George Washington Univ,
Dept Elect \& Comp Engn, Washington, DC 20052 USA.
Zheng, Hao; Louri, Ahmed, George Washington Univ, Dept
Elect \& Comp Engn, Washington, DC 20052 USA.",
author-email = "haozheng@gwu.edu louri@gwu.edu",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; early-wakeup optimized
power-gating techniques; easy pass router; energy
conservation; energy-efficient; energy-efficient
power-gating router architecture; EZ-Pass router;
Latches; mesh topology; network interface; network
routing; network-on-chip; network-on-chips;
Nickel; NoC; PARSEC benchmark suites;
performance-efficient power-gating router architecture;
Ports (Computers); power consumption; Power-gating;
Routing; Routing protocols; scalable NoCs; static power
savings; Switches; wake-up latency overheads; wake-up
latency penalty; wake-up phase",
keywords-plus = "ON-CHIP",
number-of-cited-references = "16",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Zheng:2018:EPE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Delshadtehrani:2018:NPM,
author = "Leila Delshadtehrani and Schuyler Eldridge and
Sadullah Canakci and Manuel Egele and Ajay Joshi",
title = "{Nile}: a Programmable Monitoring Coprocessor",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "92--95",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2784416",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Researchers widely employ hardware performance
counters (HPCs) as well as debugging and profiling
tools in processors for monitoring different events
such as cache hits, cache misses, and branch prediction
statistics during the execution of programs. The
collected information can be used for power,
performance, and thermal management of the system as
well as detecting anomalies or malicious behavior in
the software. However, monitoring new or complex events
using HPCs and existing tools is a challenging task
because HPCs only provide a fixed pool of raw events to
monitor. To address this challenge, we propose the
implementation of a programmable hardware monitor in a
complete system framework including the hardware
monitor architecture and its interface with an in-order
single-issue RISC-V processor as well as an operating
system. As a proof of concept, we demonstrate how to
programmatically implement a shadow stack using our
hardware monitor and how the programmed shadow stack
detects stack buffer overflow attacks. Our hardware
monitor design incurs a 26 percent power overhead and a
15 percent area overhead over an unmodified RISC-V
processor. Our programmed shadow stack has less than 3
percent performance overhead in the worst case.",
acknowledgement = ack-nhfb,
affiliation = "Delshadtehrani, L (Reprint Author), Boston Univ, Dept
Elect \& Comp Engn, Boston, MA 02215 USA.
Delshadtehrani, Leila; Eldridge, Schuyler; Canakci,
Sadullah; Egele, Manuel; Joshi, Ajay, Boston Univ, Dept
Elect \& Comp Engn, Boston, MA 02215 USA.",
author-email = "delshad@bu.edu schuye@bu.edu scanakci@bu.edu
megele@bu.edu joshi@bu.edu",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CCF-1533663]",
funding-text = "We thank Prof. Jonathan Appavoo for providing
invaluable help in designing the OS support and the
software interface for Nile. This work was supported in
part by NSF grant CCF-1533663.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "branch prediction statistics; cache hits; cache
misses; cache storage; complete system framework;
complex events; coprocessors; Coprocessors; debugging;
fixed pool; Hardware; Hardware coprocessor; hardware
monitor architecture; hardware monitor design; hardware
performance counters; HPCs; Linux; malicious behavior;
Monitoring; Nile; operating system; operating systems
(computers); Pattern matching; performance evaluation;
performance overhead; power overhead; profiling tools;
Program processors; programmable hardware; programmable
hardware monitor; programmable monitoring coprocessor;
programmed shadow stack; raw events; reduced
instruction set computing; Rockets; security; shadow
stack; single-issue RISC-V processor; stack buffer
overflow attack; stack buffer overflow attacks; thermal
management; unmodified RISC-V processor",
number-of-cited-references = "17",
ORCID-numbers = "Joshi, AJay/0000-0002-3256-9942",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Delshadtehrani:2018:NPM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Lee:2018:TTW,
author = "Eojin Lee and Sukhan Lee and G. Edward Suh and Jung Ho
Ahn",
title = "{TWiCe}: Time Window Counter Based Row Refresh to
Prevent Row-Hammering",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "1",
pages = "96--99",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2787674",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Computer systems using DRAM are exposed to
row-hammering attacks, which can flip data in a DRAM
row without directly accessing the row but by frequently
activating its adjacent ones. There have been a number
of proposals to prevent row-hammering, but they either
incur large area/performance overhead or provide
probabilistic protection. In this paper, we propose a
new row-hammering mitigation mechanism named Time
Window Counter based row refresh (TWiCe) which prevents
row-hammering by using a small number of counters
without performance overhead. We first make a key
observation that the number of rows that can cause
their adjacent rows to flip (aggressor candidates) is
limited by the maximum values of row activation
frequency and DRAM cell retention time. TWiCe exploits
this limit to reduce the required number of counter
entries by counting only actually activated DRAM rows
and periodically invalidating the entries that are not
activated frequently enough to be an aggressor. We
calculate the maximum number of required counter
entries per DRAM bank, with which row-hammering
prevention is guaranteed. We further improve energy
efficiency by adopting a pseudo-associative cache
design to TWiCe. Our analysis shows that TWiCe incurs
no performance overhead on normal DRAM operations and
less than 0.7 percent area and energy overheads over
contemporary DRAM devices.",
acknowledgement = ack-nhfb,
affiliation = "Lee, E; Ahn, JH (Reprint Author), Seoul Natl Univ,
Seoul 151742, South Korea. Lee, Eojin; Lee, Sukhan;
Ahn, Jung Ho, Seoul Natl Univ, Seoul 151742, South
Korea. Suh, G. Edward, Cornell Univ, Ithaca, NY 14850
USA.",
author-email = "yohoyo@snu.ac.kr infy1026@snu.ac.kr
suh@csl.cornell.edu gajh@snu.ac.kr",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NRF of Korea [NRF-2017R1A2B2005416]; R\&D
program of MOTIE/KEIT [10077609]; IDEC (EDA tool)",
funding-text = "This work was partially supported by the NRF of Korea
grant (NRF-2017R1A2B2005416), by the R\&D program of
MOTIE/KEIT (10077609), and by IDEC (EDA tool).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache storage; Computer architecture; DRAM; DRAM cell
retention time; DRAM chips; DRAM row; energy
efficiency; Microprocessors; Monitoring; performance
overhead; Probabilistic logic; pseudoassociative cache
design; Random access memory; refresh; reliability; row
activation frequency; row-hammering; row-hammering
attacks; row-hammering mitigation mechanism;
row-hammering prevention; time window counter based row
refresh; Time-frequency analysis; TWiCe",
keywords-plus = "MEMORY",
number-of-cited-references = "15",
ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394 Suh,
Edward/0000-0001-6409-9888",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Lee:2018:TTW",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Rakshit:2018:LLO,
author = "Joydeep Rakshit and Kartik Mohanram",
title = "{LEO}: Low Overhead Encryption {ORAM} for Non-Volatile
Memories",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "100--104",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2795621",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Data confidentiality attacks utilizing memory access
patterns threaten exposure of data in modern main
memories. Oblivious RAM (ORAM) is an effective
cryptographic primitive developed to thwart
access-pattern-based attacks in DRAM-based systems.
However, in emerging non-volatile memory (NVM) systems,
the increased writes due to encryption of multiple data
blocks on every Path ORAM (state-of-the-art efficient
ORAM) access impose significant energy, lifetime, and
performance overheads. LEO (Low overhead Encryption
ORAM) is an efficient Path ORAM encryption architecture
that addresses the high write overheads of ORAM
integration in NVMs, while providing security
equivalent to the baseline Path ORAM. LEO reduces NVM
cell writes by securely decreasing the number of block
encryptions during the write phase of a Path ORAM
access. LEO uses a secure, two-level counter mode
encryption framework that opportunistically eliminates
re-encryption of unmodified blocks, reducing NVM
writes. Our evaluations show that on average, LEO
decreases NVM energy by 60 percent, improves lifetime
by 1.51x, and increases performance by 9 percent over
the baseline Path ORAM.",
acknowledgement = ack-nhfb,
affiliation = "Rakshit, J (Reprint Author), Univ Pittsburgh, Dept
Elect \& Comp Engn, Pittsburgh, PA 15260 USA. Rakshit,
Joydeep; Mohanram, Kartik, Univ Pittsburgh, Dept Elect
\& Comp Engn, Pittsburgh, PA 15260 USA.",
author-email = "joydeep.rakshit@pitt.edu kmram@pitt.edu",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "baseline path ORAM; block encryptions; Computer
architecture; cryptography; data confidentiality
attacks; DRAM chips; efficient path ORAM encryption
architecture; emerging nonvolatile memory systems;
Encryption; LEO; low-overhead encryption ORAM; memory
access patterns; memory security; multiple data blocks;
non-volatile memory; nonvolatile memories; Nonvolatile
memory; NVM; Oblivious RAM; ORAM integration; path ORAM
access; Random access memory; random-access storage;
System-on-chip; two-level counter mode encryption
framework",
number-of-cited-references = "21",
ORCID-numbers = "Rakshit, Joydeep/0000-0002-3670-4814",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Rakshit:2018:LLO",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Do:2018:CRL,
author = "Sang Wook Stephen Do and Michel Dubois",
title = "Core Reliability: Leveraging Hardware Transactional
Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "105--108",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2791433",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Modern microprocessors are more vulnerable to
transient faults or soft errors than ever before due to
design trends mandating low supply voltage and reduced
noise margins, shrinking feature sizes and increased
transistor density for fast, low-power circuits. As
industry now supports Hardware Transactional Memory
(HTM), the features of HTM can be leveraged to add core
resiliency to transient errors. In this paper, we
propose a novel microarchitecture for transient error
detection and recovery based on time redundancy and
backward error recovery leveraging HTM's existing
features especially its rollback mechanism. We provide
implementation details for single-core reliability,
minimizing additions to existing HTM supports. We
evaluate the performance overheads of the single core
with the reliability feature by comparing it to the
base machine without the reliability feature. Finally,
we show how single-core reliability can be extended to
multi-core reliability.",
acknowledgement = ack-nhfb,
affiliation = "Do, SWS (Reprint Author), Univ Southern Calif, Dept
Elect Engn, EEB200, Elect Engn Bldg, Los Angeles, CA
90089 USA. Do, Sang Wook Stephen, Univ Southern Calif,
Dept Elect Engn, EEB200, Elect Engn Bldg, Los Angeles,
CA 90089 USA. Dubois, Michel, Univ Southern Calif, Dept
Elect Engn, EEB228, Elect Engn Bldg, Los Angeles, CA
90089 USA.",
author-email = "sdo@usc.edu dubois@usc.edu",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [CCF-0954211]",
funding-text = "The authors wish to thank Daniel Wong at UC Riverside
for advice on setting up the SPEC 2006 benchmark suite.
This material is based upon work supported by the
National Science Foundation under Grant CCF-0954211.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "backward error recovery; computer system organization;
core resiliency; design trends; Electrical engineering;
error detection; feature sizes; Fingerprint
recognition; Hardware; hardware transactional memory;
Hardware Transactional Memory; hardware transactional
memory; HTM; integrated circuit design; integrated
circuit reliability; low supply voltage; low-power
circuits; low-power electronics; memory architecture;
microprocessor chips; modern microprocessors; Multicore
processing; multicore reliability; noise margins;
performance and reliability; Registers; Reliability;
rollback mechanism; single-core reliability; soft
errors; time redundancy; Transient analysis; transient
error detection; transient error recovery; transient
faults; transistor density",
keywords-plus = "TRANSIENT-FAULT RECOVERY; MULTIPROCESSORS;
CONSISTENCY; SUPPORT",
number-of-cited-references = "30",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Do:2018:CRL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kaliorakis:2018:SAM,
author = "Manolis Kaliorakis and Athanasios Chatzidimitriou and
George Papadimitriou and Dimitris Gizopoulos",
title = "Statistical Analysis of Multicore {CPUs} Operation in
Scaled Voltage Conditions",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "109--112",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2798604",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Designers try to reduce the voltage margins of CPU
chips to save energy without sacrificing reliable
operation. Statistical analysis methods are appealing
to predict the safe operational margins at the system
level as they do not induce area overheads and they can
be applied during manufacturing or after the chips'
release to the market. In this study, we present a
comprehensive statistical analysis of the behavior of
ARMv8 64-bit cores that are part of the enterprise
8-core X-Gene 2 micro-server family when they operate
in scaled voltage conditions. Our prediction schemes
that use real hardware counters as input are based on
linear regression models with several feature selection
techniques that aim to predict the safe voltage margins
of any given workload when the cores operate in scaled
conditions. Our findings show that our model is able to
accurately predict safe voltage margins that provide up
to 20.28\% power savings.",
acknowledgement = ack-nhfb,
affiliation = "Kaliorakis, M (Reprint Author), Univ Athens, Comp
Architecture Lab, Athens, Greece. Kaliorakis, Manolis;
Chatzidimitriou, Athanasios; Papadimitriou, George;
Gizopoulos, Dimitris, Univ Athens, Comp Architecture
Lab, Athens, Greece.",
author-email = "manoliskal@di.uoa.gr achatz@di.uoa.gr
georgepap@di.uoa.gr dgizop@di.uoa.gr",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "H2020 Programme of the European Union
through the UniServer Project [688540]",
funding-text = "This work is funded by the H2020 Programme of the
European Union through the UniServer Project (Grant
Agreement 688540).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "area overheads; ARMv8 cores; comprehensive statistical
analysis; Computational modeling; Computer crashes; CPU
chips; design margins; Energy-efficient computing;
enterprise 8-core X-Gene 2 microserver family; feature
selection; feature selection techniques; Hardware;
hardware counters; hardware reliability; Linear
regression; linear regression models; microprocessor
chips; multicore CPUs operation; multiprocessing
systems; power aware computing; power savings;
prediction schemes; Predictive models; regression
analysis; safe operational margins; safe voltage
margins; scaled voltage conditions; statistical
methods; system level; voltage margins; Voltage
measurement; word length 64 bit",
keywords-plus = "NOISE",
number-of-cited-references = "10",
ORCID-numbers = "Gizopoulos, Dimitris/0000-0002-1613-9061
Chatzidimitriou, Athanasios/0000-0001-8161-7165",
research-areas = "Computer Science",
researcherid-numbers = "Gizopoulos, Dimitris/U-2731-2018",
times-cited = "2",
unique-id = "Kaliorakis:2018:SAM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Khoram:2018:AAA,
author = "Soroosh Khoram and Yue Zha and Jing Li",
title = "An Alternative Analytical Approach to Associative
Processing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "113--116",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2789424",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Associative Processing (AP) is a promising alternative
to the Von Neumann model as it addresses the memory
wall problem through its inherent in-memory
computations. However, because of the countless design
parameter choices, comparisons between implementations
of two such radically different models are challenging
for simulation-based methods. To tackle these
challenges, we develop an alternative analytical
approach based on a new concept called
architecturally-determined complexity. Using this
method, we asymptotically evaluate the
runtime/storage/energy bounds of the two models, i.e.,
AP and Von Neumann. We further apply the method to gain
more insights into the performance bottlenecks of
traditional AP and develop a new machine model named
Two Dimensional AP to address these limitations.
Finally, we experimentally validate our analytical
method and confirm that the simulation results match
our theoretical projections.",
acknowledgement = ack-nhfb,
affiliation = "Khoram, S (Reprint Author), Univ Wisconsin, Dept Elect
\& Comp Engn, 1415 Johnson Dr, Madison, WI 53706 USA.
Khoram, Soroosh; Zha, Yue; Li, Jing, Univ Wisconsin,
Dept Elect \& Comp Engn, 1415 Johnson Dr, Madison, WI
53706 USA.",
author-email = "khoram@wisc.edu yzha.3@wisc.edu jli@ece.wisc.edu",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "alternative analytical approach; analysis of
algorithms and problem complexity; analytical method;
Analytical models; architecturally-determined
complexity; associative processing; Associative
Processing; Associative processors; Complexity theory;
Computational modeling; Computer architecture;
content-addressable storage; countless design parameter
choices; in-memory computations; machine model; memory
wall problem; modeling techniques; models of
computation; Parallel processing; Runtime;
runtime-storage-energy bounds; simulation-based
methods; traditional AP; two dimensional AP; Two
dimensional displays; Von Neumann model",
number-of-cited-references = "10",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Khoram:2018:AAA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Khatamifard:2018:MSD,
author = "S. Karen Khatamifard and M. Hassan Najafi and Ali
Ghoreyshi and Ulya R. Karpuzcu and David J. Lilja",
title = "On Memory System Design for Stochastic Computing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "117--121",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2804926",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Growing uncertainty in design parameters (and
therefore, in design functionality) renders stochastic
computing particularly promising, which represents and
processes data as quantized probabilities. However, due
to the difference in data representation, integrating
conventional memory (designed and optimized for
non-stochastic computing) in stochastic computing
systems inevitably incurs a significant data conversion
overhead. Barely any stochastic computing proposal
to-date covers the memory impact. In this paper, as the
first study of its kind to the best of our knowledge,
we rethink the memory system design for stochastic
computing. The result is a seamless stochastic system,
StochMem, which features analog memory to trade the
energy and area overhead of data conversion for
computation accuracy. In this manner StochMem can
reduce the energy (area) overhead by up-to 52.8\%
(93.7\%) at the cost of at most 0.7\% loss in
computation accuracy.",
acknowledgement = ack-nhfb,
affiliation = "Khatamifard, SK (Reprint Author), Univ Minnesota,
Minneapolis, MN 55455 USA. Khatamifard, S. Karen;
Najafi, M. Hassan; Ghoreyshi, Ali; Karpuzcu, Ulya R.;
Lilja, David J., Univ Minnesota, Minneapolis, MN 55455
USA.",
author-email = "khatami@umn.edu najaf011@umn.edu ghore002@umn.edu
ukarpuzc@umn.edu lilja@umn.edu",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "US National Science Foundation
[CCF-1408123, XPS-CCA-1438286]",
funding-text = "This work was supported in part by US National Science
Foundation grant no. CCF-1408123 and XPS-CCA-1438286.
Any opinions, findings and conclusions or
recommendations expressed in this material are those of
the authors and do not necessarily reflect the views of
the National Science Foundation.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "analog memory; Analog memory; analog memory;
computation accuracy; conventional memory; Data
conversion; data representation; design functionality;
design parameters; energy-efficient design; Image
processing; Image sensors; memory architecture; memory
impact; memory system design; near-sensor processing;
probability; seamless stochastic system; Sensors;
significant data conversion overhead; Stochastic
computing; stochastic computing proposal to-date;
stochastic computing systems; stochastic processes;
Stochastic systems; System analysis and design",
keywords-plus = "COMPUTATION",
number-of-cited-references = "16",
ORCID-numbers = "Najafi, M. Hassan/0000-0002-4655-6229 Lilja,
David/0000-0003-3785-8206",
research-areas = "Computer Science",
researcherid-numbers = "Najafi, M. Hassan/I-2952-2019",
times-cited = "1",
unique-id = "Khatamifard:2018:MSD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Mouris:2018:TSB,
author = "Dimitris Mouris and Nektarios Georgios Tsoutsos and
Michail Maniatakos",
title = "{TERMinator} Suite: Benchmarking Privacy-Preserving
Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "122--125",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2812814",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Security and privacy are fundamental objectives
characterizing contemporary cloud computing. Despite
the wide adoption of encryption for protecting data in
transit and at rest, data in use remains unencrypted
inside cloud processors and memories, as computation
cannot be applied directly to encrypted values. This limitation
introduces security risks, as unencrypted values can be
leaked through side-channels or hardware Trojans. To
address this problem, encrypted architectures have
recently been proposed, which leverage homomorphic
encryption to natively process encrypted data using
datapaths of thousands of bits. In this case,
additional security protections are traded for higher
performance penalties, which drives the need for more
efficient architectures. In this work, we develop
benchmarks specifically tailored to homomorphic
computers, to enable comparisons across different
architectures. Our benchmark suite, dubbed TERMinator,
is unique as it avoids ``termination problems'' that
prohibit making control-flow decisions and evaluating
early termination conditions based on encrypted data,
as these can leak information. Contrary to generic
suites that ignore the fundamental challenges of
encrypted computation, our algorithms are tailored to
the security primitives of the target encrypted
architecture, such as the existence of branching
oracles. In our experiments, we compiled our benchmarks
for the Cryptoleq architecture and evaluated their
performance for a range of security parameters.",
acknowledgement = ack-nhfb,
affiliation = "Tsoutsos, NG (Reprint Author), NYU, New York, NY 10003
USA. Mouris, Dimitris, Univ Athens, GR-10679 Athens,
Greece. Tsoutsos, Nektarios Georgios; Maniatakos,
Michail, NYU, New York, NY 10003 USA.",
author-email = "jimouris@di.uoa.gr nektarios.tsoutsos@nyu.edu
michail.maniatakos@nyu.edu",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NYU Abu Dhabi Global Ph.D. Student
Fellowship program",
funding-text = "This work was partially sponsored by the NYU Abu Dhabi
Global Ph.D. Student Fellowship program. D. Mouris
thanks Orestis Polychroniou for the fruitful
discussions.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Benchmarks; Cloud computing; cloud
computing; cloud processors; Computer architecture;
control-flow decisions; cryptography; Cryptoleq
architecture; data privacy; dubbed TERMinator;
encrypted architectures; encrypted computation;
encrypted data; encrypted values; Encryption; hardware
Trojans; higher performance penalties; homomorphic
computers; homomorphic encryption; leakage prevention;
performance evaluation; privacy-preserving architecture
benchmarking; Program processors; security parameters;
security protections; security risks; target encrypted
architecture; termination problem; TERMinator suite;
unencrypted values",
number-of-cited-references = "14",
ORCID-numbers = "Maniatakos, Michail/0000-0001-6899-0651",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Mouris:2018:TSB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Choukse:2018:CEM,
author = "Esha Choukse and Mattan Erez and Alaa Alameldeen",
title = "{CompressPoints}: an Evaluation Methodology for
Compressed Memory Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "126--129",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2821163",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Current memory technology has hit a wall trying to
scale to meet the increasing demands of modern client
and datacenter systems. Data compression is a promising
solution to this problem. Several compressed memory
systems have been proposed in recent years [1], [2],
[3], [4]. Unfortunately, a reasonable methodology to
evaluate these systems is missing. In this paper, we
identify the challenges for evaluating main memory
compression. We propose an effective methodology to
evaluate a compressed memory system by proposing
mechanisms to: (i) incorporate correct virtual address
translation, (ii) choose a region in the application
that is representative of the compression ratio, in
addition to regular metrics like IPC and cache hit
rates, and (iii) choose a representative region for
multi-core workloads, bringing down the correlation
error from 12.8 to 3.8 percent.",
acknowledgement = ack-nhfb,
affiliation = "Choukse, E (Reprint Author), Univ Texas Austin,
Austin, TX 78712 USA. Choukse, Esha; Erez, Mattan, Univ
Texas Austin, Austin, TX 78712 USA. Alameldeen, Alaa,
Intel Labs, Santa Clara, CA 95054 USA.",
author-email = "esha.choukse@utexas.edu mattan.erez@utexas.edu
alaa.r.alameldeen@intel.com",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; cache storage; compressed memory;
compressed memory system; Compression; compression
ratio; Computational modeling; computer centres;
Correlation; current memory technology; data
compression; datacenter systems; DRAM; evaluation;
evaluation methodology; Hardware; Linux; main memory
compression; Measurement; memory; memory architecture;
Memory management; methodology; modern client;
multi-core; multicore workloads; representative
regions; storage management; translation; workloads",
number-of-cited-references = "8",
ORCID-numbers = "Choukse, Esha/0000-0003-0371-5522",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Choukse:2018:CEM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kim:2018:ZRV,
author = "Seikwon Kim and Wonsang Kwak and Changdae Kim and
Jaehyuk Huh",
title = "{Zebra} Refresh: Value Transformation for Zero-Aware
{DRAM} Refresh Reduction",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "130--133",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2822808",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Refresh operations consume growing portions of DRAM
power with increasing DRAM capacity. To reduce the
power consumption of such refresh operations, this
paper proposes a novel value-aware refresh reduction
technique exploiting the abundance of zero values in
the memory contents. The proposed Zebra refresh
architecture transforms the value and mapping of DRAM
data to increase consecutive zero values, and entirely
skips the refresh operation for a row containing only
zero values. Zebra converts memory blocks to base and
delta values, inspired by a prior compression
technique. Once values are converted, bits are
transposed to place consecutive zeros matching the
refresh granularity. The experimental results show
Zebra refresh can reduce DRAM refresh operations by 43
percent on average for a set of benchmark
applications.",
acknowledgement = ack-nhfb,
affiliation = "Huh, J (Reprint Author), Korea Adv Inst Sci \&
Technol, Sch Comp, Daejeon 34141, South Korea. Kim,
Seikwon; Kwak, Wonsang; Kim, Changdae; Huh, Jaehyuk,
Korea Adv Inst Sci \& Technol, Sch Comp, Daejeon 34141,
South Korea. Kim, Seikwon, Samsung Elect Co Ltd,
Samsung Res, Suwon 443803, Gyeonggi Do, South Korea.",
author-email = "seikwon@calab.kaist.ac.kr wskwak@calab.kaist.ac.kr
cdkim@calab.kaist.ac.kr jhuh@calab.kaist.ac.kr",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Research Foundation of Korea
[NRF-2016R1A2B4013352]; Institute for Information \&
communications Technology Promotion [IITP-2017-000466];
Ministry of Science and ICT, Korea",
funding-text = "This work is supported by the National Research
Foundation of Korea (NRF-2016R1A2B4013352) and by the
Institute for Information \& communications Technology
Promotion (IITP-2017-000466). Both grants are funded by
the Ministry of Science and ICT, Korea.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; data compression; data content
conversion; data reduction; DRAM chips; DRAM data; DRAM
energy; DRAM power; DRAM refresh; DRAM refresh
operations; memory contents; Memory management;
Microprocessors; power aware computing; power
consumption; Power demand; Random access memory;
refresh granularity; Transforms; value transformation;
value-aware refresh reduction; Zebra refresh
architecture; zero values; zero-aware DRAM refresh
reduction",
keywords-plus = "ENERGY",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Kim:2018:ZRV",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kwon:2018:CMC,
author = "Youngeun Kwon and Minsoo Rhu",
title = "A Case for Memory-Centric {HPC} System Architecture
for Training Deep Neural Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "134--138",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2823302",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "As the models and the datasets to train deep learning
(DL) models scale, system architects are faced with new
challenges, one of which is the memory capacity
bottleneck, where the limited physical memory inside
the accelerator device constrains the algorithm that
can be studied. We propose a memory-centric deep
learning system that can transparently expand the
memory capacity accessible to the accelerators while
also providing fast inter-device communication for
parallel training. Our proposal aggregates a pool of
memory modules locally within the device-side
interconnect, which are decoupled from the host
interface and function as a vehicle for transparent
memory capacity expansion. Compared to conventional
systems, our proposal achieves an average 2.1x
speedup on eight DL applications and increases the
system-wide memory capacity to tens of TBs.",
acknowledgement = ack-nhfb,
affiliation = "Rhu, M (Reprint Author), Pohang Univ Sci \& Technol,
Pohang 790784, Gyeongsangbuk D, South Korea. Kwon,
Youngeun; Rhu, Minsoo, Pohang Univ Sci \& Technol,
Pohang 790784, Gyeongsangbuk D, South Korea.",
author-email = "kyeg9404@gmail.com minsoo.rhu@gmail.com",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Samsung Research Funding Center of Samsung
Electronics [SRFC-TB1703-03]",
funding-text = "This work was supported by Samsung Research Funding
Center of Samsung Electronics under Project Number
SRFC-TB1703-03.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Computer architecture; conventional
systems; deep learning; deep learning models scale;
device-side interconnect; fast inter-device
communication; Graphics processing units; hardware
acceleration; learning (artificial intelligence);
Machine learning; memory architecture; memory capacity
bottleneck; memory modules; memory-centric deep
learning system; memory-centric HPC system
architecture; neural nets; neural network; parallel
processing; parallel training; Performance evaluation;
shared memory systems; storage management; system
architects; system architecture; system-wide memory
capacity; Systems architecture; Training; training deep
neural networks; transparent memory capacity expansion;
Virtualization",
keywords-plus = "DESIGN",
number-of-cited-references = "18",
research-areas = "Computer Science",
researcherid-numbers = "Rhu, Minsoo/O-6167-2018",
times-cited = "0",
unique-id = "Kwon:2018:CMC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Ipek:2018:BLL,
author = "Engin Ipek and Florian Longnos and Shihai Xiao and Wei
Yang",
title = "Bit-Level Load Balancing: a New Technique for
Improving the Write Throughput of Deeply Scaled
{STT-MRAM}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "139--142",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2819979",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Emerging non-volatile memories (NVMs) have drawn
significant attention as potential DRAM replacements.
STT-MRAM is one of the most promising NVMs due to its
relatively low write energy, high speed, and high
endurance. However, STT-MRAM suffers from its own
scaling problems. As the size of the access transistor
is decreased to reduce the cell area, the magnitude of
the switching current that is supplied to the storage
element decreases. The reduced switching current
significantly lengthens the switching time, which makes
write throughput a significant performance bottleneck
for a memory system constructed from dense STT-MRAM
cells. We introduce bit-level load balancing, a new
technique that mitigates the performance overhead of
limited write throughput in high-density, STT-MRAM
based main memories. Bit-level load balancing takes
advantage of the observation that many of the bits
within a row of STT-MRAM remain unchanged when
performing a write. The key idea is to architect the
memory system such that different columns of different
rows can be simultaneously written to an STT-MRAM
subarray. By interleaving in time the bit updates from
multiple writes, bit-level load balancing improves
average system performance by 19 percent, and comes
within 6 percent of the performance of a DRAM based
system.",
acknowledgement = ack-nhfb,
affiliation = "Ipek, E (Reprint Author), Univ Rochester, Dept Comp
Sci, CSB Room 422, Rochester, NY 14627 USA. Ipek, E
(Reprint Author), Univ Rochester, Dept Elect \& Comp
Engn, CSB Room 422, Rochester, NY 14627 USA. Ipek,
Engin, Univ Rochester, Dept Comp Sci, CSB Room 422,
Rochester, NY 14627 USA. Ipek, Engin, Univ Rochester,
Dept Elect \& Comp Engn, CSB Room 422, Rochester, NY
14627 USA. Longnos, Florian; Xiao, Shihai; Yang, Wei,
Huawei Technol Co Ltd, Shenzhen 115371, Guangdong,
Peoples R China.",
author-email = "ipek@cs.rochester.edu florian.longnos@huawei.com
xiaoshihai@huawei.com william.yangwei@huawei.com",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "bit level load balancing; bit-level load balancing;
Computer architecture; deeply scaled STT-MRAM; dense
STT-MRAM cells; DRAM chips; Load management; memory
system; memory systems; Microprocessors; MRAM devices;
non-volatile memories; nonvolatile memories; NVMs;
performance bottleneck; Random access memory; resource
allocation; STT-MRAM; STT-MRAM based main memories;
STT-MRAM subarray; Switches; Throughput; Transistors;
write throughput",
keywords-plus = "PERFORMANCE; DESIGN; ENERGY",
number-of-cited-references = "19",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Ipek:2018:BLL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Iliakis:2018:DMS,
author = "Konstantinos Iliakis and Sotirios Xydis and Dimitrios
Soudris",
title = "Decoupled {MapReduce} for Shared-Memory Multi-Core
Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "143--146",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2827929",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Modern multi-core processors exhibit high integration
densities, e.g., up to several tens of cores. Multiple
programming frameworks have emerged to facilitate the
development of highly parallel applications. The
MapReduce programming model, after having demonstrated
its usability in the area of distributed computing
systems, has been adapted to the needs of shared-memory
multi-processors showing promising results in
comparison with conventional multi-threaded libraries,
e.g., pthreads. In this paper we enhance the
traditional MapReduce architecture by decoupling the
map and combine phases in order to boost parallel
execution. We show that combiners' memory-intensive
features limit the system's degree of parallelism, thus
resulting in sub-optimal hardware utilization, leaving
space for further performance improvements. The
proposed decoupled MapReduce architecture is evaluated
on a NUMA server platform, showing that the adoption
of the De-MapR runtime enables more efficient hardware
utilization and competent run-time improvements. We
demonstrate that the proposed solution achieves
execution speedups of up to 2.46x compared to a
state-of-the-art, shared-memory MapReduce library.",
acknowledgement = ack-nhfb,
affiliation = "Iliakis, K (Reprint Author), Natl Tech Univ Athens,
Zografos 15780, Greece. Iliakis, Konstantinos; Xydis,
Sotirios; Soudris, Dimitrios, Natl Tech Univ Athens,
Zografos 15780, Greece.",
author-email = "konstantinos.iliakis@cern.ch sxydis@microlab.ntua.gr
dsoudris@microlab.ntua.gr",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; Containers; decoupled MapReduce
architecture; distributed computing systems; hardware
utilization; highly parallel applications; Instruction
sets; Libraries; MapReduce; MapReduce programming
model; modern multicore processors; multi-cores;
multiple programming frameworks; parallel
architectures; parallel execution; Parallel processing;
parallel programming; Runtime; runtime systems; shared
memory systems; shared-memory MapReduce library;
shared-memory multicore architectures; shared-memory
multiprocessors; sub-optimal hardware utilization; Task
analysis",
number-of-cited-references = "13",
ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847",
research-areas = "Computer Science",
researcherid-numbers = "Soudris, Dimitrios/O-8843-2019",
times-cited = "0",
unique-id = "Iliakis:2018:DMS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Li:2018:BSB,
author = "Zhaoshi Li and Leibo Liu and Yangdong Deng and Shouyi
Yin and Shaojun Wei",
title = "Breaking the Synchronization Bottleneck with
Reconfigurable Transactional Execution",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "147--150",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2828402",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The advent of FPGA-based hybrid architecture offers
the opportunity of customizing memory subsystems to
enhance the overall system performance. However, it is
not straightforward to design efficient FPGA circuits
for emerging FPGAs applications such as in-memory
database and graph analytics, which heavily depend on
concurrent data structures (CDSs). The highly dynamic
behaviors of CDSs have to be orchestrated by
synchronization primitives for correct execution. These
primitives induce overwhelming memory traffic for
synchronizations on FPGAs. This paper proposes a novel
method for systematically exploring and exploiting
memory-level parallelism (MLP) of CDSs by transactional
execution on FPGAs. Inspired by the idea that semantics
of transactions can be implemented in a more efficient
and scalable manner on FPGAs than on CPUs, we propose a
transaction-based reconfigurable runtime system for
capturing the MLP of CDSs. Experiments on linked lists
and skip lists show that our approach achieves 5.18x and
1.55x higher throughput on average than lock-based FPGA
implementations and than optimized CDS algorithms on a
state-of-the-art multi-core CPU, respectively.",
acknowledgement = ack-nhfb,
affiliation = "Liu, LB (Reprint Author), Tsinghua Univ, Natl Lab
Informat Sci \& Technol, Beijing 100084, Peoples R
China. Li, Zhaoshi; Liu, Leibo; Deng, Yangdong; Yin,
Shouyi; Wei, Shaojun, Tsinghua Univ, Natl Lab Informat
Sci \& Technol, Beijing 100084, Peoples R China.",
author-email = "li-zs12@mail.tsinghua.edu.cn liulb@tsinghua.edu.cn
dengyd@tsinghua.edu.cn yinsy@tsinghua.edu.cn
wsj@tsinghua.edu.cn",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Natural Science Foundation of
China [61672317]; National Science Technology Major
Project [2016ZX01012101]",
funding-text = "This work was supported in part by National Natural
Science Foundation of China (No. 61672317) and National
Science Technology Major Project (No.
2016ZX01012101).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "concurrent data structures; data structures; Data
structures; data structures; efficient FPGA circuits;
field programmable gate arrays; Field programmable gate
arrays; FPGA-based hybrid architecture; graph
analytics; heterogeneous systems; highly dynamic
behaviors; in-memory database; Instruction sets; memory
subsystems; memory traffic; memory-level parallelism;
MLP; multicore CPU; optimized CDS algorithms; parallel
architectures; Programming; Reconfigurable hardware;
reconfigurable transactional execution; Semantics;
synchronisation; Synchronization; synchronization
bottleneck; synchronization primitives; system
performance enhancement; Throughput; transaction-based
reconfigurable runtime system",
number-of-cited-references = "12",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Li:2018:BSB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Ipek:2018:VWC,
author = "Engin Ipek and Florian Longnos and Shihai Xiao and Wei
Yang",
title = "Vertical Writes: Closing the Throughput Gap between
Deeply Scaled {STT-MRAM} and {DRAM}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "151--154",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2820027",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "STT-MRAMis a second generation MRAM technology that
addresses many of the scaling problems of earlier
generation magnetic RAMs, and is a promising candidate
to replace DRAM due to its high operational speed,
scalable energy characteristics, and high write
endurance. However, making the density of STT-MRAM
competitive with that of DRAM while maintaining
DRAM-like write throughput has proven challenging.
Reducing the area of an STT-MRAM cell requires
decreasing the width of the cell access transistor,
which lowers the magnitude of the switching current
supplied to the storage element during writes, and
significantly hampers the switching speed.
Consequently, write throughput constitutes a
fundamental performance bottleneck for memory systems
built from deeply scaled, dense STT-MRAM cells. This
paper introduces vertical writes, a new technique that
improves the write throughput of memory systems built
from high-density STT-MRAM. Vertical writes exploit the
observation that once the switching voltage has been
applied across the bit lines and source lines in an
STT-MRAM array, it is possible to initiate the write
operation for additional cells that are attached to the
same column by simply turning on the corresponding word
lines. By leveraging the ability to write a 0 or a 1 to
multiple cells at once, vertical writes improve average
system performance by 21 percent, and enable an
STT-MRAM based system to come within 5 percent of the
performance of a DRAM based system.",
acknowledgement = ack-nhfb,
affiliation = "Ipek, E (Reprint Author), Univ Rochester, Rochester,
NY 14627 USA. Ipek, Engin, Univ Rochester, Rochester,
NY 14627 USA. Longnos, Florian; Xiao, Shihai; Yang,
Wei, Huawei Technol Co Ltd, Shenzhen 518129, Guangdong,
Peoples R China.",
author-email = "ipek@cs.rochester.edu florian.longnos@huawei.com
xiaoshihai@huawei.com william.yangwei@huawei.com",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cell access transistor; Computer architecture;
Decoding; deeply scaled STT-MRAM cells; dense STT-MRAM
cells; DRAM based system; DRAM chips; DRAM-like write
throughput; earlier generation magnetic RAMs;
generation MRAM technology; high operational speed;
high write endurance; high-density STT-MRAM; magnetic
tunnelling; Memory systems; memory systems;
Microprocessors; MRAM devices; non-volatile
memories; Random access memory; random-access storage;
scalable energy characteristics; STT-MRAM; STT-MRAM
array; STT-MRAM based system; Switches; switching
current; switching speed; Throughput; throughput gap;
write operation; Writing",
keywords-plus = "PERFORMANCE; DESIGN",
number-of-cited-references = "23",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Ipek:2018:VWC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Gan:2018:AIC,
author = "Yu Gan and Christina Delimitrou",
title = "The Architectural Implications of Cloud
Microservices",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "155--158",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2839189",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Cloud services have recently undergone a shift from
monolithic applications to microservices, with hundreds
or thousands of loosely-coupled microservices
comprising the end-to-end application. Microservices
present both opportunities and challenges when
optimizing for quality of service (QoS) and cloud
utilization. In this paper we explore the implications
cloud microservices have on system bottlenecks and
datacenter server design. We first present and
characterize an end-to-end application built using tens
of popular open-source microservices that implements a
movie renting and streaming service, and is modular and
extensible. We then use the end-to-end service to study
the scalability and performance bottlenecks of
microservices, and highlight implications they have on
the design of datacenter hardware. Specifically, we
revisit the long-standing debate of brawny versus wimpy
cores in the context of microservices, we quantify the
I-cache pressure they introduce, and measure the time
spent in computation versus communication between
microservices over RPCs. As more cloud applications
switch to this new programming model, it is
increasingly important to revisit the assumptions we
have previously used to build and manage cloud
systems.",
acknowledgement = ack-nhfb,
affiliation = "Delimitrou, C (Reprint Author), Cornell Univ, Ithaca,
NY 14850 USA. Gan, Yu; Delimitrou, Christina, Cornell
Univ, Ithaca, NY 14850 USA.",
author-email = "lyg397@cornell.edu delimitrou@cornell.edu",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "application studies resulting in better
multiple-processor systems; architectural implications;
Cloud computing; cloud computing; cloud microservices;
cloud utilization; computer centres; datacenter server
design; distributed applications; Electric breakdown;
end-to-end service; Motion pictures; movie renting;
Open source software; open-source microservices; power
aware computing; QoS; quality of service; Quality of
service; Servers; streaming
service; Super (very large) computers; Videos",
number-of-cited-references = "20",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Gan:2018:AIC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Shwartz:2018:DMI,
author = "Ofir Shwartz and Yitzhak Birk",
title = "Distributed Memory Integrity Trees",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "159--162",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2822705",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Ensuring the correct execution of a program running on
untrusted computing platforms, wherein the OS,
hypervisor, and all off-CPU-chip hardware, including
memory, are untrusted, also requires protecting the
integrity of the memory content against replay attacks.
This requires dedicated tracking structures and in-chip
state storage. For this purpose, integrity trees are
used in various forms, varying in complexity, size, and
performance; yet, existing integrity trees do not
address distributed, shared-memory computations, for
which one must also ensure the integrity of the
coherence state of the memory. Observing that a block
not residing at a given node merely needs to be known
by that node as such, we present the novel Distributed
Integrity Tree (DIT) method, and show that it can be
used effectively to extend existing integrity trees to
parallel and distributed environments. Using DIT, we
constructed a Distributed Merkle Tree, a Distributed
Bonsai Merkle Tree, and a distributed Intel SGX's
Memory Encryption Engine integrity mechanism. All these
extensions entail negligible overhead.",
acknowledgement = ack-nhfb,
affiliation = "Shwartz, O (Reprint Author), Technion, Elect Engn
Dept, IL-3200003 Haifa, Israel. Shwartz, Ofir; Birk,
Yitzhak, Technion, Elect Engn Dept, IL-3200003 Haifa,
Israel.",
author-email = "ofirshw@tx.technion.ac.il birk@ee.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Hasso Plattner Institute",
funding-text = "This work was supported in part by the Hasso Plattner
Institute.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "coherence state; computer security; correct execution;
cryptography; data integrity; Data transfer;
distributed Bonsai Merkle tree; Distributed computing;
Distributed databases; distributed environment;
distributed integrity tree method; distributed Intel
SGX's Memory Encryption Engine integrity mechanism;
distributed memory integrity; Encryption; hypervisor;
in-chip state storage; integrity tree; memory content;
Memory management; Metadata; off-CPU-chip hardware;
operating systems (computers); parallel environment;
parallel processing; shared memory; shared memory
systems; shared-memory computations; trees
(mathematics); trusted computing; untrusted computing
platforms",
keywords-plus = "PERFORMANCE",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Shwartz:2018:DMI",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Yun:2018:RPP,
author = "Ji-Tae Yun and Su-Kyung Yoon and Jeong-Geun Kim and
Bernd Burgstaller and Shin-Dug Kim",
title = "Regression Prefetcher with Preprocessing for
{DRAM--PCM} Hybrid Main Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "163--166",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2841835",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This research is to design an effective hybrid main
memory structure for graph processing applications,
because it is quite expensive to use only high-speed
DRAM for such applications. Thus, we propose a DRAM-PCM
hybrid main memory structure to reduce the cost and
energy consumption and design regression prefetch
scheme to cope with irregular access patterns in large
graph processing workloads. In addition, the prefetch
includes preprocessing algorithm to maximize
prefetching performance. Our experimental evaluation
shows a performance improvement of 36 percent over a
conventional DRAM model, 15 percent over existing
prefetch models such as GHB/PC, SMS, and AMPM, and 6
percent over the latest model.",
acknowledgement = ack-nhfb,
affiliation = "Kim, SD (Reprint Author), Yonsei Univ, Dept Comp Sci,
Seoul 03722, South Korea. Yun, Ji-Tae; Yoon, Su-Kyung;
Kim, Jeong-Geun; Burgstaller, Bernd; Kim, Shin-Dug,
Yonsei Univ, Dept Comp Sci, Seoul 03722, South Korea.",
author-email = "jty11@yonsei.ac.kr sk.yoon@yonsei.ac.kr
junggeun@yonsei.ac.kr bburg@yonsei.ac.kr
sdkim@yonsei.ac.kr",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Next Generation Information Computing
Development Program through the National Research
Foundation of Korea (NRF) --- Ministry of Science, ICT
\& Future Planning [NRF-2015M3C4A7065522]; Samsung
Electronics; Yonsei University",
funding-text = "This research was partially supported by the Next
Generation Information Computing Development Program
through the National Research Foundation of Korea (NRF)
funded by the Ministry of Science, ICT \& Future
Planning (NRF-2015M3C4A7065522) and by an
Industry-Academy joint research program between Samsung
Electronics and Yonsei University.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "buffer management; conventional DRAM model; cost
reduction; design regression prefetch scheme; DRAM
chips; effective hybrid main memory structure; energy
consumption reduction; Engines; graph processing
applications; graph theory; high-speed DRAM; irregular
access patterns; large graph processing workloads; Load
modeling; machine learning; main memory; Memory
management; PCM; Phase change materials; phase change
memories; prefetch models; Prefetching; prefetching
performance; preprocessing algorithm; Random access
memory; storage management; Training data",
number-of-cited-references = "19",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Yun:2018:RPP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Zhang:2018:RFA,
author = "Jiangwei Zhang and Donald {Kline, Jr.} and Long Fang
and Rami Melhem and Alex K. Jones",
title = "{RETROFIT}: Fault-Aware Wear Leveling",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "167--170",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2840137",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Phase-change memory (PCM) and resistive memory (RRAM)
are promising alternatives to traditional memory
technologies. However, both PCM and RRAM suffer from
limited write endurance and due to process variation
from scaling, increasing number of early cell failures
continue to put pressure on wear-leveling and fault
tolerance techniques. In this paper, we propose
RETROFIT, which leverages the spare ``gap'' row used as
temporary storage in wear leveling to also be used
strategically to guard against early cell wear out.
RETROFIT is compatible with error correction schemes
targeted at mitigating stuck-at faults and provides
benefits when single or multiple spare rows are
available. RETROFIT enhances lifetime by as much as 107
percent over traditional gap-based wear leveling and 8
percent over perfectly uniform wear leveling with a
similar overhead. Furthermore, RETROFIT scales better
than wear-leveling combined with error correction as
process variation increases.",
acknowledgement = ack-nhfb,
affiliation = "Zhang, JW (Reprint Author), Natl Univ Def Technol,
Changsha 410073, Hunan, Peoples R China. Zhang, JW
(Reprint Author), Univ Pittsburgh, ECE Dept,
Pittsburgh, PA 15261 USA. Zhang, Jiangwei; Fang, Long,
Natl Univ Def Technol, Changsha 410073, Hunan, Peoples
R China. Zhang, Jiangwei; Fang, Long, Univ Pittsburgh,
ECE Dept, Pittsburgh, PA 15261 USA. Melhem, Rami, Univ
Pittsburgh, CS Dept, Pittsburgh, PA 15260 USA.",
author-email = "jiz148@pitt.edu dek61@pitt.edu lfang@nudt.edu.cn
melhem@cs.pitt.edu akjones@pitt.edu",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Aging; and fault-tolerance; Computer architecture;
early cell failures; early cell wear; Emerging
memories; error correction; Error correction; Error
correction codes; fault tolerance; fault tolerance
techniques; fault-aware wear leveling; fault-tolerance;
multiple spare rows; PCM; perfectly uniform wear
leveling; Phase change materials; phase change
memories; process variation; Random access memory;
random-access storage; Registers; reliability;
resistive memory; RETROFIT scales; RRAM; single rows;
spare gap row; traditional memory technologies; wear;
wear-leveling",
number-of-cited-references = "15",
ORCID-numbers = "Kline, Jr, Donald/0000-0002-4414-1513",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Zhang:2018:RFA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kulkarni:2018:LAI,
author = "Neeraj Kulkarni and Feng Qi and Christina Delimitrou",
title = "Leveraging Approximation to Improve Datacenter
Resource Efficiency",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "171--174",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2845841",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Cloud multi-tenancy is typically constrained to a
single interactive service colocated with one or more
batch, low-priority services, whose performance can be
sacrificed. Approximate computing applications offer
the opportunity to enable tighter colocation among
multiple applications whose performance is important.
We present Pliant, a lightweight cloud runtime that
leverages the ability of approximate computing
applications to tolerate some loss in output quality to
boost the utilization of shared servers. During periods
of high contention, Pliant employs incremental and
interference-aware approximation to reduce interference
in shared resources. We evaluate Pliant across
different approximate applications, and show that it
preserves QoS for all co-scheduled workloads, while
incurring at most a 5 percent loss in output quality.",
acknowledgement = ack-nhfb,
affiliation = "Delimitrou, C (Reprint Author), Cornell Univ, Ithaca,
NY 14850 USA. Kulkarni, Neeraj; Qi, Feng; Delimitrou,
Christina, Cornell Univ, Ithaca, NY 14850 USA.",
author-email = "nsk49@cornell.edu fq26@cornell.edu
delimitrou@cornell.edu",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Approximate computing; approximate computing
applications; cloud computing; Cloud computing; cloud
multitenancy; co-scheduled workloads; computer centres;
datacenter resource efficiency; Interference;
interference-aware approximation; lightweight cloud
runtime; low-priority services; Monitoring; Pliant;
QoS; quality of service; Quality of service; Runtime;
scheduling; scheduling and task partitioning; shared
resources; single interactive service; Super (very
large) computers; support for dynamic compilation;
Switches",
keywords-plus = "ACCURACY-AWARE OPTIMIZATION; PROGRAMS",
number-of-cited-references = "20",
ORCID-numbers = "Qi, Feng/0000-0002-0759-5268 Kulkarni,
Neeraj/0000-0003-0768-0187",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Kulkarni:2018:LAI",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{AlBarakat:2018:MFM,
author = "Laith M. AlBarakat and V. Paul Gratz and Daniel A.
Jim{\'e}nez",
title = "{MTB-Fetch}: Multithreading Aware Hardware Prefetching
for Chip Multiprocessors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "175--178",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2847345",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "To fully exploit the scaling performance in Chip
Multiprocessors, applications must be divided into
semi-independent processes that can run concurrently on
multiple cores within a system. One major class of such
applications, shared-memory, multi-threaded
applications, requires that programmers insert thread
synchronization primitives (i.e., locks, barriers, and
condition variables) in their critical sections to
synchronize data access between processes. For this
class of applications, scaling performance requires
balanced per-thread workloads with little time spent in
critical sections. In practice, however, threads often
waste significant time waiting to acquire
locks/barriers in their critical sections, leading to
thread imbalance and poor performance scaling.
Moreover, critical sections often stall data
prefetchers that could otherwise mitigate the effects of
long critical section stalls by ensuring data is
preloaded in the core caches when the critical section
is complete. In
this paper we examine a pure hardware technique to
enable safe data prefetching beyond synchronization
points in CMPs. We show that successful prefetching
beyond synchronization points requires overcoming two
significant challenges in existing prefetching
techniques. First, we find that typical data
prefetchers are designed to trigger prefetches based on
current misses. This approach works well for
traditional, continuously executing, single-threaded
applications. However, when a thread stalls on a
synchronization point, it typically does not produce
any new memory references to trigger a prefetcher.
Second, even in the event that a prefetch were to be
correctly directed to read beyond a synchronization
point, it would likely prefetch shared data from another
core before this data has been written. While this
prefetch would be considered ``accurate'', it is highly
undesirable, because such a prefetch would lead to
three extra ``ping-pong'' movements back and forth
between private caches in the producing and consuming
cores, incurring more latency and energy overhead than
without prefetching. We develop a new data prefetcher,
Multi-Thread B-Fetch (MTB-Fetch), built as an extension
to a previous single-threaded data prefetcher. MTB-Fetch
addresses both issues in prefetching for shared memory
multi-threaded workloads. MTB-Fetch achieves a speedup
of 9.3 percent for multi-threaded applications with
little additional hardware.",
acknowledgement = ack-nhfb,
affiliation = "AlBarakat, LM (Reprint Author), Texas A\&M Univ, Dept
Elect \& Comp Engn, College Stn, TX 77843 USA.
AlBarakat, Laith M.; Gratz, Paul, V, Texas A\&M Univ,
Dept Elect \& Comp Engn, College Stn, TX 77843 USA.
Jimenez, Daniel A., Texas A\&M Univ, Dept Comp Sci \&
Engn, College Stn, TX 77843 USA.",
author-email = "lalbarakat@tamu.edu pgratz@tamu.edu
djimenez@cse.tamu.edu",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation
[I/UCRC-1439722, CCF-1649242, CCF-1216604/1332598];
Intel Corp.",
funding-text = "We thank the National Science Foundation, which
partially supported this work through grants
I/UCRC-1439722, CCF-1649242 and CCF-1216604/1332598 and
Intel Corp. for their generous support.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache storage; Chip multiprocessor; Chip
Multiprocessors; CMPs; core caches; data access
synchronization; energy overhead; Hardware; hardware
prefetching; long critical section stalls;
microprocessor chips; MTB-Fetch; multi-threading;
Multicore processing; multiple cores; multithread
B-fetch; multithreading aware hardware prefetching;
per-thread workloads; poor performance scaling;
Prefetching; prefetching techniques; private caches;
pure hardware technique; Scalability; scaling
performance; semiindependent processes; shared memory;
shared memory multithreaded workloads; shared memory
systems; single-threaded applications; single-threaded
data prefetcher; storage management; synchronisation;
Synchronization; synchronization point; thread
imbalance; thread synchronization primitives; typical
data prefetchers",
keywords-plus = "PROCESSORS",
number-of-cited-references = "17",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "AlBarakat:2018:MFM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Vijayaraghavan:2018:MBA,
author = "Thiruvengadam Vijayaraghavan and Amit Rajesh and
Karthikeyan Sankaralingam",
title = "{MPU--BWM}: Accelerating Sequence Alignment",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "179--182",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2849064",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "DNA sequencing and assembly spans life-altering
applications, from disease diagnosis to answering
questions about our ancestry. Sequencing involves
state-of-the-art machines generating nucleic acid
sequences (AGCT) from wet samples like blood or salvia,
followed by aligning these sequences against known
reference sequences. Due to the rapid advancement in
sequence generation machines relative to Moore's law,
the second step (alignment) has now become the
bottleneck. Today's state-of-the-art technology for
alignment runs software like BWA-MEM on a cluster of
high performance general purpose machines that cannot
keep up with the rapid rate of data generated by each
new generation of sequencer machines. Recent proposals
from academia that claim orders of magnitude alignment
speedup come at a cost of significant disruption to the
hardware and software currently in use in the industry.
In this work, we propose MPU-BWM, a hardware-software
solution that achieves orders-of-magnitude speedup
(57x over a single-core x86) on the state-of-the-art BWA-MEM
algorithm, with non-intrusive integration to existing
processing clusters and with minimal modifications to
the BWA-MEM software.",
acknowledgement = ack-nhfb,
affiliation = "Vijayaraghavan, T (Reprint Author), SimpleMachines
Inc, Madison, WI 53719 USA. Vijayaraghavan,
Thiruvengadam; Sankaralingam, Karthikeyan,
SimpleMachines Inc, Madison, WI 53719 USA. Rajesh,
Amit, James Madison Mem High Sch, Madison, WI 53717
USA. Sankaralingam, Karthikeyan, Univ Wisconsin,
Madison, WI 53706 USA.",
author-email = "vijay@simplemachinesinc.com amitrajesh200@gmail.com
karu@cs.wisc.edu",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "assembly spans life-altering applications;
bioinformatics; bioinformatics (genome or protein)
databases; BWA-MEM software; disease diagnosis;
diseases; DNA; DNA sequencing; Engines; genomics;
Hardware; hardware-software solution; Heterogeneous
(hybrid) systems; high performance general purpose
machines; magnitude alignment speedup; Moore's law;
MPU-BWM; nucleic acid sequences; parallel
architectures; parallel processing; Pipelines; Program
processors; reference sequences; Rockets; sequence
alignment; sequence generation machines; sequencer
machines; sequences; Sequential analysis; sequential
machines",
number-of-cited-references = "15",
ORCID-numbers = "Rajesh, Amit/0000-0003-1679-5517",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Vijayaraghavan:2018:MBA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{DePestel:2018:RRP,
author = "Sander {De Pestel} and Sam {Van den Steen} and Shoaib
Akram and Lieven Eeckhout",
title = "{RPPM}: Rapid Performance Prediction of Multithreaded
Applications on Multicore Hardware",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "183--186",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2849983",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper proposes RPPM which, based on a
microarchitecture-independent profile of a
multithreaded application, predicts its performance on
a previously unseen multicore platform. RPPM breaks up
multithreaded program execution into epochs based on
synchronization primitives, and then predicts per-epoch
active execution times for each thread and
synchronization overhead to arrive at a prediction for
overall application performance. RPPM predicts
performance within 12 percent on average (27 percent
max error) compared to cycle-level simulation. We
present a case study to illustrate that RPPM can be
used for making accurate multicore design trade-offs
early in the design cycle.",
acknowledgement = ack-nhfb,
affiliation = "De Pestel, S (Reprint Author), Univ Ghent, B-9000
Ghent, Belgium. De Pestel, Sander; Van den Steen, Sam;
Akram, Shoaib; Eeckhout, Lieven, Univ Ghent, B-9000
Ghent, Belgium.",
author-email = "sander.depestel@ugent.be sam.vandensteen@ugent.be
shoaib.akram@ugent.be lieven.eeckhout@ugent.be",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Agency for Innovation by Science and
Technology in Flanders (IWT); European Research Council
(ERC) [741097]",
funding-text = "Sander De Pestel is supported through a doctoral
fellowship by the Agency for Innovation by Science and
Technology in Flanders (IWT). Additional support is
provided through the European Research Council (ERC)
Advanced Grant agreement no. 741097.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "accurate multicore design trade-offs; active execution
times; Computational modeling; Instruction sets;
Mathematical model; micro-architecture;
Microarchitecture; microarchitecture-independent
profile; microprocessor chips; Modeling;
multi-threaded; multi-threading; multicore hardware;
Multicore processing; multiprocessing systems;
multithreaded application; multithreaded program
execution; performance; Predictive models; rapid
performance prediction; RPPM; Synchronization;
synchronization overhead; synchronization primitives;
unseen multicore platform",
number-of-cited-references = "12",
ORCID-numbers = "Van den Steen, Sam/0000-0003-3630-2214",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Pestel:2018:RRP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Zhao:2018:KOA,
author = "Wenyi Zhao and Quan Chen and Minyi Guo",
title = "{KSM}: Online Application-Level Performance Slowdown
Prediction for Spatial Multitasking {GPGPU}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "187--191",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2851207",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Colocating multiple applications on the same spatial
multitasking GPGPU improves the system-wide throughput.
However, the colocated applications are slowed down
differently due to the contention on streaming
multiprocessors (SMs), L2 cache and global memory
bandwidth. The ability to precisely predict application
slowdowns online is useful in many scenarios, e.g.,
ensuring fair pricing in multi-tenant Cloud systems.
Prior work on predicting application slowdown is either
inaccurate, due to ignoring contention on SMs,
or inefficient, due to the expensive sequential
profiling of concurrent applications via runtime
environment switching. To solve the above problem, we
propose KSM, which enables precise and efficient
application-level slowdown prediction without a priori
application knowledge. KSM is based on the
observation that hardware event statistics caused by
the colocated applications are strongly correlated with
their slowdowns. In more detail, KSM builds a slowdown
model based on the hardware event statistics using
machine learning techniques offline. At runtime, KSM
collects the hardware event statistics, and predicts
the slowdowns of all the colocated applications based
on the model. Our experimental results show that KSM
has negligible runtime overhead and precisely predicts
the application-level slowdowns with the prediction
error smaller than 9.7 percent.",
acknowledgement = ack-nhfb,
affiliation = "Zhao, WY (Reprint Author), Shanghai Jiao Tong Univ,
Dept Comp Sci \& Engn, Shanghai 200240, Peoples R
China. Zhao, Wenyi; Chen, Quan; Guo, Minyi, Shanghai
Jiao Tong Univ, Dept Comp Sci \& Engn, Shanghai 200240,
Peoples R China.",
author-email = "wenyizhao@sjtu.edu.cn chen-quan@cssjtu.edu.cn
guo-my@cssjtu.edu.cn",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Basic Research 973 Program of
China [2015CB352403]; National Natural Science
Foundation of China (NSFC) [61602301, 61632017]",
funding-text = "This work is partially sponsored by the National Basic
Research 973 Program of China (No. 2015CB352403), the
National Natural Science Foundation of China (NSFC)
(61602301, 61632017). Quan Chen and Minyi Guo are
co-corresponding authors of this paper.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "application-level slowdowns; Bandwidth; cloud
computing; colocated applications; computer centres;
concurrent applications; Graphics processing units;
graphics processing units; Hardware; hardware event
statistics; interference; Interference; interference;
Kernel; KSM; learning (artificial intelligence);
machine learning technique; multiprocessing systems;
multitenant cloud systems; online application-level
performance slowdown prediction; precise
application-level slowdown prediction; priori
application knowledge; Resource management;
scalability; Slowdown prediction; SM; spatial
multitasking GPGPU; spatial multitasking GPGPUs;
system-wide throughput; Training",
number-of-cited-references = "13",
ORCID-numbers = "Zhao, Wenyi/0000-0001-7308-9542 Chen,
Quan/0000-0001-5832-0347",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Zhao:2018:KOA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Swami:2018:AAS,
author = "Shivam Swami and Kartik Mohanram",
title = "{ARSENAL}: Architecture for Secure Non-Volatile
Memories",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "192--196",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2863281",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Whereas data persistence in non-volatile memories
(NVMs) enables instant data recovery (IDR) in the face
of power/system failures, it also exposes NVMs to data
confidentiality and integrity attacks. Counter mode
encryption and Merkle Tree authentication are
established measures to thwart data confidentiality and
integrity attacks, respectively, in NVMs. However,
these security mechanisms require high overhead atomic
security meta-data updates on every write-back in order
to support IDR in NVMs. This increases memory traffic
and negatively impacts system performance and memory
lifetime. Architecture for Secure Non-Volatile Memories
(ARSENAL) is an IDR-preserving, low cost, high
performance security solution that protects NVM systems
against data confidentiality and integrity attacks.
ARSENAL synergistically integrates (i) Smart Writes for
Faster Transactions (SWIFT), a novel technique to
reduce the performance overhead of atomic security
meta-data updates on every write-back, with (ii)
Terminal BMT Updates (TBU), a novel
BMT-consistency-preserving technique, to facilitate IDR
in the face of power/system failures. Our evaluations
show that on average, ARSENAL improves system
performance (measured in IPC) by 2.26x (4x), reduces
memory traffic overhead by 1.47x (1.88x), and improves
memory lifetime by 2x (3.5x) in comparison to
conventional IDR-preserving 64-bit (128-bit)
encryption+authentication.",
acknowledgement = ack-nhfb,
affiliation = "Mohanram, K (Reprint Author), Univ Pittsburgh, Dept
Elect \& Comp Engn, Pittsburgh, PA 15260 USA. Swami,
Shivam; Mohanram, Kartik, Univ Pittsburgh, Dept Elect
\& Comp Engn, Pittsburgh, PA 15260 USA.",
author-email = "shs173@pitt.edu kmram@pitt.edu",
da = "2019-06-20",
doc-delivery-number = "GT5EV",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CCF-1217738]",
funding-text = "This research was supported by NSF Award CCF-1217738.
We also thank the editor and the reviewers for their
constructive comments that have helped us elaborate and
improve the content of the paper.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architecture for nonvolatile memories; ARSENAL;
authentication; Authentication; authentication;
cryptography; data confidentiality; data integrity;
data integrity attacks; data persistence; encryption;
Encryption; failure analysis; hardware security; high
overhead atomic security meta-data updates; high
performance security solution; IDR; IDR-preserving
encryption-authentication; instant data recovery;
integrated circuit reliability; memory architecture;
memory lifetime; Memory management; memory traffic
overhead; Non-volatile memories; Nonvolatile memory;
NVMs; power failures; Random access memory;
random-access storage; security mechanisms; smart
writes for faster transactions; SWIFT; system failures;
system performance; terminal BMT updates",
keywords-plus = "ENCRYPTION; PERFORMANCE",
number-of-cited-references = "28",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Swami:2018:AAS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Basak:2018:ECC,
author = "Abanti Basak and Xing Hu and Shuangchen Li and Sang
Min Oh and Yuan Xie",
title = "Exploring Core and Cache Hierarchy Bottlenecks in
Graph Processing Workloads",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "197--200",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2864964",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Graph processing is an important analysis technique
for a wide range of big data problems. The ability to
explicitly represent relationships between entities
gives graph analytics significant performance advantage
over traditional relational databases. In this paper,
we perform an in-depth data-aware characterization of
graph processing workloads on a simulated multi-core
architecture, find bottlenecks in the core and the
cache hierarchy that are not highlighted by previous
characterization work, and analyze the behavior of the
specific application data type causing the
corresponding bottleneck. We find that load-load
dependency chains involving different application data
types form the primary bottleneck in achieving a high
memory-level parallelism in graph processing workloads.
We also observe that the private L2 cache has a
negligible contribution to performance, whereas the
shared L3 cache has higher performance sensitivity. In
addition, we present a study on the effectiveness of
several replacement policies. Finally, we study the
relationship between different graph algorithms and the
access volumes to the different data types. Overall, we
provide useful insights and guidelines toward
developing a more optimized CPU-based architecture for
high performance graph processing.",
acknowledgement = ack-nhfb,
affiliation = "Basak, A (Reprint Author), Univ Calif Santa Barbara,
Santa Barbara, CA 93106 USA. Basak, Abanti; Hu, Xing;
Li, Shuangchen; Oh, Sang Min; Xie, Yuan, Univ Calif
Santa Barbara, Santa Barbara, CA 93106 USA.",
author-email = "abasak@umail.ucsb.edu xinghu.cs@gmail.com
shuangchenli@ece.ucsb.edu sangminoh@umail.ucsb.edu
yuanxie@gmail.com",
da = "2019-06-20",
doc-delivery-number = "GT5EV",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "US National Science Foundation
[1730309/1719160/1500848]; CRISP, one of six centers in
JUMP, a Semiconductor Research Corporation program -
DARPA",
funding-text = "This work was supported in part by US National Science
Foundation 1730309/1719160/1500848 and by CRISP, one of
six centers in JUMP, a Semiconductor Research
Corporation program sponsored by DARPA.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "application data type; Arrays; Benchmark testing; Big
Data; big data problems; Cache Hierarchy; cache
hierarchy bottlenecks; cache storage; CPU-based
architecture; graph algorithms; graph analytics; Graph
Processing; graph processing workloads; graph theory;
Guidelines; high performance graph processing; in-depth
data-aware characterization; Layout; load-load
dependency chains; mathematics computing; Memory-Level
Parallelism; memory-level parallelism; microprocessor
chips; multicore architecture; multiprocessing systems;
parallel architectures; performance evaluation;
performance sensitivity; private L2 cache; Random
access memory; Sensitivity; shared L3 cache",
number-of-cited-references = "13",
ORCID-numbers = "Oh, Sang Min/0000-0001-7119-6934",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Basak:2018:ECC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Khatamifard:2018:NCC,
author = "S. Karen Khatamifard and Longfei Wang and Selcuk
K{\"o}se and Ulya R. Karpuzcu",
title = "A New Class of Covert Channels Exploiting Power
Management Vulnerabilities",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "201--204",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2860006",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Effective runtime power management requires hardware
activity to be tracked at a very fine granularity in
both space and time in order to meet diverse workload
performance requirements within a tight power budget.
As the available instantaneous power budget itself
represents a shared resource, this practically
translates into finding the optimal allocation of the
power budget among active tasks of execution. Covert
communication over a previously unexplored class of
channels thereby becomes possible, which forms the
focus of this paper.",
acknowledgement = ack-nhfb,
affiliation = "Khatamifard, SK (Reprint Author), Univ Minnesota,
Minneapolis, MN 55455 USA. Khatamifard, S. Karen;
Karpuzcu, Ulya R., Univ Minnesota, Minneapolis, MN
55455 USA. Wang, Longfei; Kose, Selcuk, Univ S Florida,
Tampa, FL 33620 USA.",
author-email = "khatami@umn.edu longfei@mail.usf.edu
ukarpuzc@umn.edu",
da = "2019-06-20",
doc-delivery-number = "HA2CO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF CAREER Award [CCF-1350451]; NSF/SRC
[CNS-1715286]; Cisco Systems Research Award",
funding-text = "This work is supported in part by the NSF CAREER Award
under Grant CCF-1350451, in part by the NSF/SRC Award
under Grant CNS-1715286, and in part by the Cisco
Systems Research Award.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "covert channels; covert communication; diverse
workload performance requirements; fine granularity;
Hardware; hardware activity; instantaneous power
budget; Monitoring; optimal allocation; power aware
computing; Power demand; Power management
vulnerabilities; power management vulnerabilities;
Power system management; runtime power management;
security of data; Software; System-on-chip; tight power
budget; Voltage control",
number-of-cited-references = "8",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Khatamifard:2018:NCC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kondguli:2018:BUS,
author = "Sushant Kondguli and Michael Huang",
title = "{Bootstrapping}: Using {SMT} Hardware to Improve
Single-Thread Performance",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "205--208",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2859945",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Decoupled look-ahead (DLA) architectures have been
shown to be an effective way to improve single-thread
performance. However, a default implementation requires
an additional core. While an SMT flavor is possible, a
naive implementation is inefficient and thus slow. In
this paper, we propose an optimized implementation
called Bootstrapping that makes DLA just as effective
on a single (SMT) core as using two cores. While fusing
two cores can improve single-thread performance by
1.23x, Bootstrapping provides a speedup of 1.51x.",
acknowledgement = ack-nhfb,
affiliation = "Kondguli, S (Reprint Author), Univ Rochester, Dept
Elect \& Comp Engn, Rochester, NY 14627 USA. Kondguli,
Sushant; Huang, Michael, Univ Rochester, Dept Elect \&
Comp Engn, Rochester, NY 14627 USA.",
author-email = "sushant.kondguli@rochester.edu
michael.huang@rochester.edu",
da = "2019-06-20",
doc-delivery-number = "HA2CO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [1514433, 1533842]",
funding-text = "This work is supported in part by NSF under grants
1514433 and 1533842.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "bootstrapping; Computer architecture; Context;
Decoupled look-ahead (DLA) architectures; decoupled
look-ahead architectures; DLA architecture;
multi-threading; multiprocessing systems; optimisation;
optimized implementation; Prefetching; Resource
management; simultaneous multi-threading (SMT); single
core; single thread performance; single-thread
performance; Skeleton; SMT hardware; Substrates",
number-of-cited-references = "20",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Kondguli:2018:BUS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kline:2018:CAR,
author = "Donald {Kline, Jr.} and Rami Melhem and Alex K.
Jones",
title = "Counter Advance for Reliable Encryption in Phase
Change Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "209--212",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2861012",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The use of hardware encryption and new memory
technologies such as phase change memory (PCM) are
gaining popularity in a variety of server applications
such as cloud systems. While PCM provides energy and
density advantages over conventional DRAM memory, it
faces endurance challenges. Such challenges are
exacerbated when employing memory encryption, as the
stored data is essentially randomized, losing data
locality and reducing or eliminating the effectiveness
of energy and endurance aware encoding techniques. This
results in increasing dynamic energy consumption and
accelerated wear out. In this paper we propose counter
advance, a technique to leverage the process of
encryption to improve reliability and lifetime while
maintaining low-energy and low-latency operation.
Counter advance is compatible with standard
error-correction codes (ECC) and error correction
pointers (ECP), the standard for mitigating endurance
faults in PCM. Counter advance achieves the same fault
tolerance using three ECP pointers at a 10^-4 cell
failure rate, compared to the leading approach that
considers energy savings and reliability for encrypted
PCM (SECRET) using five ECP pointers. At a failure rate
of 10^-2, counter advance can achieve an uncorrectable
bit error rate (UBER) of 10^-1, compared to < 10^-4
for SECRET using six ECP pointers. This leads to a
lifetime improvement of 3.8x while maintaining
comparable energy consumption and access latency.",
acknowledgement = ack-nhfb,
affiliation = "Kline, D (Reprint Author), Univ Pittsburgh, Dept Elect
\& Comp Engn, Pittsburgh, PA 15260 USA. Kline, Donald,
Jr.; Jones, Alex K., Univ Pittsburgh, Dept Elect \&
Comp Engn, Pittsburgh, PA 15260 USA. Melhem, Rami, Univ
Pittsburgh, Dept Comp Sci, Pittsburgh, PA 15260 USA.",
author-email = "dek61@pitt.edu melhem@cs.pitt.edu akjones@pitt.edu",
da = "2019-06-20",
doc-delivery-number = "HA2CO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [1747452]; IUCRC Program of the
National Science Foundation [CNS-1738783]; SHREC",
funding-text = "This work was supported by NSF Graduate Research
Fellowship award number 1747452, and SHREC industry and
agency members and by the IUCRC Program of the National
Science Foundation (Grant No. CNS-1738783).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "and error correction; Computer architecture; Emerging
memories; Encryption; error correction; Error
correction; Memory management; Microprocessors; Phase
change materials; reliability; stuck-at faults",
number-of-cited-references = "16",
oa = "Bronze",
ORCID-numbers = "Kline, Jr, Donald/0000-0002-4414-1513",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Kline:2018:CAR",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Sahoo:2018:RRD,
author = "Debiprasanna Sahoo and Swaraj Sha and Manoranjan
Satpathy and Madhu Mutyam",
title = "{ReDRAM}: a Reconfigurable {DRAM} Cache for {GPGPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "213--216",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2865552",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Hardware-based DRAM cache techniques for GPGPUs
propose to use GPU DRAM as a cache of the host (system)
memory. However, these approaches do not exploit the
opportunity of allocating store-before-load data (data
that is written before being read by GPU cores) on GPU
DRAM that would save multiple CPU-GPU transactions. In
this context, we propose ReDRAM, a novel memory
allocation strategy for GPGPUs which re-configures GPU
DRAM cache as a heterogeneous unit. It allows
allocation of store-before-load data directly onto GPU
DRAM and also utilizes it as a cache of the host
memory. Our simulation results using a modified version
of GPGPU-Sim show that ReDRAM can improve performance
for applications that use store-before-load data by
57.6 percent (avg.) and 4.85x (max.) when compared to
the existing proposals on state-of-the-art GPU DRAM
caches.",
acknowledgement = ack-nhfb,
affiliation = "Sahoo, D (Reprint Author), Indian Inst Technol
Bhubaneswar, Bhubaneswar 751013, Odisha, India. Sahoo,
Debiprasanna; Sha, Swaraj; Satpathy, Manoranjan, Indian
Inst Technol Bhubaneswar, Bhubaneswar 751013, Odisha,
India. Mutyam, Madhu, Indian Inst Technol Madras,
Madras 600036, Tamil Nadu, India.",
author-email = "debiprasanna.sahoo@gmail.com ss24@iitbbs.ac.in
manoranjan@iitbbs.ac.in madhu@cse.iitm.ac.in",
da = "2019-06-20",
doc-delivery-number = "HA2CO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Arrays; cache storage; CPU-GPU communication; DRAM
cache; DRAM chips; efficiency 57.6 percent; GPGPU;
GPGPU-Sim; GPGPUs; GPU cores; GPU DRAM cache; graphics
processing units; Graphics processing units; Hardware;
hardware-based DRAM cache techniques; heterogeneous
unit; host memory; memory allocation strategy; Memory
management; multiple CPU-GPU transactions; Random
access memory; reconfigurable DRAM cache; ReDRAM;
resource allocation; Resource management;
store-before-load; store-before-load data allocation;
tagless",
number-of-cited-references = "16",
ORCID-numbers = "Mutyam, Madhu/0000-0003-1638-4195 Sahoo,
Debiprasanna/0000-0003-1438-0617",
research-areas = "Computer Science",
researcherid-numbers = "Mutyam, Madhu/B-1717-2012",
times-cited = "0",
unique-id = "Sahoo:2018:RRD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Mashimo:2018:VMS,
author = "Susumu Mashimo and Ryota Shioya and Koji Inoue",
title = "{VMOR}: Microarchitectural Support for Operand Access
in an Interpreter",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "217--220",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2866243",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "Dynamic scripting languages become very popular for
high productivity. However, many of these languages
have significant runtime overheads because they employ
interpreter-based virtual machines. One of the major
overheads for the interpreter is derived from operand
accesses, which significantly increase memory accesses.
We propose VMOR, microarchitectural support for the
operand accesses in the interpreter. VMOR remaps
operand values into floating-point physical registers,
which are rarely used in the interpreter, and thus
effectively reduces the memory accesses.",
acknowledgement = ack-nhfb,
affiliation = "Mashimo, S (Reprint Author), Kyushu Univ, Fukuoka,
Fukuoka 8190395, Japan. Mashimo, Susumu; Inoue, Koji,
Kyushu Univ, Fukuoka, Fukuoka 8190395, Japan. Shioya,
Ryota, Nagoya Univ, Nagoya, Aichi 4648601, Japan.",
author-email = "susumu.mashimo@cpc.ait.kyushu-u.ac.jp
shioya@nuee.nagoya-u.ac.jp inoue@ait.kyushu-u.ac.jp",
da = "2019-06-20",
doc-delivery-number = "HA2CO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "JSPS KAKENHI [JP17J10388]",
funding-text = "This work was supported by JSPS KAKENHI Grant Number
JP17J10388.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "authoring languages; Cryptography; Dynamic scripting
language; dynamic scripting languages; floating-point
physical registers; Hardware; high productivity;
interpreter; interpreter-based virtual machines; memory
accesses; microarchitectural support;
Microarchitecture; operand access; operand values;
Pipelines; Productivity; program interpreters;
Proposals; Registers; virtual machines; VMOR",
number-of-cited-references = "10",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Mashimo:2018:VMS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Min:2018:SCD,
author = "Seungwon Min and Mohammad Alian and Wen-Mei Hwu and
Nam Sung Kim",
title = "Semi-Coherent {DMA}: an Alternative {I/O} Coherency
Management for Embedded Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "221--224",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2866568",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Many modern embedded CPUs adopt Non-Coherent DMA
(NC-DMA) over Coherent DMA (C-DMA) because of
simplicity. An NC-DMA design, however, requires a CPU
device driver to explicitly invalidate or flush a wide
range of cache space. When an I/O DMA device writes
data to a main memory region, the CPU needs to
invalidate the cache space corresponding to the same
memory region twice: (1) to prevent dirty cache lines
from overwriting the DMA data and (2) to remove any
cache lines prefetched before the DMA is done. In this
work, we first show that such explicit invalidations
consume 31 percent of CPU cycles, limiting the data
transfer throughput of a high-speed network interface
card (NIC) when receiving network packets. Second, we
propose a Semi-Coherent DMA (SC-DMA) architecture for
improving the efficiency of NC-DMA with a slight
modification to the hardware. Specifically, our SC-DMA
records the DMA region and prohibits any data that is
prefetched from the region from entering the cache,
reducing nearly 50 percent of the unnecessary
invalidations. Lastly, we identify several software
optimizations that can substantially reduce excessive
cache invalidations prevalent in NIC drivers. Our
evaluation with NVIDIA Jetson TX2 shows that our
proposed SC-DMA design with the NIC driver
optimizations can improve the NIC data transfer
throughput by up to 53.3 percent.",
acknowledgement = ack-nhfb,
affiliation = "Kim, NS (Reprint Author), Univ Illinois, Elect \& Comp
Engn, Urbana, IL 61820 USA. Min, Seungwon; Alian,
Mohammad; Hwu, Wen-Mei; Kim, Nam Sung, Univ Illinois,
Elect \& Comp Engn, Urbana, IL 61820 USA.",
author-email = "min16@illinois.edu malian2@illinois.edu
w-hwu@illinois.edu nskim@illinois.edu",
da = "2019-06-20",
doc-delivery-number = "HA2CO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "SRC/JUMP Applications Driving Architectures
(ADA) Research Center; IBM-ILLINOIS Center for
Cognitive Computing Systems Research (C3SR)",
funding-text = "This work is supported in part by grants from SRC/JUMP
Applications Driving Architectures (ADA) Research
Center and IBM-ILLINOIS Center for Cognitive Computing
Systems Research (C3SR).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; C-DMA; Cache; cache lines; cache space;
cache storage; coherency management; coherent DMA;
Computer architecture; CPU cycles; CPU device driver;
Data transfer; device drivers; Device drivers; DMA
data; DMA device; DMA region; embedded CPUs; embedded
processor; embedded systems; Embedded systems; embedded
systems; Ethernet; excessive cache invalidations;
Hardware; high-speed network interface card; Internet
of Things; main memory region; microprocessor chips;
multiprocessing systems; NC-DMA design; NIC data
transfer throughput; noncoherent DMA; Prefetching;
SC-DMA design; SC-DMA records; semicoherent DMA
architecture",
number-of-cited-references = "16",
ORCID-numbers = "Min, Seung Won/0000-0001-7195-7182",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Min:2018:SCD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Nematollahi:2018:NSD,
author = "Negin Nematollahi and Mohammad Sadrosadati and Hajar
Falahati and Marzieh Barkhordar and Hamid
Sarbazi-Azad",
title = "{Neda}: Supporting Direct Inter-Core Neighbor Data
Exchange in {GPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "225--229",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2873679",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Image processing applications employ various fitters
for several purposes, such as enhancing the images and
extracting the features. Recent studies show that
filters in image processing applications take a
substantial amount of the execution time, and it is
crucial to boost their performance to improve the
overall performance of the image processing
applications. Image processing filters require a
significant amount of data sharing among threads which
are in charge of filtering neighbor pixels. Graphics
Processing Units (GPUs) attempt to satisfy the demand
of data sharing by providing the scratch-pad memory,
shuffle instructions, and on-chip caches. However, we
observe that these mechanisms are insufficient to
provide a fast and energy-efficient neighbor data
sharing for the image processing filters. In this
paper, we propose a new hardware/software co-design
mechanism for GPUs, to effectively provide a fast and
energy-efficient register-level neighbor data sharing
for the image filters. We propose a neighbor data
exchange mechanism, called Neda, that adds a register
to each streaming processor (SP) which can be accessed
by its neighboring SPs. Our experimental results show
that Neda improves the performance and energy
consumption by 12.4 and 13.5 percent, on average,
respectively, compared to the NVIDIA SDK implementation
of image processing filters. Moreover, Neda's
performance is within 9.3 percent of the ideal GPU with
zero latency neighbor data exchange capability.",
acknowledgement = ack-nhfb,
affiliation = "Nematollahi, N (Reprint Author), Sharif Univ Technol,
Dept Comp Engn, Tehran 111559517, Iran. Nematollahi,
Negin; Sadrosadati, Mohammad; Barkhordar, Marzieh;
Sarbazi-Azad, Hamid, Sharif Univ Technol, Dept Comp
Engn, Tehran 111559517, Iran. Falahati, Hajar;
Sarbazi-Azad, Hamid, Inst Res Fundamental Sci, Comp Sci
Sch, Tehran 193955531, Iran.",
author-email = "negin.mahani@gmail.com m.sadr89@gmail.com
hfalahati@ipm.ir marzieh.barkhordar@gmail.com
azad@sharif.edu",
da = "2019-06-20",
doc-delivery-number = "HA2CO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; direct inter-core neighbor data
exchange mechanism; efficiency 13.5 percent; efficiency
9.3 percent; electronic data interchange; energy
consumption; energy-efficient neighbor data sharing;
energy-efficient register-level neighbor data sharing;
fast energy-efficient neighbor data; feature
extraction; GPUs; Graphics processing units; graphics
processing units; hardware-software co-design
mechanism; hardware-software codesign; image
enhancement; image filtering; image filters; Image
processing; image processing applications; image
processing filters; Instruction sets; inter-core
communication; Microsoft Windows; Neda; neighbor data
exchange; NVIDIA SDK implementation; on-chip caches;
Registers; scratch-pad memory; shuffle instructions;
spatial image processing filters; streaming processor;
Two dimensional displays; zero latency neighbor data
exchange capability",
keywords-plus = "MEAN FILTERS; IMAGE; DOMAIN",
number-of-cited-references = "40",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Nematollahi:2018:NSD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Omar:2018:MRI,
author = "Hamza Omar and Halit Dogan and Brian Kahne and Omer
Khan",
title = "Multicore Resource Isolation for Deterministic,
Resilient and Secure Concurrent Execution of
Safety-Critical Applications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "230--234",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2874216",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Multicores increasingly deploy spatial execution of
safety-critical applications that demand a
deterministic, resilient, and secure environment to
meet the safety standards. However, multicores
aggressively share hardware resources, which leads to
non-deterministic performance due to destructive
interference from concurrent applications. Resource
sharing not only hinders efficient resilient execution,
but also introduces security vulnerabilities due to
information leakage on side-channels. This work
proposes a novel multicore framework that constructs
isolated clusters of cores for each concurrent
application. It guarantees concurrent applications with
deterministic performance, as well as an efficient
execution environment for resiliency and security.
Moreover, the framework allows dynamic re-sizing of
cluster sizes for load balanced execution of concurrent
applications. However, it leads to diminished isolation
between clusters, which opens various
performance-resilience and performance-security
tradeoffs.",
acknowledgement = ack-nhfb,
affiliation = "Khan, O (Reprint Author), Univ Connecticut, Dept Elect
\& Comp Engn, Storrs, CT 06269 USA. Omar, Hamza; Dogan,
Halit; Khan, Omer, Univ Connecticut, Dept Elect \& Comp
Engn, Storrs, CT 06269 USA. Kahne, Brian, NXP Semicond
Inc, Automot Microcontrollers \& Processors, Austin, TX
78735 USA.",
author-email = "hamza.omar@uconn.edu halit.dogan@uconn.edu
brian.kahne@nxp.com omer.khan@uconn.edu",
da = "2019-06-20",
doc-delivery-number = "HA2CO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [CCF-1550470,
CNS-1718481]",
funding-text = "This research was partially supported by the National
Science Foundation under Grants No. CCF-1550470 and
CNS-1718481.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "concurrency control; concurrent application;
deterministic performance; Hardware; hardware resource
sharing; hardware resources; Interference; load
balanced execution; Multicore; multicore framework;
Multicore processing; multicore resource isolation;
multicores; multiprocessing systems; nondeterministic
performance; Program processors; resilience;
Resilience; resilience; resource allocation;
safety-critical applications; safety-critical systems;
secure environment; security; Security; security;
security of data; security vulnerabilities;
side-channels; spatial execution; System-on-chip",
number-of-cited-references = "20",
ORCID-numbers = "Khan, Omer/0000-0001-6293-7403",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Omar:2018:MRI",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Zokaee:2018:APM,
author = "Farzaneh Zokaee and Hamid R. Zarandi and Lei Jiang",
title = "{AligneR}: a Process-in-Memory Architecture for Short
Read Alignment in {ReRAMs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "235--238",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2854700",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Genomics is the key to enable the personal
customization of medical care. How to fast and
energy-efficiently analyze the huge amounts of genomic
sequence data generated by next generation sequencing
technologies has become one of the most significant
challenges facing genomics today. Existing hardware
platforms achieve low genome sequencing throughput with
significant hardware and power overhead. In this paper,
we propose AligneR, a ReRAM-based process-in-memory
architecture, to accelerate the bottleneck of genome
sequencing, i.e., short read alignment. Compared to
state-of-the-art accelerators, AligneR improves the
short read alignment throughput per Watt per mm^2 by
13x.",
acknowledgement = ack-nhfb,
affiliation = "Zokaee, F (Reprint Author), Indiana Univ, Bloomington,
IN 47405 USA. Zokaee, Farzaneh; Jiang, Lei, Indiana
Univ, Bloomington, IN 47405 USA. Zokaee, Farzaneh;
Zarandi, Hamid R., Amirkabir Univ Technol, Tehran
158754413, Iran.",
author-email = "f\_zokaee@aut.ac.ir h\_zarandi@aut.ac.ir
jiang60@iu.edu",
da = "2019-06-20",
doc-delivery-number = "HE6YC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bioinformatics; Computer architecture; FM-index;
Genome sequencing; Genomics; Memory management;
Microprocessors; process-in-memory; Random access
memory; ReRAM; Sequential analysis; short read
alignment; Throughput",
number-of-cited-references = "19",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Zokaee:2018:APM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
xxpages = "237--240",
}
@Article{Lou:2018:BSB,
author = "Qian Lou and Lei Jiang",
title = "{BRAWL}: a Spintronics-Based Portable
Basecalling-in-Memory Architecture for Nanopore Genome
Sequencing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "239--242",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2882384",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Nanopore sequencing is one of the most promising
genome sequencing technologies because of its ability
to produce ultra long reads and provide portability.
Basecalling, the most time-consuming step in the whole
flow of Nanopore genome sequencing, translates analog
signals to digital DNA symbols. The state-of-the-art
basecaller relies on a complex neural network
consisting of convolutional, long short-term memory and
fully-connected layers, and a CTC decoder. Existing
neural network portable accelerators achieve low
basecalling throughput per Watt when processing such
neural network inferences. In this paper, we propose
BRAWL, a portable Basecalling-in-memory architecture,
to translate RAW electrical signals to digital DNA
symbols in SOT-MRAMs for Nanopore portable sequencers.
Compared to state-of-the-art accelerators, BRAWL
improves basecalling throughput per Watt by 3.88x.",
acknowledgement = ack-nhfb,
affiliation = "Jiang, L (Reprint Author), Indiana Univ, Bloomington,
IN 47405 USA. Lou, Qian; Jiang, Lei, Indiana Univ,
Bloomington, IN 47405 USA.",
author-email = "louqian@iu.edu jiang60@iu.edu",
da = "2019-06-20",
doc-delivery-number = "HE6YC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Artificial neural networks; basecaller;
Bioinformatics; Computer architecture; DNA; genome
sequencing; Genomics; Microprocessors; Oxford nanopore
technology; process-in-memory; Sequential analysis;
SOT-MRAM",
keywords-plus = "PERFORMANCE; ENERGY",
number-of-cited-references = "26",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Lou:2018:BSB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
xxpages = "241--244",
}
@Article{Min:2018:AAB,
author = "Donghyun Min and Donggyu Park and Jinwoo Ahn and Ryan
Walker and Junghee Lee and Sungyong Park and Youngjae
Kim",
title = "{Amoeba}: an Autonomous Backup and Recovery {SSD} for
Ransomware Attack Defense",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "243--246",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2883431",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Ransomware is one of growing concerns in enterprise
and government organizations, because it may cause
financial damages or loss of important data. Although
there are techniques to detect and prevent ransomware,
an evolved ransomware may evade them because they are
based on monitoring known behaviors. Ransomware can be
mitigated if backup copies of data are retained in a
safe place. However, existing backup solutions may be
under ransomware's control and an intelligent
ransomware may destroy backup copies too. They also
incur overhead to storage space, performance and
network traffic (in case of remote backup). In this
paper, we propose an SSD system that supports automated
backup, called Amoeba. In particular, Amoeba is armed
with a hardware accelerator that can detect the
infection of pages by ransomware attacks at high speed
and a fine-grained backup control mechanism to minimize
space overhead for original data backup. For
evaluation, we extended the Microsoft SSD simulator to
implement Amoeba and evaluated it using the realistic
block-level traces, which are collected while running
the actual ransomware. According to our experiments,
Amoeba has negligible overhead and outperforms the
state-of-the-art SSD FlashGuard, which supports data
backup within the device, in both performance and
space efficiency.",
acknowledgement = ack-nhfb,
affiliation = "Kim, Y (Reprint Author), Sogang Univ, Seoul 04107,
South Korea. Min, Donghyun; Park, Donggyu; Ahn, Jinwoo;
Park, Sungyong; Kim, Youngjae, Sogang Univ, Seoul
04107, South Korea. Walker, Ryan; Lee, Junghee, Univ
Texas San Antonio, San Antonio, TX 78249 USA.",
author-email = "mdh38112@sogang.ac.kr dgpark@sogang.ac.kr
jinu37@sogang.ac.kr ryan.walker@utsa.edu
junghee.lee@utsa.edu parksy@sogang.ac.kr
youkim@sogang.ac.kr",
da = "2019-06-20",
doc-delivery-number = "HE6YC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Research Foundation of Korea (NRF)
--- Korea Government (MSIT) [NRF-2018R1A1A1A05079398]",
funding-text = "This work was supported by the National Research
Foundation of Korea (NRF) grant funded by the Korea
Government (MSIT) (No. NRF-2018R1A1A1A05079398).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Aerospace electronics; Amoeba; autonomous backup SSD;
autonomous recovery SSD; back-up procedures;
cryptography; Cryptography; data backup; Entropy;
FlashGuard; intelligent ransomware; invasive software;
Microsoft SSD simulator; Performance evaluation;
Ransomware; ransomware attack; ransomware attack
defense; Solid-state drive (SSD); SSD system; storage
management; storage security",
number-of-cited-references = "12",
ORCID-numbers = "Park, Sungyong/0000-0002-0309-1820 Min,
Donghyun/0000-0002-6043-9264",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Min:2018:AAB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
xxpages = "245--248",
}
@Article{Kim:2018:HBP,
author = "Chinam Kim and Hyukjun Lee",
title = "A High-Bandwidth {PCM}-Based Memory System for Highly
Available {IP} Routing Table Lookup",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "17",
number = "2",
pages = "246--249",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2883461",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Achieving higher availability is an unending challenge
in router architecture, as process technology scales
down and more random logic/memory errors must be
tolerated. However, meeting extremely high targets that
require only a few seconds of yearly downtime puts even
more pressure on the design of already complex router
architecture. In this paper, we explore the case of
storing the routing table in non-volatile memory, to
drastically reduce the router downtime and achieve
higher availability-without degrading lookup
performance. We propose a new MLC PCM architecture,
featuring decoupled node access and logically managed
duplicate bank groups, that fetches the right amount of
information from the most available bank. Performance
evaluation shows that we achieve an average of 9.9
percent bandwidth improvement over the DRAM baseline
system, and an 83.9 percent improvement over the PCM
baseline.",
acknowledgement = ack-nhfb,
affiliation = "Lee, H (Reprint Author), Sogang Univ, Dept Comp Sci \&
Engn, Seoul 04107, South Korea. Kim, Chinam; Lee,
Hyukjun, Sogang Univ, Dept Comp Sci \& Engn, Seoul
04107, South Korea.",
author-email = "chinamkim@sogang.ac.kr hyukjunl@sogang.ac.kr",
da = "2019-06-20",
doc-delivery-number = "HE6YC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Samsung Electronics",
funding-text = "This research is funded by Samsung Electronics. The
corresponding author is Hyukjun Lee.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; complex router architecture; decoupled node
access; DRAM baseline system; DRAM chips; duplicate
bank groups; high-bandwidth PCM-based memory system;
highly available IP routing table lookup; IP networks;
IP routing table lookup; MLC PCM architecture; Network
architecture; nonvolatile memory; PCM baseline; Phase
change materials; phase change memories; Phase change
memory; process technology; processing-in-memory;
Random access memory; random logic errors; random
memory errors; router downtime reduction; Routing;
table lookup; Table lookup; telecommunication network
routing",
number-of-cited-references = "13",
ORCID-numbers = "Kim, Chinam/0000-0002-7984-2643",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Kim:2018:HBP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
xxpages = "247--250",
}
@Article{Kim:2019:IGM,
author = "Jiho Kim and Jehee Cha and Jason Jong Kyu Park and
Dongsuk Jeon and Yongjun Park",
title = "Improving {GPU} Multitasking Efficiency Using Dynamic
Resource Sharing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "1--5",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2889042",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "As GPUs have become essential components for embedded
computing systems, a shared GPU with multiple CPU cores
needs to efficiently support concurrent execution of
multiple different applications. Spatial multitasking,
which assigns a different number of streaming
multiprocessors (SMs) to multiple applications, is one
of the most common solutions for this. However, this is
not a panacea for maximizing total resource
utilization. This is because an SM consists of many
different sub-resources such as caches, execution units
and scheduling units, and the requirements of the
sub-resources per kernel are not well matched to their
fixed sizes inside an SM. To solve the resource
requirement mismatch problem, this paper proposes a GPU
Weaver, a dynamic sub-resource management system of
multitasking GPUs. GPU Weaver can maximize sub-resource
utilization through a shared resource controller (SRC)
that is added between neighboring SMs. The SRC
dynamically identifies idle sub-resources of an SM and
allows them to be used by the neighboring SM when
possible. Experiments show that the combination of
multiple sub-resource borrowing techniques enhances the
total throughput by up to 26 percent, and by 9.5 percent
on average, over the baseline spatial multitasking GPU.",
acknowledgement = ack-nhfb,
affiliation = "Park, Y (Reprint Author), Hanyang Univ, Seoul 04763,
South Korea. Kim, Jiho; Cha, Jehee, Hongik Univ, Seoul
04066, South Korea. Park, Jason Jong Kyu, Univ
Michigan, Ann Arbor, MI 48109 USA. Jeon, Dongsuk, Seoul
Natl Univ, Seoul 151742, South Korea. Park, Yongjun,
Hanyang Univ, Seoul 04763, South Korea.",
author-email = "jihokimhi@gmail.com carjehee@gmail.com
jasonjk@umich.edu djeon1@snu.ac.kr
yongjunpark@hanyang.ac.kr",
da = "2019-06-20",
doc-delivery-number = "HI0TZ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Research Foundation of Korea (NRF)
--- Korea government (MSIP) [NRF-2015R1C1A1A01053844,
NRF-2016R1C1B2016072]; ICT R\&D program of MSIP/IITP
[2017-0-00142]; R\&D program of MOTIE/KEIT [10077609]",
funding-text = "This work was supported in part by the National
Research Foundation of Korea (NRF) grant funded by the
Korea government (MSIP) (NO. NRF-2015R1C1A1A01053844,
NO. NRF-2016R1C1B2016072), ICT R\&D program of
MSIP/IITP (No. 2017-0-00142), and the R\&D program of
MOTIE/KEIT (No. 10077609).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; dynamic resource sharing;
dynamic sub-resource management system; embedded
computing systems; embedded systems; GPU multitasking
efficiency; GPU Weaver; GPUs; graphics processing
units; Graphics processing units; Instruction sets;
Kernel; Micromechanical devices; multi-programmed;
multiple CPU cores; multiple sub-resource borrowing
techniques; multiprogramming; Multitasking;
multitasking GPUs; resource allocation; Resource
management; resource requirement mismatch problem;
resource sharing; scheduling; scheduling units; shared
GPU; shared resource controller; spatial multitasking;
SRC; streaming multiprocessors; sub-resource
utilization; total resource utilization; Weaving",
number-of-cited-references = "19",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Kim:2019:IGM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2019:IIC,
author = "Anonymous",
title = "2018 Index {{\booktitle{IEEE Computer Architecture
Letters}}} Vol. 17",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "1--8",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2901240",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Presents the 2018 subject/author index for this
publication.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Xu:2019:PFD,
author = "Sheng Xu and Xiaoming Chen and Ying Wang and Yinhe Han
and Xuehai Qian and Xiaowei Li",
title = "{PIMSim}: a Flexible and Detailed Processing-in-Memory
Simulator",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "6--9",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2885752",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "With the advent of big data applications and new
process technologies, processing-in-memory (PIM) has
attracted much attention as architecture research
gradually shifts from processors toward heterogeneous
systems. Reliable and efficient PIM architecture
modeling has become increasingly important for
researchers who want to study critical issues through
detailed implementations of their proposed PIM designs.
This paper proposes PIMSim, a full-system and
highly-configurable PIM simulator that facilitates
circuit-, architecture-, and system-level research.
PIMSim enables architectural simulation of PIM and
implements three simulation modes that provide a wide
range of speed/accuracy tradeoffs. It offers detailed
performance and energy models to simulate PIM-enabled
instructions, the compiler, in-memory processing logic,
various memory devices, and PIM coherence. PIMSim is
open source and available at
https://github.com/vineodd/PIMSim.",
acknowledgement = ack-nhfb,
affiliation = "Xu, S (Reprint Author), Chinese Acad Sci, Inst Comp
Technol, Beijing, Peoples R China. Xu, Sheng; Chen,
Xiaoming; Wang, Ying; Han, Yinhe; Li, Xiaowei, Chinese
Acad Sci, Inst Comp Technol, Beijing, Peoples R China.
Xu, Sheng; Li, Xiaowei, Univ Chinese Acad Sci, Beijing
101408, Peoples R China. Qian, Xuehai, Univ Southern
Calif, Los Angeles, CA 90007 USA.",
author-email = "xusheng02@ict.ac.cn chenxiaoming@ict.ac.cn
wangying2009@ict.ac.cn yinhes@ict.ac.cn
xuehai.qian@usc.edu lxw@ict.ac.cn",
da = "2019-06-20",
doc-delivery-number = "HI0TZ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Natural Science Foundation of
China (NSFC) [61522406, 61834006, 61521092]; Beijing
Municipal Science \& Technology Commission
[Z171100000117019, Z181100008918006]; Strategic
Priority Research Program of the Chinese Academy of
Sciences [XDPB12]; Innovative Project of Institute of
Computing Technology, CAS [5120186140]",
funding-text = "This work was supported in part by National Natural
Science Foundation of China (NSFC) under grants
61522406, 61834006, and 61521092, Beijing Municipal
Science \& Technology Commission (Z171100000117019,
Z181100008918006), Strategic Priority Research Program
of the Chinese Academy of Sciences (XDPB12), and an
Innovative Project of Institute of Computing
Technology, CAS, under Grant 5120186140.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architectural simulation; Big Data; big data
applications; Coherence; Computational modeling;
Computer architecture; Data models; energy models;
heterogeneous aspects; heterogeneous computing;
in-memory processing logic; Kernel; memory
architecture; memory devices; memory research; memory
system; performance evaluation; PIM coherence; PIM
designs; PIM simulator; PIM-enabled instructions;
PIMSim; Process-in-Memory; Processing-in-memory;
processing-in-memory simulator; Program processors;
reliable PIM architecture modeling; simulation modes;
simulator; system-level researches; Tools",
number-of-cited-references = "22",
ORCID-numbers = "Wang, Ying/0000-0001-5172-4736",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Xu:2019:PFD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Shomron:2019:SCV,
author = "Gil Shomron and Uri Weiser",
title = "Spatial Correlation and Value Prediction in
Convolutional Neural Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "10--13",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2890236",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Convolutional neural networks (CNNs) are a widely used
form of deep neural networks, achieving
state-of-the-art results for different problems such as
image classification, computer vision tasks, and speech
recognition. However, CNNs are compute intensive,
requiring billions of multiply-accumulate (MAC)
operations per input. To reduce the number of MACs in
CNNs, we propose a value prediction method that
exploits the spatial correlation of zero-valued
activations within the CNN output feature maps, thereby
saving convolution operations. Our method reduces the
number of MAC operations by 30.4 percent, averaged on
three modern CNNs for ImageNet, with top-1 accuracy
degradation of 1.7 percent, and top-5 accuracy
degradation of 1.1 percent.",
acknowledgement = ack-nhfb,
affiliation = "Shomron, G (Reprint Author), Technion Israel Inst
Technol, IL-3200003 Haifa, Israel. Shomron, Gil;
Weiser, Uri, Technion Israel Inst Technol, IL-3200003
Haifa, Israel.",
author-email = "gilsho@tx.technion.ac.il
uri.weiser@ee.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "HI0TZ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "CNNs; computer vision; computer vision tasks;
Convolution; convolutional neural nets; convolutional
neural network; convolutional neural networks;
Correlation; Deep neural networks; deep neural
networks; Degradation; image classification; ImageNet;
learning (artificial intelligence); MAC operations;
Microsoft Windows; multiply-accumulate operations;
Neural networks; Predictive models; spatial
correlation; speech recognition; value prediction;
value prediction method; zero-valued activations",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Shomron:2019:SCV",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
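
A minimal sketch of the idea in Shomron:2019:SCV, assuming a toy
left/upper-neighbor rule: an output activation is predicted to be zero when
its already-computed neighbors are zero after ReLU, and the corresponding
multiply-accumulate (MAC) work is skipped. The function name and prediction
policy are illustrative assumptions, not the authors' exact mechanism.

import numpy as np

def conv2d_with_zero_prediction(x, w):
    """Single-channel 'valid' convolution + ReLU with a toy zero-prediction rule."""
    H, W = x.shape
    k = w.shape[0]
    out_h, out_w = H - k + 1, W - k + 1
    y = np.zeros((out_h, out_w))
    macs_done = macs_skipped = 0
    for i in range(out_h):
        for j in range(out_w):
            left = y[i, j - 1] if j > 0 else None
            up = y[i - 1, j] if i > 0 else None
            neighbors = [v for v in (left, up) if v is not None]
            if neighbors and all(v == 0.0 for v in neighbors):
                macs_skipped += k * k        # speculate a zero output, skip the MACs
                continue                     # y[i, j] stays 0
            y[i, j] = max(0.0, float(np.sum(x[i:i + k, j:j + k] * w)))
            macs_done += k * k
    return y, macs_done, macs_skipped

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    out, done, skipped = conv2d_with_zero_prediction(rng.standard_normal((16, 16)),
                                                     rng.standard_normal((3, 3)))
    print(f"MACs executed: {done}, MACs skipped: {skipped}")
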
@Article{Gupta:2019:DQL,
author = "Ujjwal Gupta and Sumit K. Mandal and Manqing Mao and
Chaitali Chakrabarti and Umit Y. Ogras",
title = "A Deep {Q}-Learning Approach for Dynamic Management of
Heterogeneous Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "14--17",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2892151",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Heterogeneous multiprocessor system-on-chips (SoCs)
provide a wide range of parameters that can be managed
dynamically. For example, one can control the type
(big/little), number and frequency of active cores in
state-of-the-art mobile processors at runtime. These
runtime choices lead to more than 10$ \times $ range in
execution time, 5$ \times $ range in power consumption,
and 50$ \times $ range in performance per watt.
Therefore, it is crucial to make optimum power
management decisions as a function of dynamically
varying workloads at runtime. This paper presents a
reinforcement learning approach for dynamically
controlling the number and frequency of active big and
little cores in mobile processors. We propose an
efficient deep Q-learning methodology to optimize the
performance per watt (PPW). Experiments using the Odroid
XU3 mobile platform show that the PPW achieved by the
proposed approach is within 1 percent of the optimal
value obtained by an oracle.",
acknowledgement = ack-nhfb,
affiliation = "Mandal, SK (Reprint Author), Arizona State Univ, Sch
Elect Comp \& Energy Engn, Tempe, AZ 85281 USA. Gupta,
Ujjwal; Mandal, Sumit K.; Mao, Manqing; Chakrabarti,
Chaitali; Ogras, Umit Y., Arizona State Univ, Sch Elect
Comp \& Energy Engn, Tempe, AZ 85281 USA.",
author-email = "ujjwal@asu.edu skmandal@asu.edu mmao7@asu.edu
chaitali@asu.edu umit@asu.edu",
da = "2019-06-20",
doc-delivery-number = "HL5MF",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CNS-1526562]; Semiconductor Research
Corp. [2721.001]",
funding-text = "This work was supported by NSF grant CNS-1526562 and
Semiconductor Research Corp. task 2721.001.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "active cores; deep Q-learning approach; Deep
reinforcement learning; dynamic management; execution
time; Frequency control; Heterogeneous multi-cores;
heterogeneous processors; Instruments; learning
(artificial intelligence); Memory management; mobile
computing; mobile processors; multiprocessing systems;
multiprocessor system-on-chips; Odroid XU3 mobile
platform show; optimum power management decisions;
power aware computing; power consumption; Power demand;
Power management; Power system management; PPW;
reinforcement learning approach; Runtime; SoCs;
system-on-chip; Training",
number-of-cited-references = "15",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Gupta:2019:DQL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
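
A rough illustration of the control loop in Gupta:2019:DQL, substituting
plain tabular Q-learning for the paper's deep Q-learning agent. The state
encoding (a discretized workload level), the action set, the synthetic
performance-per-watt reward, and all constants are assumptions made for this
example, not the authors' setup.

import random

ACTIONS = [(big, freq) for big in (0, 1, 2, 4) for freq in (0.6, 1.2, 1.8)]
LOADS = [0.5, 1.0, 2.0, 4.0]         # discretized workload levels (the "state")

def ppw_reward(load, big, freq):
    """Toy performance-per-watt model of a big.LITTLE SoC (purely synthetic)."""
    perf = min(load, (1 + big) * freq)        # throughput saturates at the offered load
    power = 0.5 + 0.8 * big * freq ** 2 + 0.2 * freq ** 2
    return perf / power

def train(episodes=2000, steps=20, alpha=0.1, gamma=0.9, eps=0.1):
    q = {}                                    # Q[(state, action)] -> value
    for _ in range(episodes):
        load = random.choice(LOADS)
        for _ in range(steps):
            if random.random() < eps:
                action = random.choice(ACTIONS)
            else:
                action = max(ACTIONS, key=lambda a: q.get((load, a), 0.0))
            reward = ppw_reward(load, *action)
            next_load = random.choice(LOADS)
            best_next = max(q.get((next_load, a), 0.0) for a in ACTIONS)
            old = q.get((load, action), 0.0)
            q[(load, action)] = old + alpha * (reward + gamma * best_next - old)
            load = next_load
    return q

if __name__ == "__main__":
    q = train()
    for load in LOADS:
        big, freq = max(ACTIONS, key=lambda a: q.get((load, a), 0.0))
        print(f"load {load}: run {big} big core(s) at {freq} GHz")
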
@Article{Rogers:2019:SLB,
author = "Samuel Rogers and Joshua Slycord and Ronak Raheja and
Hamed Tabkhi",
title = "Scalable {LLVM}-Based Accelerator Modeling in gem5",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "18--21",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2893932",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/python.bib",
abstract = "This article proposes scalable integrated system
architecture modeling for hardware accelerators based
on the gem5 simulation framework. The core of the
proposed modeling is an LLVM-based simulation engine
that models any customized data-path with respect to
the inherent data/instruction-level parallelism
(derived from the algorithms) and the available compute
units (defined by the user). The simulation framework
also offers a general-purpose communication interface
that allows a scalable and flexible connection into the
gem5 ecosystem through the Python API of gem5, enabling
modifications to the system hierarchy without the need
to rebuild the underlying simulator. Our simulation
framework currently supports full-system simulation
(both bare-metal and a full Linux kernel) for ARM-based
systems, with future plans to add support for RISC-V.
The LLVM-based modeling and modular integration into
gem5 allow long-term simulation expansion and
sustainable design modeling for emerging applications
with demands for acceleration.",
acknowledgement = ack-nhfb,
affiliation = "Rogers, S (Reprint Author), Univ North Carolina, Dept
Elect \& Comp Engn, Charlotte, NC 28223 USA. Rogers,
Samuel; Slycord, Joshua; Raheja, Ronak; Tabkhi, Hamed,
Univ North Carolina, Dept Elect \& Comp Engn, Charlotte,
NC 28223 USA.",
author-email = "sroger48@uncc.edu jslycord@uncc.edu rraheja@uncc.edu
htabkhiv@uncc.edu",
da = "2019-06-20",
doc-delivery-number = "HL5MF",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "application program interfaces; ARM-based systems;
Computational modeling; Computer architecture
simulation; customized data-path; Engines; field
programmable gate arrays; flexible connection;
full-system simulation; gem5 ecosystem; gem5 simulation
framework; general-purpose communication interface;
Hardware; hardware accelerator; hardware accelerators;
heterogeneous systems; inherent data; instruction-level
parallelism; Linux; LLVM-based modeling; LLVM-based
simulation engine; logic design; long-term simulation
expansion; microprocessor chips; multiprocessing
systems; parallel architectures; parallel programming;
program compilers; reduced instruction set computing;
Registers; RISC-V; Runtime; scalable connection;
scalable integrated system architecture modeling;
scalable LLVM-based accelerator modeling; Space
exploration; sustainable design modeling;
Synchronization; system hierarchy",
number-of-cited-references = "11",
ORCID-numbers = "Slycord, Joshua/0000-0002-0569-4094 Rogers,
Samuel/0000-0002-9697-2933",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Rogers:2019:SLB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Akin:2019:CAP,
author = "Berkin Akin and Alaa R. Alameldeen",
title = "A Case For Asymmetric Processing in Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "22--25",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2894800",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "By sidestepping the limitations at the memory
interface, processing-in-memory (PIM) unlocks
internally available memory bandwidth to the compute
units on the memory side. This abundant bandwidth is
conventionally utilized by highly-parallel
throughput-oriented many-core style PIM architectures
via offloading bandwidth-bound parallel tasks. However,
it can be difficult to fully isolate these PIM-suitable
tasks, and an offloaded program may include
compute-bound sequential phases. These PIM-averse
phases constitute a critical performance bottleneck for
conventional many-core style PIM architectures. In this
paper, we propose an analytical model for PIM execution
that considers a program's bandwidth demand as well as
its parallelism. Based on the proposed model, we make a
case for an asymmetric PIM architecture that can
mitigate the performance bottlenecks for PIM-averse
phases while keeping the performance upside for
PIM-suitable phases.",
acknowledgement = ack-nhfb,
affiliation = "Akin, B (Reprint Author), Intel Labs, Hillsboro, OR
97124 USA. Akin, Berkin; Alameldeen, Alaa R., Intel
Labs, Hillsboro, OR 97124 USA.",
author-email = "berkin.akin@intel.com alaa.r.alameldeen@intel.com",
da = "2019-06-20",
doc-delivery-number = "HL5MF",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Analytical models; analytical performance model;
asymmetric multicore; asymmetric PIM architecture;
asymmetric processing; Bandwidth; bandwidth-bound
parallel tasks; Computational modeling; compute-bound
sequential phases; critical performance bottleneck;
memory bandwidth; memory interface; microprocessor
chips; Multicore processing; multiprocessing systems;
parallel processing; performance evaluation; PIM
execution; PIM-averse phases; PIM-suitable tasks;
Processing in memory; processing-in-memory; Silicon;
Task analysis; throughput-oriented many-core style
PIM",
keywords-plus = "AMDAHLS LAW",
number-of-cited-references = "9",
ORCID-numbers = "Akin, Berkin/0000-0001-6908-5581",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Akin:2019:CAP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
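
The abstract of Akin:2019:CAP describes its analytical model only
qualitatively, so the sketch below is a back-of-the-envelope, roofline-style
stand-in that captures the same tension: a bandwidth-bound parallel phase
benefits from the many weak PIM cores, while a compute-bound serial phase
becomes the bottleneck. All parameter values and names are assumptions for
illustration, not the paper's model.

def exec_time(p, work, host_flops, pim_flops_per_core, pim_cores,
              host_bw, pim_bw, bytes_per_flop):
    """Return (host_time, pim_time) for a program with parallel fraction p."""
    def phase_time(flops, peak_flops, bw):
        # Each phase is limited either by compute or by memory bandwidth.
        return max(flops / peak_flops, flops * bytes_per_flop / bw)

    host = (phase_time(p * work, host_flops, host_bw) +
            phase_time((1 - p) * work, host_flops, host_bw))
    # Many-core PIM: wide parallel section, weak single core for the serial section.
    pim = (phase_time(p * work, pim_flops_per_core * pim_cores, pim_bw) +
           phase_time((1 - p) * work, pim_flops_per_core, pim_bw))
    return host, pim

if __name__ == "__main__":
    for p in (0.99, 0.90, 0.70):
        h, m = exec_time(p, work=1e12, host_flops=2e11, pim_flops_per_core=5e9,
                         pim_cores=64, host_bw=5e10, pim_bw=4e11, bytes_per_flop=4)
        print(f"parallel fraction {p:.2f}: PIM speedup {h / m:.2f}x")
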
@Article{Tovletoglou:2019:SIH,
author = "Konstantinos Tovletoglou and Lev Mukhanov and
Dimitrios S. Nikolopoulos and Georgios Karakonstantis",
title = "{Shimmer}: Implementing a Heterogeneous-Reliability
{DRAM} Framework on a Commodity Server",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "26--29",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2893189",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In this paper, we present the implementation of a
heterogeneous-reliability DRAM framework, Shimmer, on a
commodity server with a fully fledged OS. Shimmer
enables splitting of DRAM into multiple domains with
varying reliability and allocation of data depending on
their criticality. Compared to existing studies which
use simulators, we consider practical restrictions
stemming from the real hardware and investigate methods
to overcome them. In particular, we reveal that the
implementation of the heterogeneous-reliability memory
framework requires disabling of the hardware memory
interleaving, which results in a significant
degradation of the system performance. To overcome the
induced performance loss, we develop a software-based
interleaving. We evaluate the performance, power and
energy of the server using 35 benchmarks across three
memory configurations: the baseline configuration; the
configuration with disabled hardware memory
interleaving; and Shimmer with software-based memory
interleaving. Our results show
that Shimmer introduces a minor 6\% performance
overhead, while reducing the average DRAM power by
19.9\% when memory operates under relaxed refresh rate
and lowered memory supply voltage. As one of our main
contributions, we demonstrate that a
heterogeneous-reliability framework based on Shimmer
can be realized on a commodity server and save 9.1\% of
the total processor and memory energy.",
acknowledgement = ack-nhfb,
affiliation = "Tovletoglou, K (Reprint Author), Queens Univ Belfast,
Belfast BT7 1NN, Antrim, North Ireland. Tovletoglou,
Konstantinos; Mukhanov, Lev; Nikolopoulos, Dimitrios
S.; Karakonstantis, Georgios, Queens Univ Belfast,
Belfast BT7 1NN, Antrim, North Ireland.",
author-email = "ktovletoglou01@qub.ac.uk l.mukhanov@qub.ac.uk
d.nikolopoulos@qub.ac.uk g.karakonstantis@qub.ac.uk",
da = "2019-06-20",
doc-delivery-number = "HL5WL",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "European Union [688540]",
funding-text = "This work is funded by the H2020 Programme of the
European Union under grant no. 688540 (the UniServer
Project).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "average DRAM power; Bandwidth; commodity server;
critical data; disabled hardware memory interleaving;
DRAM; DRAM chips; energy saving; Hardware;
heterogeneous-reliability DRAM framework;
heterogeneous-reliability memory;
heterogeneous-reliability memory framework; induced
performance loss; integrated circuit reliability;
interleaved storage; lowered memory supply voltage;
memory configurations; memory interleaving; Memory
management; Power efficiency; Random access memory;
reliability; Reliability; Resource management; Servers; Shimmer;
software-based interleaving; software-based memory
interleaving",
number-of-cited-references = "17",
ORCID-numbers = "Nikolopoulos, Dimitrios/0000-0003-0217-8307
Tovletoglou, Konstantinos/0000-0002-1513-3143",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Tovletoglou:2019:SIH",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
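
Shimmer's software-based interleaving is not detailed in the abstract above;
the sketch below only illustrates the general idea of striping page frames
across per-domain memory channels in software once hardware interleaving is
disabled. The domain layout, frame counts, and names are assumptions, not the
paper's implementation.

def make_domain(base_frame, frames_per_channel, channels):
    """A 'domain' is a set of per-channel frame ranges sharing one reliability setting."""
    return [range(base_frame + c * frames_per_channel,
                  base_frame + (c + 1) * frames_per_channel)
            for c in range(channels)]

def interleave(domain):
    """Yield physical frame numbers striped round-robin across the domain's channels."""
    iters = [iter(ch) for ch in domain]
    while iters:
        for it in list(iters):
            try:
                yield next(it)
            except StopIteration:
                iters.remove(it)

if __name__ == "__main__":
    reliable = make_domain(base_frame=0,    frames_per_channel=4, channels=2)
    relaxed  = make_domain(base_frame=1024, frames_per_channel=4, channels=2)
    # Critical data would be allocated from the reliable domain; the rest from
    # the relaxed-refresh/low-voltage domain. Both are software-interleaved.
    print("reliable frame order:", list(interleave(reliable)))
    print("relaxed frame order :", list(interleave(relaxed)))
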
@Article{Kumar:2019:HRA,
author = "Chanchal Kumar and Sidharth Singh and Gregory T.
Byrd",
title = "Hybrid Remote Access Protocol",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "30--33",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2896116",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The invalidation-based cache coherence protocols used
in current CMPs result in inefficient utilization of the
cache hierarchy in the presence of heavy sharing, since
a significant percentage of shared cached data is
invalidated soon after it is brought into the private
cache. This work presents an analysis of a shared
memory cache coherence protocol; based on novel
insights from the analysis, we advocate direct remote
reads/writes at the shared last-level cache for heavily
contended data. Evaluation of our proposed protocol
with the Splash2x kernels shows 17 percent geometric
mean speedup over traditional MESI coherence and 8.5
percent better performance than prior remote-access
proposals.",
acknowledgement = ack-nhfb,
affiliation = "Kumar, C (Reprint Author), North Carolina State Univ,
Raleigh, NC 27695 USA. Kumar, Chanchal; Byrd, Gregory
T., North Carolina State Univ, Raleigh, NC 27695 USA.
Singh, Sidharth, North Carolina State Univ, Apple Inc,
Raleigh, NC 27695 USA.",
author-email = "ckumar2@ncsu.edu sssingh4@ncsu.edu gbyrd@ncsu.edu",
da = "2019-06-20",
doc-delivery-number = "HL5WL",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Access protocols; Benchmark testing; cache hierarchy;
cache storage; CMPs; Coherence; direct remote reads;
direct remote writes; geometric mean speedup; Hardware;
hybrid remote access protocol; invalidation-based cache
coherence protocols; Kernel; memory hierarchy; MESI
coherence; microprocessor chips; multi-core/single-chip
multiprocessors; Parallel architectures; private cache;
Proposals; protocols; shared cached data; shared
last-level cache; shared memory cache coherence
protocol; shared memory systems; Splash2x kernels",
number-of-cited-references = "10",
ORCID-numbers = "Byrd, Gregory/0000-0003-3647-8738",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Kumar:2019:HRA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Wang:2019:DDD,
author = "Yicheng Wang and Yang Liu and Peiyun Wu and Zhao
Zhang",
title = "Detect {DRAM} Disturbance Error by Using Disturbance
Bin Counters",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "34--37",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2897299",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "DRAM disturbance errors are increasingly a concern to
computer system reliability and security. There have
been a number of designs to detect and prevent them;
however, no existing design guarantees 100 percent
detection (no false negatives) with a small and fixed
hardware cost. This paper presents such a design
based on a novel idea called disturbance bin counter
(DBC). Each DBC is a complex counter that maintains an
upper bound of disturbances for a bin of DRAM rows.
Their access is not in the critical path of processor
execution and thus incurs no performance overhead. The
design is optimized at the circuit level to minimize
the storage requirement. Our simulation results using
multi-core SPEC CPU2006 workloads show that no false
positive occurs with a 1,024-entry DBC table, which
requires only 4.5 KB storage. The design can be
incorporated into a memory controller to guarantee the
detection of DRAM disturbance errors or row hammering
by malicious programs.",
acknowledgement = ack-nhfb,
affiliation = "Wang, YC (Reprint Author), Univ Illinois, Chicago, IL
60607 USA. Wang, Yicheng; Liu, Yang; Wu, Peiyun; Zhang,
Zhao, Univ Illinois, Chicago, IL 60607 USA.",
author-email = "ywang271@uic.edu yliu327@uic.edu pwu27@uic.edu
zhangz@uic.edu",
da = "2019-06-20",
doc-delivery-number = "HL5WL",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "US National Science Foundation
[CCF-1618104, CCF-1643271]",
funding-text = "The authors appreciate the constructive comments from
the anonymous reviewers. This work is supported in part
by the US National Science Foundation under grants
CCF-1618104 and CCF-1643271.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "complex counter; Computer architecture; computer
system reliability; counting circuits; DBC table;
disturbance bin counter; DRAM; DRAM chips; DRAM
disturbance errors; DRAM rows; fixed hardware cost;
Hash functions; Indexes; malicious programs; memory
size 4.5 KByte; Microprocessors; Random access memory;
reliability; row-hammering; Transistors; Upper bound",
number-of-cited-references = "10",
ORCID-numbers = "Wu, Peiyun/0000-0001-5675-6454 Liu,
Yang/0000-0002-7377-1418 Wang,
Yicheng/0000-0003-1079-5591",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Wang:2019:DDD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
xxpages = "35--38",
}
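
A toy model of the disturbance bin counter (DBC) idea in Wang:2019:DDD: a
fixed-size table of counters, each maintaining an upper bound on the
activations that could have disturbed any row in its bin, and triggering a
mitigation when a threshold is crossed. The bin size, table size, threshold,
and reset policy below are illustrative assumptions, not the paper's
circuit-level design.

ROWS_PER_BIN = 64
TABLE_ENTRIES = 1024
THRESHOLD = 50_000        # activations tolerated before neighbors are refreshed

class DisturbanceBinCounters:
    def __init__(self):
        self.table = [0] * TABLE_ENTRIES

    def _index(self, row):
        return (row // ROWS_PER_BIN) % TABLE_ENTRIES

    def on_activate(self, row):
        """Called on every row activation; True means the bin's neighbors
        should be refreshed (row-hammer mitigation)."""
        idx = self._index(row)
        self.table[idx] += 1                  # upper bound: never undercounts
        if self.table[idx] >= THRESHOLD:
            self.table[idx] = 0               # bound is reset after mitigation
            return True
        return False

    def on_refresh_window(self):
        """A periodic refresh window clears all bounds."""
        self.table = [0] * TABLE_ENTRIES

if __name__ == "__main__":
    dbc = DisturbanceBinCounters()
    hammered_row = 12345
    triggered = sum(dbc.on_activate(hammered_row) for _ in range(120_000))
    print(f"mitigations triggered for a hammered row: {triggered}")
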
@Article{Xie:2019:NXB,
author = "Xinfeng Xie and Xing Hu and Peng Gu and Shuangchen Li
and Yu Ji and Yuan Xie",
title = "{NNBench-X}: Benchmarking and Understanding Neural
Network Workloads for Accelerator Designs",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "38--42",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2898196",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The tremendous impact of deep learning algorithms over
a wide range of application domains has encouraged a
surge of neural network (NN) accelerator research. An
evolving benchmark suite and its associated benchmark
method are needed to incorporate emerging NN models and
characterize NN workloads. In this paper, we propose a
novel approach to understand the performance
characteristic of NN workloads for accelerator designs.
Our approach takes as input an application candidate
pool and conducts an operator-level analysis and
application-level analysis to understand the
performance characteristics of both basic tensor
primitives and whole applications. We conduct a case
study on the TensorFlow model zoo by using this
proposed characterization method. We find that tensor
operators with the same functionality can have very
different performance characteristics under different
input sizes, while operators with different
functionality can have similar characteristics.
Additionally, we observe that without operator-level
analysis, the application bottleneck is
mischaracterized for 15 out of 57 models from the
TensorFlow model zoo. Overall, our characterization
method helps users select representative applications
out of the large pool of possible applications, while
providing insightful guidelines for the design of NN
accelerators.",
acknowledgement = ack-nhfb,
affiliation = "Xie, XF (Reprint Author), Univ Calif Santa Barbara,
Santa Barbara, CA 93106 USA. Xie, Xinfeng; Hu, Xing;
Gu, Peng; Li, Shuangchen; Ji, Yu; Xie, Yuan, Univ Calif
Santa Barbara, Santa Barbara, CA 93106 USA.",
author-email = "xinfeng@ucsb.edu xinghu@ucsb.edu
peng\_gu@umail.ucsb.edu shuangchenli@ece.ucsb.edu
maple.jiyu@hotmail.com yuanxie@ucsb.edu",
da = "2019-06-20",
doc-delivery-number = "HQ4FG",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "US National Science Foundation
[1500848/172544/1730309]; CRISP--DARPA",
funding-text = "This work was supported in part by US National Science
Foundation 1500848/172544/1730309 and by CRISP, one of
six centers in JUMP, a Semiconductor Research
Corporation program sponsored by DARPA.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "accelerator designs; application-level analysis;
Artificial neural networks; benchmark; benchmark
method; benchmark testing; Benchmark testing;
characterization method; deep learning algorithms;
Feature extraction; Hardware; learning (artificial
intelligence); Measurement; neural nets; Neural
network; neural network accelerator research; neural
network workloads; NN accelerators; NN workloads;
NNBench-X; operator-level analysis; Parallel
processing; performance characteristic; tensor
operators; TensorFlow model zoo; workload
characterization",
number-of-cited-references = "22",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Xie:2019:NXB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Khan:2019:RCA,
author = "Asif Ali Khan and Fazal Hameed and Robin Bl{\"a}sing
and Stuart Parkin and Jeronimo Castrillon",
title = "{RTSim}: a Cycle-Accurate Simulator for Racetrack
Memories",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "43--46",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2899306",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Racetrack memories (RTMs) have drawn considerable
attention from computer architects of late. Owing to
the ultra-high capacity and access latency comparable
to SRAM, RTMs are promising candidates to revolutionize
the memory subsystem. In order to evaluate their
performance and suitability at various levels in the
memory hierarchy, it is crucial to have RTM-specific
simulation tools that accurately model their behavior
and enable exhaustive design space exploration. To this
end, we propose RTSim, an open source cycle-accurate
memory simulator that enables performance evaluation of
the domain-wall-based racetrack memories. The
skyrmions-based RTMs can also be modeled with RTSim
because they are architecturally similar to
domain-wall-based RTMs. RTSim is developed in
collaboration with physicists and computer scientists.
It accurately models RTM-specific shift operations,
access port management, and the sequencing of memory
commands, besides handling the routine read/write
operations. RTSim is built on top of NVMain2.0,
offering a larger design space for exploration.",
acknowledgement = ack-nhfb,
affiliation = "Khan, AA (Reprint Author), Tech Univ Dresden, Chair
Compiler Construct, D-01069 Dresden, Germany. Khan,
Asif Ali; Hameed, Fazal; Castrillon, Jeronimo, Tech
Univ Dresden, Chair Compiler Construct, D-01069
Dresden, Germany. Blaesing, Robin; Parkin, Stuart, Max
Planck Inst Microstruct Phys Halle, D-06120 Halle,
Germany. Hameed, Fazal, Inst Space Technol, Islamabad
44000, Pakistan.",
author-email = "asif\_ali.khan@tu-dresden.de
fazal.hameed@tu-dresden.de blaesing@mpi-halle.mpg.de
stuart.parkin@mpi-halle.mpg.de
jeronimo.castrillon@tu-dresden.de",
da = "2019-06-20",
doc-delivery-number = "HQ4FG",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "German Research Council (DFG) through the
Cluster of Excellence `Center for Advancing Electronics
Dresden' (cfaed)",
funding-text = "This work was partially funded by the German Research
Council (DFG) through the Cluster of Excellence `Center
for Advancing Electronics Dresden' (cfaed).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache; comparable access latency; Computational
modeling; cycle-accurate simulator; design space
exploration; domain wall memory; domain-wall-based
racetrack memories; domain-wall-based RTM; emerging
memory technologies; Layout; main memory; memory
hierarchy; Memory management; Memory simulator; memory
subsystem; memory system; models RTM-specific shift
operations; Nonvolatile memory; NVM; open source
cycle-accurate memory simulator; racetrack memory;
Random access memory; random-access storage;
RTM-specific simulation tools; RTSim; scratchpad;
simulation; skyrmions-based RTM; Space exploration;
storage management; Tracking",
keywords-plus = "PERFORMANCE; MODEL; AREA",
number-of-cited-references = "19",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Khan:2019:RCA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
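
To make the RTM-specific shift behavior modeled by RTSim concrete, the sketch
below counts the shifts needed to align a requested domain with the nearest
access port on a single track and converts them into latency. Track length,
port count, and latency values are assumed numbers for illustration, not
RTSim's calibrated parameters or its actual interfaces.

DOMAINS_PER_TRACK = 64
ACCESS_PORTS = 4
SHIFT_LATENCY_NS = 1.0
ACCESS_LATENCY_NS = 2.0

class Track:
    def __init__(self):
        # Ports sit at fixed, evenly spaced positions; 'offset' records how far
        # the track has been shifted so far.
        self.ports = [p * (DOMAINS_PER_TRACK // ACCESS_PORTS)
                      for p in range(ACCESS_PORTS)]
        self.offset = 0

    def access(self, domain):
        """Return the latency (ns) to access 'domain', updating the track position."""
        pos = domain - self.offset                        # current position of the domain
        port = min(self.ports, key=lambda p: abs(pos - p))
        shifts = abs(pos - port)
        self.offset = domain - port                       # 'domain' now sits under 'port'
        return shifts * SHIFT_LATENCY_NS + ACCESS_LATENCY_NS

if __name__ == "__main__":
    t = Track()
    for d in (0, 3, 40, 41, 17):
        print(f"domain {d:2d}: {t.access(d):5.1f} ns")
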
@Article{Gan:2019:SSV,
author = "Yiming Gan and Yuxian Qiu and Jingwen Leng and Yuhao
Zhu",
title = "{SVSoC}: Speculative Vision Systems-on-a-Chip",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "47--50",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2903241",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Frame latency in continuous vision significantly
impacts the agility of intelligent machines that
interact with the environment via cameras. However,
today's continuous vision systems are limited in frame
latency by their fundamentally sequential execution
model. We propose a speculative execution model along
with two mechanisms that enable practical vision
speculation. We present SVSoC, a new mobile
systems-on-a-chip (SoC) architecture that augments
conventional mobile SoCs with speculation capability.
Under the same energy budget, SVSoC achieves 14.3 to
35.4 percent latency reduction in different
scenarios.",
acknowledgement = ack-nhfb,
affiliation = "Gan, YM (Reprint Author), Univ Rochester, Comp Sci,
601 Elmwood Ave, Rochester, NY 14627 USA. Gan, Yiming;
Zhu, Yuhao, Univ Rochester, Comp Sci, 601 Elmwood Ave,
Rochester, NY 14627 USA. Qiu, Yuxian, Shanghai Jiao
Tong Univ, Comp Sci, Shanghai 200240, Peoples R China.
Leng, Jingwen, Shanghai Jiao Tong Univ, Dept Comp Sci
\& Engn, Shanghai 200240, Peoples R China.",
author-email = "ygan10@ur.rochester.edu qiuyuxian@sjtu.edu.cn
leng-jw@sjtu.edu.cn yzhu@rochester.edu",
da = "2019-06-20",
doc-delivery-number = "HS8NK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computational modeling; computer vision; Continuous
vision; continuous vision systems; control engineering
computing; fundamental sequential execution model;
Imaging; intelligent machines; IP networks;
microprocessor chips; mobile systems-on-a-chip
architecture; practical vision speculation; Predictive
models; Runtime; Sensors; speculation; speculation
capability; speculative execution model; speculative
vision systems-on-a-chip; SVSoC; system-on-chip;
systems-on-a-chip; Task analysis",
number-of-cited-references = "11",
ORCID-numbers = "Gan, Yiming/0000-0002-2033-5057",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Gan:2019:SSV",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Lin:2019:DSE,
author = "Ting-Ru Lin and Yunfan Li and Massoud Pedram and
Lizhong Chen",
title = "Design Space Exploration of Memory Controller
Placement in Throughput Processors with Deep Learning",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "51--54",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2905587",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "As throughput-oriented processors incur a significant
number of data accesses, the placement of memory
controllers (MCs) has a critical impact on overall
performance. However, due to the lack of a systematic
way to explore the huge design space of MC placements,
only a few ad-hoc placements have been proposed,
leaving much of the opportunity unexploited. In this
paper, we present a novel deep-learning based framework
that explores this opportunity intelligently and
automatically. The proposed framework employs a genetic
algorithm to efficiently guide exploration through the
large design space while utilizing deep learning
methods to provide fast performance prediction of
design points instead of relying on slow full system
simulations. Evaluation shows that the proposed deep
learning models achieve a speedup of 282X for the
search process, and the MC placement found by our
framework improves the average performance (IPC) of 18
benchmarks by 19.3 percent over the best-known
placement found by human intuition.",
acknowledgement = ack-nhfb,
affiliation = "Lin, TR (Reprint Author), Univ Southern Calif, Los
Angeles, CA 90007 USA. Lin, Ting-Ru; Pedram, Massoud,
Univ Southern Calif, Los Angeles, CA 90007 USA. Li,
Yunfan; Chen, Lizhong, Oregon State Univ, Corvallis, OR
97331 USA.",
author-email = "tingruli@usc.edu liyunf@oregonstate.edu pedram@usc.edu
chenliz@oregonstate.edu",
da = "2019-06-20",
doc-delivery-number = "HS8NK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [1566637,
1619456, 1619472, 1750047]; National Science Foundation
Software and Hardware Foundations",
funding-text = "We appreciate Shao-Hua Sun's assistance in DNN
development. This research is supported, in part, by
the National Science Foundation grants \#1566637,
\#1619456, \#1619472 and \#1750047, and Software and
Hardware Foundations.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ad-hoc placements; Benchmark testing; computer
architecture; Computer architecture; critical impact;
data accesses; Deep learning; deep learning; deep
learning methods;
deep-learning based framework; design points; design
space; design space exploration; fast performance
prediction; genetic algorithm; genetic algorithms;
Interconnection networks; Kernel; MC placement; memory
architecture; memory controller placement; memory
controllers; neural nets; Program processors; search
problems; search process; Space exploration;
Throughput; throughput processors; throughput-oriented
processors",
keywords-plus = "GAME; GO",
number-of-cited-references = "10",
ORCID-numbers = "Lin, Ting-Ru/0000-0002-7272-4070 Chen,
Lizhong/0000-0001-5890-7121",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Lin:2019:DSE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
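
The search loop in Lin:2019:DSE couples a genetic algorithm with a fast
learned performance predictor. The sketch below follows that structure but
substitutes a synthetic surrogate (average hop distance to the nearest memory
controller on a mesh) for the paper's trained deep learning model; the
placement encoding and GA parameters are assumptions for illustration.

import itertools
import random

MESH = 8          # 8x8 mesh of on-chip nodes
NUM_MCS = 8       # memory controllers to place

def surrogate_ipc(placement):
    """Fast fitness proxy: prefer placements with low average hops to the nearest MC."""
    nodes = list(itertools.product(range(MESH), range(MESH)))
    avg_hops = sum(min(abs(x - mx) + abs(y - my) for (mx, my) in placement)
                   for (x, y) in nodes) / len(nodes)
    return 1.0 / (1.0 + avg_hops)

def random_placement():
    return tuple(random.sample(list(itertools.product(range(MESH), range(MESH))), NUM_MCS))

def mutate(placement):
    p = list(placement)
    p[random.randrange(NUM_MCS)] = (random.randrange(MESH), random.randrange(MESH))
    return tuple(p)

def crossover(a, b):
    cut = random.randrange(1, NUM_MCS)
    return a[:cut] + b[cut:]

def search(generations=50, pop_size=40):
    pop = [random_placement() for _ in range(pop_size)]
    for _ in range(generations):
        pop.sort(key=surrogate_ipc, reverse=True)
        parents = pop[:pop_size // 4]                     # elitist selection
        children = [mutate(crossover(random.choice(parents), random.choice(parents)))
                    for _ in range(pop_size - len(parents))]
        pop = parents + children
    return max(pop, key=surrogate_ipc)

if __name__ == "__main__":
    best = search()
    print("best MC placement:", sorted(best), "score:", round(surrogate_ipc(best), 3))
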
@Article{Arafa:2019:PGS,
author = "Yehia Arafa and Abdel-Hameed A. Badawy and Gopinath
Chennupati and Nandakishore Santhi and Stephan
Eidenbenz",
title = "{PPT--GPU}: Scalable {GPU} Performance Modeling",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "55--58",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2904497",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Performance modeling is a challenging problem due to
the complexities of hardware architectures. In this
paper, we present PPT-GPU, a scalable and accurate
simulation framework that enables GPU code developers
and architects to predict the performance of
applications in a fast, and accurate manner on
different GPU architectures. PPT-GPU is part of the
open source project, Performance Prediction Toolkit
(PPT) developed at the Los Alamos National Laboratory.
We extend the older GPU model in PPT, which predicts the
runtimes of computational physics codes, to offer better
prediction accuracy; to this end, we add models for the
different memory hierarchies found in GPUs and latencies
for different instructions. To further show the utility
of PPT-GPU, we compare our model against real GPU
device(s) and the widely used cycle-accurate simulator,
GPGPU-Sim, using different workloads from
RODINIA and Parboil benchmarks. The results indicate
that the predicted performance of PPT-GPU is within a
10 percent error compared to the real device(s). In
addition, PPT-GPU is highly scalable, where it is up to
450x faster than GPGPU-Sim with more accurate
results.",
acknowledgement = ack-nhfb,
affiliation = "Arafa, Y (Reprint Author), New Mexico State Univ,
Klipsch Sch ECE, Las Cruces, NM 88003 USA. Arafa,
Yehia; Badawy, Abdel-Hameed A., New Mexico State Univ,
Klipsch Sch ECE, Las Cruces, NM 88003 USA. Badawy,
Abdel-Hameed A.; Chennupati, Gopinath; Santhi,
Nandakishore; Eidenbenz, Stephan, Los Alamos Natl Lab,
SM 30, Los Alamos, NM 87545 USA.",
author-email = "yarafa@nmsu.edu badawy@nmsu.edu gchennupati@lanl.gov
nsanthi@lanl.gov eidenben@lanl.gov",
da = "2019-06-20",
doc-delivery-number = "HU4EG",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "U.S. Department of Energy (DOE) National
Nuclear Security Administration (NNSA)
[DE-AC52-06NA25396]",
funding-text = "The authors would like to thank the anonymous
reviewers for their feedback which improved the quality
of the paper. We would also like to thank the members
of the PEARL laboratory at NMSU. Parts of this research
used resources provided at the Los Alamos National
Laboratory Institutional Computing Program, which is
supported through the U.S. Department of Energy (DOE)
National Nuclear Security Administration (NNSA) under
Contract No. DE-AC52-06NA25396. Computations were run
on Darwin, a research computing heterogeneous cluster.
Any opinions, findings, and/or conclusions expressed in
this paper do not necessarily represent the views of
the DOE or the U.S. Government.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architects; C language; Computational modeling;
Computer architecture; GPGPU; GPGPU-Sim; GPU
architectures; GPU device; GPU modeling; graphics
processing units; Graphics processing units; Kernel;
Los Alamos national laboratory; old GPU model; open
source project; parallel architectures; Parboil
benchmarks; performance evaluation; performance
prediction; performance prediction toolkit; power aware
computing; PPT; PPT-GPU; Predictive models; RODINIA;
Runtime; scalable GPU Performance modeling;
software/hardware co-design; Task analysis",
keywords-plus = "ROOFLINE",
number-of-cited-references = "22",
ORCID-numbers = "Badawy, Abdel-Hameed/0000-0001-8027-1449",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Arafa:2019:PGS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Denby:2019:OEC,
author = "Bradley Denby and Brandon Lucia",
title = "Orbital Edge Computing: Machine Inference in Space",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "59--62",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2907539",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Edge computing is an emerging paradigm aiding
responsiveness, reliability, and scalability of
terrestrial computing and sensing networks like
cellular and IoT. However, edge computing is largely
unexplored in high-datarate nanosatellite
constellations. Cubesats are small, energy-limited
sensors separated from the cloud by hundreds of
kilometers of atmosphere and space. As they
proliferate, centralized architectures impede advanced
applications. In this work, we define and characterize
Orbital Edge Computing. We describe power and software
optimizations for the orbital edge, and we use
formation flying to parallelize computation in space.",
acknowledgement = ack-nhfb,
affiliation = "Denby, B (Reprint Author), Carnegie Mellon Univ,
Pittsburgh, PA 15213 USA. Denby, Bradley; Lucia,
Brandon, Carnegie Mellon Univ, Pittsburgh, PA 15213
USA.",
author-email = "bdenby@andrew.cmu.edu blucia@andrew.cmu.edu",
da = "2019-06-20",
doc-delivery-number = "HU4EG",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Kavcic-Moura Endowment Fund; US National
Science Foundation CAREER Award [1751029]",
funding-text = "We thank the reviewers for the helpful feedback. This
work was generously funded by the Kavcic-Moura
Endowment Fund and US National Science Foundation
CAREER Award \#1751029.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "artificial satellites; Cameras; computer vision;
Cubesat; CubeSat; Downlink; edge computing;
high-datarate nanosatellite constellations; Internet of
Things; machine inference; orbital edge computing;
Orbits; paradigm aiding responsiveness; Pipeline
processing; remote sensing; satellite communication;
Sensors; telecommunication computing; telecommunication
network reliability; terrestrial computing; wireless
sensor networks",
number-of-cited-references = "39",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Denby:2019:OEC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Liu:2019:UFT,
author = "He Liu and Jianhui Han and Youhui Zhang",
title = "A Unified Framework for Training, Mapping and
Simulation of {ReRAM}-Based Convolutional Neural
Network Acceleration",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "63--66",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2908374",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "ReRAM-based neural network accelerators (RNAs) could
remarkably outshine their digital counterparts in terms
of computational efficiency and performance. However,
open software tools for broad architectural exploration
and end-to-end evaluation are still missing. We present
a simulation framework of RNA
for CNN inference that encompasses a ReRAM-aware NN
training tool, a CNN-oriented mapper and a
micro-architecture simulator. Main characteristics of
ReRAM and circuits are reflected by the configurable
simulator, as well as by the customized training
algorithm. The function of the simulator's core
components is verified by the corresponding circuit
simulation of a real chip design. This framework
enables comprehensive architectural exploration and
end-to-end evaluation, and its preliminary version is
available at https://github.com/CRAFT-THU/XB-Sim.",
acknowledgement = ack-nhfb,
affiliation = "Zhang, YH (Reprint Author), Tsinghua Univ, Dept Comp
Sci \& Technol, Beijing 100084, Peoples R China. Liu,
He; Zhang, Youhui, Tsinghua Univ, Dept Comp Sci \&
Technol, Beijing 100084, Peoples R China. Han, Jianhui,
Tsinghua Univ, Inst Microelect, Beijing 100084, Peoples
R China.",
author-email = "liuhe94@hotmail.com hanjh16@mails.tsinghua.edu.cn
zyh02@tsinghua.edu.cn",
da = "2019-06-20",
doc-delivery-number = "HU4EG",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Beijing Innovation Center for Future Chip;
Science and Technology Innovation Special Zone project,
China; HUAWEI project",
funding-text = "Thanks for the support from Beijing Innovation Center
for Future Chip, the support of the Science and
Technology Innovation Special Zone project, China, and
the support of HUAWEI project.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "accelerator; Artificial neural networks; CNN
inference; CNN-oriented mapper; computational
efficiency; Computational modeling; Computer
architecture; configurable simulator; convolutional
neural nets; customized training algorithm; Deep neural
network; digital counterparts; end-to-end evaluation;
Hardware; learning (artificial intelligence);
microarchitecture simulator; Microprocessors; open
software tool; processing-in-memory; ReRAM; ReRAM-aware
NN training tool; ReRAM-based convolutional neural
network acceleration; ReRAM-based neural network
accelerators; RNA; simulation; Training",
number-of-cited-references = "22",
ORCID-numbers = "Liu, He/0000-0002-9117-5265 Han,
Jianhui/0000-0002-8705-134X",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Liu:2019:UFT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Tan:2019:DWO,
author = "Tian Tan and Eriko Nurvitadhi and Derek Chiou",
title = "Dark Wires and the Opportunities for Reconfigurable
Logic",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "67--70",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2909867",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Power has become a fundamental limit to silicon
performance. Most research has focused on reducing
transistor switching to constrain power (dark silicon).
Specialized accelerators have been proposed since they
implement functionality with fewer transistor switches
than general purpose cores. Increasing efficiency
requirements lead to more specialization and,
therefore, more accelerators, which potentially leads
to longer distances to reach all the accelerators.
Communication, however, consumes energy, and therefore
needs to be minimized as well (dark wires). This paper
examines the balance between compute and communication
specialization in the context of hard logic (e.g.,
ASIC) that is highly efficient but static versus soft
logic (e.g., FPGA) that is less efficient but allows
computation to be moved to reduce communication
distances. Our experimental results show using soft
accelerators consumes 0.6$ \times $-2.1$ \times $ total
power compared to using hard accelerators when
communication costs are taken into account.",
acknowledgement = ack-nhfb,
affiliation = "Tan, T (Reprint Author), Univ Texas Austin, Elect \&
Comp Engn, Austin, TX 78712 USA. Tan, Tian, Univ Texas
Austin, Elect \& Comp Engn, Austin, TX 78712 USA.
Nurvitadhi, Eriko, Intel Corp, Santa Clara, CA 95054
USA. Chiou, Derek, Univ Texas Austin, Austin, TX 78712
USA. Chiou, Derek, Microsoft, Austin, TX 78712 USA.",
author-email = "tan.tian@utexas.edu eriko.nurvitadhi@intel.com
derek@utexas.edu",
da = "2019-06-20",
doc-delivery-number = "HW7ZH",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Intel Corporation, Hillsboro, OR",
funding-text = "Funding for this work was provided by Intel
Corporation, Hillsboro, OR. The authors would like to
thank the colleagues in the Accelerator Architecture
Lab at Intel Corporation, Hillsboro, OR and FAST
research group at the University of Texas at Austin,
Austin, TX for the discussion and feedback.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "application specific integrated circuits; ASIC;
Benchmark testing; communication costs; communication
distances; communication specialization; dark silicon;
dark wires; efficiency requirements; elemental
semiconductors; energy efficient architecture; Field
programmable gate arrays; field programmable gate
arrays; FPGA; fundamental limit; general purpose cores;
geographical locality; hard logic; hardware
accelerator; Layout; low-power electronics;
reconfigurable logic; Silicon; silicon performance;
soft accelerators; Specialized accelerators; static
versus soft logic; Throughput; transistor circuits;
transistor switches; transistor switching; Transistors;
wires; Wires",
number-of-cited-references = "14",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Tan:2019:DWO",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Naithani:2019:PRE,
author = "Ajeya Naithani and Josue Feliu and Almutaz Adileh and
Lieven Eeckhout",
title = "Precise Runahead Execution",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "71--74",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2910518",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Runahead execution improves processor performance by
accurately prefetching long-latency memory accesses.
When a long-latency load causes the instruction window
to fill up and halt the pipeline, the processor enters
runahead mode and keeps speculatively executing code to
trigger accurate prefetches. A recent improvement
tracks the chain of instructions that leads to the
long-latency load, stores it in a runahead buffer, and
executes only this chain during runahead execution,
with the purpose of generating more prefetch requests
during runahead execution. Unfortunately, all these
prior runahead proposals have shortcomings that limit
performance and energy efficiency because they discard
the full instruction window to enter runahead mode and
then flush the pipeline to restart normal operation.
This significantly constrains the performance benefits
and increases the energy overhead of runahead
execution. In addition, the runahead buffer limits
prefetch coverage by tracking only a single chain of
instructions that leads to the same long-latency load.
We propose precise runahead execution (PRE) to mitigate
the shortcomings of prior work. PRE leverages the
renaming unit to track all the dependency chains
leading to long-latency loads. PRE uses a novel
approach to manage free processor resources to execute
the detected instruction chains in runahead mode
without flushing the pipeline. Our results show that
PRE achieves an additional 21.1 percent performance
improvement over the recent runahead proposals while
reducing energy consumption by 6.1 percent.",
acknowledgement = ack-nhfb,
affiliation = "Naithani, A (Reprint Author), Univ Ghent, B-9000
Ghent, Belgium. Naithani, Ajeya; Adileh, Almutaz;
Eeckhout, Lieven, Univ Ghent, B-9000 Ghent, Belgium.
Feliu, Josue, Univ Politecn Valencia, Valencia 46010,
Spain.",
author-email = "ajeya.naithani@ugent.be jofepre@gap.upv.es
almutaz.adileh@ugent.be lieven.eeckhout@ugent.be",
da = "2019-06-20",
doc-delivery-number = "HW9SJ",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "FWO [G.0434.16N, G.0144.17N]; European
Research Council (ERC) [741097]",
funding-text = "This research is supported through FWO grants no.
G.0434.16N and G.0144.17N, and European Research
Council (ERC) Advanced Grant agreement no. 741097.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Buffer storage; dependency chains; energy efficiency;
instruction window; long-latency load; long-latency
memory accesses; Microarchitecture; Microsoft Windows;
Out of order; pipeline processing; Pipelines; power
aware computing; precise runahead execution; prefetch
requests; Prefetching; Proposals; Registers; runahead
buffer limits; runahead execution; single-core
performance; storage management",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Naithani:2019:PRE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Agrawal:2019:MPS,
author = "V. Agrawal and M. A. Dinani and Y. Shui and M. Ferdman
and N. Honarmand",
title = "Massively Parallel Server Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "75--78",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2911287",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Modern data centers enjoy massive degrees of
request-level parallelism with significant
cross-request similarity. Although similar requests
follow similar instruction sequences, conventional
processors service them individually and do not take
full advantage of cross-request similarity.
Single-Instruction Multiple-Thread (SIMT) architectures
can leverage this similarity; however, existing SIMT
processors, chief among them GPUs, are ill-suited for
server applications, as they are specifically designed
to maximize throughput at the expense of latency,
preventing them from meeting server QoS requirements.
We advocate a new approach to SIMT server processors,
namely Massively Parallel Server Processors (MPSPs),
which we outline in this paper. To begin to understand
their architectural needs, we measure the degree of
control-flow and memory-access divergence encountered
when running unmodified server applications on
MPSP-style processors. Our preliminary results indicate
that a software scheduler that bundles together similar
requests can minimize control-flow divergence, making
SIMT execution of unmodified server code feasible.
Moreover, we find that memory-access divergence,
although significant in raw numbers, can be tackled
with changes in stack and heap layouts. Overall, our
results encourage further consideration of MPSPs as a
promising architecture for server processors.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; data centers; Instruction sets;
Message systems; Parallel processing; Quality of
service; servers; Servers; Single Instruction Multiple
Thread",
}
@Article{Golestani:2019:PMB,
author = "H. Golestani and G. Gupta and R. Sen",
title = "Performance Modeling and Bottleneck Analysis of {EDGE}
Processors Using Dependence Graphs",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "79--82",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2911514",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Exploring new directions in ISA and microarchitecture
design can be challenging due to the large search
space. Efficient tools and methods are needed to
quickly identify rewarding design choices. In this
work, we develop a graph-based framework that
effectively models complex architectures and enables
efficient analysis of their performance and
bottlenecks. We use this framework to investigate
proposals for EDGE (Explicit Data Graph Execution) ISA,
a new class of ISA in which programs are composed from
atomic blocks, each of which explicitly exposes
dataflow to hardware. We study the impact of two
important EDGE-specific design choices: block formats
and operand-movement instructions. We demonstrate how
this analysis leads to insights in EDGE
architectures.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Analytical models; Bottleneck analysis; Data models;
EDGE; EDGE (Explicit Data Graph Execution); Hardware;
Hazards; ISA; Load modeling; Microarchitecture;
microarchitecture; performance modeling; Program
processors",
}
@Article{Leng:2019:ARA,
author = "J. Leng and A. Buyuktosunoglu and R. Bertran and P.
Bose and V. J. Reddi",
title = "Asymmetric Resilience for Accelerator-Rich Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "1",
pages = "83--86",
month = jan # "\slash " # jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2019.2917898",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Accelerators are becoming popular owing to their
exceptional performance and power-efficiency. However,
researchers have yet to pay close attention to their
reliability, a key challenge as technology scaling
makes building reliable systems challenging. A
straightforward solution to make accelerators reliable
is to design the accelerator from the ground-up to be
reliable by itself. However, such a myopic view of the
system, where each accelerator is designed in
isolation, is unsustainable as the number of integrated
accelerators continues to rise in SoCs. To address this
challenge, we propose a paradigm called asymmetric
resilience that avoids accelerator-specific reliability
design. Instead, its core principle is to develop the
reliable heterogeneous system around the CPU
architecture. We explain the implications of
architecting such a system and the modifications needed
in a heterogeneous system to adopt such an approach. As
an example, we demonstrate how to use asymmetric
resilience to handle GPU execution errors using the CPU
with minimal overhead. The general principles can be
extended to include other accelerators.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "accelerator architecture; error recovery; Graphics
processing units; heterogeneous system; Kernel; Memory
management; Reliability; Resilience; Runtime; soft
errors; Task analysis; voltage noise",
}
@Article{Sadredini:2019:SEM,
author = "E. Sadredini and R. Rahimi and V. Verma and M. Stan
and K. Skadron",
title = "A Scalable and Efficient In-Memory Interconnect
Architecture for Automata Processing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "87--90",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2909870",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Oct 1 10:18:16 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Accelerating finite automata processing benefits
regular-expression workloads and a wide range of other
applications that do not map obviously to regular
expressions, including pattern mining, bioinformatics,
and machine learning. Existing in-memory automata
processing accelerators suffer from inefficient routing
architectures. They are either incapable of efficiently
placing and routing a highly connected automaton or require
an excessive amount of hardware resources. In this
paper, we propose a compact, low-overhead, and yet
flexible in-memory interconnect architecture that
efficiently implements routing for next-state
activation, and can be applied to the existing
in-memory automata processing architectures. We use
SRAM 8T subarrays to evaluate our interconnect.
Compared to the Cache Automaton routing design, our
interconnect reduces the number of switches by $ 7 \times
$ and, therefore, reduces the area overhead of the
interconnect. It also has a faster row cycle time because
of shorter wires and consumes less power.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Automata; automata processing; bioinfomatics; cache
automaton routing design; connected automaton; finite
automata; finite automata processing; Hardware;
hardware resources; in-memory automata; in-memory
automata processing accelerators; in-memory
interconnect architecture; Indexes; inefficient routing
architectures; integrated circuit interconnections;
Interconnect; machine learning; memory architecture;
Memory management; next-state activation; pattern
mining; processing in memory; Random access memory;
regular expression workloads; Routing; SRAM 8T
subarrays; SRAM chips",
}
@Article{Yasin:2019:TPM,
author = "A. Yasin and A. Mendelson and Y. Ben-Asher",
title = "Tuning Performance via Metrics with Expectations",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "91--94",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2916408",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Oct 1 10:18:16 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Modern server systems employ many features that are
difficult to exploit by software developers. This paper
calls for a new performance optimization approach that
uses designated metrics with expected optimal values. A
key insight is that expected values of these metrics
are essential in order to verify that no performance is
wasted during incremental utilization of processor
features. We define sample primary metrics for modern
architectures and present three distinct techniques
that help to determine their optimal values. Our
preliminary results successfully provide $ 2 \times $--$ 4 \times $ extra
speedup during tuning of commonly-used software
optimizations on the matrix-multiply kernel.
Additionally, our approach helped to identify
counter-intuitive causes that hurt multicore
scalability of an optimized deep-learning benchmark on
a Cascade Lake server.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Cascade Lake server; Code tuning; counter-intuitive
cause identification; expectations; expected optimal
values; incremental utilization; Kernel; learning
(artificial intelligence); matrix multiplication;
matrix-multiply kernel; Measurement; measurements;
micro-architecture; microprocessor chips; modern server
systems; multi-core/single-chip multiprocessors;
Multicore processing; multiprocessing systems;
Optimization; optimization; optimized deep-learning
benchmark; performance analysis; performance
evaluation; performance optimization approach;
processor features; sample primary metrics; Servers;
SIMD processors; software metrics; software
optimizations; Tuning; tuning performance",
}
@Article{Wang:2019:MEM,
author = "L. Wang and M. Jahre and A. Adileh and Z. Wang and L.
Eeckhout",
title = "Modeling Emerging Memory-Divergent {GPU}
Applications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "95--98",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2923618",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Oct 1 10:18:16 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/pvm.bib",
abstract = "Analytical performance models yield valuable
architectural insight without incurring the excessive
runtime overheads of simulation. In this work, we study
contemporary GPU applications and find that the key
performance-related behavior of such applications is
distinct from traditional GPU applications. The key
issue is that these GPU applications are
memory-intensive and have poor spatial locality, which
implies that the loads of different threads commonly
access different cache blocks. Such memory-divergent
applications quickly exhaust the number of misses the
L1 cache can process concurrently, and thereby cripple
the GPU's ability to use Memory-Level Parallelism (MLP)
and Thread-Level Parallelism (TLP) to hide memory
latencies. Our Memory Divergence Model (MDM) is able to
accurately represent this behavior and thereby reduces
average performance prediction error by $ 14 \times $
compared to the state-of-the-art GPUMech approach
across our memory-divergent applications.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Analytical models; analytical performance models;
Analytical performance prediction; average performance
prediction error; cache blocks; cache storage;
Computational modeling; contemporary GPU applications;
GPU; graphics processing units; Graphics processing
units; Instruction sets; key performance-related
behavior; L1 cache; Mathematical model; memory
architecture; memory divergence model; memory
latencies; memory-divergent applications;
memory-divergent GPU applications; memory-intensive;
memory-level parallelism; multi-threading;
multiprocessing systems; Predictive models; Random
access memory; thread-level parallelism; traditional
GPU applications; valuable architectural insight",
}
@Article{Shomron:2019:SSS,
author = "G. Shomron and T. Horowitz and U. Weiser",
title = "{SMT-SA}: Simultaneous Multithreading in Systolic
Arrays",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "99--102",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2924007",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Oct 1 10:18:16 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Systolic arrays (SAs) are highly parallel pipelined
structures capable of executing various tasks such as
matrix multiplication and convolution. They comprise a
grid of usually homogeneous processing units (PUs) that
are responsible for the multiply-accumulate (MAC)
operations in the case of matrix multiplication. It is
not rare for a PU input to be zero-valued, in which
case the PU becomes idle and the array becomes
underutilized. In this paper we consider a solution to
employ the underutilized PUs via simultaneous
multithreading (SMT). We explore the design space of an
SMT-SA variant and evaluate its performance, area
efficiency, and energy consumption. In addition, we
suggest a tiling method to reduce area overheads. Our
evaluation shows that a 4-thread FP16-based SMT-SA
achieves speedups of up to $ 3.6 \times $ as compared
to conventional SA, with $ 1.7 \times $ area overhead
and negligible energy overhead.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "4-thread FP16-based SMT-SA; area efficiency;
Convolution; Correlation; Deep learning; Energy
consumption; energy consumption; homogeneous processing
units; Instruction sets; matrix multiplication;
multi-threading; multiply-accumulate operations;
Multithreading; multithreading; parallel pipelined
structures; PU input; simultaneous multithreading;
SMT-SA variant; Systolic arrays; systolic arrays; Task
analysis",
}
@Article{Masouros:2019:RRS,
author = "D. Masouros and S. Xydis and D. Soudris",
title = "{Rusty}: Runtime System Predictability Leveraging
{LSTM} Neural Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "103--106",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2924622",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Oct 1 10:18:16 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Modern cloud scale data-centers are adopting workload
co-location as an effective mechanism for improving
resource utilization. However, workload co-location is
stressing resource availability in an unconventional and
unpredictable manner. Efficient resource management
requires continuous and ideally predictive runtime
knowledge of system metrics, sensitive both to workload
demands, e.g., CPU, memory etc., as well as
interference effects induced by co-location. In this
paper, we present Rusty, a framework able to address
the aforementioned challenges by leveraging the power
of Long Short-Term Memory networks to forecast, at
runtime, performance metrics of applications executed
on systems under interference. We evaluate Rusty under
a diverse set of interference scenarios for a plethora
of cloud workloads, showing that Rusty achieves
extremely high prediction accuracy, up to 0.99 in terms
of $ R^2 $ value, satisfying at the same time the strict
latency constraints to be usable at runtime.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; cloud computing; cloud workloads;
computer centres; Correlation; datacenters; extremely
high prediction accuracy; interference; Interference;
interference effects; interference scenarios; long
short-term memory networks; LSTM neural networks;
Measurement; modern cloud scale data-centers;
Monitoring; recurrent neural nets; resource allocation;
resource availability; Resource management; resource
management; resource utilization; Run-time system
predictability; Runtime; runtime knowledge; runtime
system predictability leveraging LSTM neural networks;
Rusty; system metrics; unconventional manner; workload
co-location",
}
@Article{Kim:2019:THA,
author = "S. Kim and H. Jung and W. Shin and H. Lee and H. Lee",
title = "{HAD-TWL}: Hot Address Detection-Based Wear Leveling
for Phase-Change Memory Systems with Low Latency",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "107--110",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2929393",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Oct 1 10:18:16 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Phase-change memory (PCM) is an emerging non-volatile
memory device that offers faster access than flash
memory does. However, PCM suffers from a critical
problem where the number of write operations is
limited. The previous practical attack detector (PAD)
that uses a small memory space called a stack adopts an
algebraic mapping-based wear leveling (AWL) algorithm.
Thanks to successful detection of malicious attacks,
the PAD-AWL dramatically improves the lifetime of PCM.
To enhance system factors such as write latency, the
proposed method replaces the AWL algorithm with a
table-based wear leveling (TWL) algorithm. Since the
fixed stack size of the previous PAD is inefficient in
detection of attack-like hot addresses, a stack size
modulation scheme that enables a hot address detector
(HAD) to efficiently counteract various memory write
streams is proposed. Compared with the previous
AWL-based algorithm, the integration with the TWL
algorithm demands only 24 percent of the total number
of swaps per write, and the proposed HAD with the stack
size modulation scheme achieves a detection rate of
94 percent while reducing the execution time by 57
percent.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "algebraic mapping-based wear leveling algorithm;
attack-like hot addresses; AWL-based algorithm;
detection rate; Detectors; embedded memory management
system; emerging nonvolatile memory device; endurance;
fixed stack size; flash memories; flash memory;
HAD-TWL; Hardware; hot address detection-based wear
leveling; hot address detector; malicious attacks;
Memory management; memory space; PAD-AWL; PCM; Phase
change materials; phase change memories; Phase-change
memory; phase-change memory systems; practical attack
detector; Pulse modulation; Random access memory; stack
size modulation scheme; system factors; table-based
wear leveling algorithm; TWL algorithm; wear; wear
leveling; write operations",
}
@Article{Zhou:2019:QCD,
author = "H. Zhou and G. T. Byrd",
title = "Quantum Circuits for Dynamic Runtime Assertions in
Quantum Computation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "111--114",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2935049",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Oct 1 10:18:16 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In this paper, we propose quantum circuits for runtime
assertions, which can be used for both software
debugging and error detection. Runtime assertion is
challenging in quantum computing for two key reasons.
First, a quantum bit (qubit) cannot be copied, which is
known as the non-cloning theorem. Second, when a qubit
is measured, its superposition state collapses into a
classical state, losing the inherent parallel
information. In this paper, we overcome these
challenges with runtime computation through ancilla
qubits, which are used to indirectly collect the
information of the qubits of interest. We design
quantum circuits to assert classical states,
entanglement, and superposition states.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ancilla qubits; assertions; classical state;
Debugging; debugging; dynamic runtime assertions; error
detection; inherent parallel information; Logic gates;
Measurement uncertainty; noncloning theorem; program
debugging; quantum bit; quantum circuits; quantum
circuits design; quantum computation; quantum
computing; Quantum computing; quantum entanglement;
Quantum entanglement; quantum error detection; Qubit;
qubit; Runtime; runtime assertion; runtime computation;
software debugging; superposition state",
}
@Article{Rao:2019:ATC,
author = "J. Rao and T. Ao and K. Dai and X. Zou",
title = "{ARCE}: Towards Code Pointer Integrity on Embedded
Processors Using Architecture-Assisted Run-Time
Metadata Management",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "115--118",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2935445",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Oct 1 10:18:16 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Code Pointer Integrity (CPI) is an efficient control
flow protection technique focusing on sensitive code
pointers with a formal proof of security, but it relies
on software lookup tables or Memory Management Unit
(MMU) based address translation and instruction-level
memory isolation which are impractical for
resource-constrained embedded processors. This paper
enables Architecture-assisted Run-time CPI on Embedded
Processors (ARCE) with 2-level metadata to balance
security, performance and resource overhead. The first
level 2-bit property metadata colors data into
different domains and the second level boundary
metadata holds structure constraints for indirect code
pointers only. With memory and instruction extensions,
metadata shares the address space with program data and
is propagated at runtime to maintain a precise set of
sensitive code pointers. It lazily validates the
content and boundary of sensitive pointers at
the dereference stage to eliminate false alarms. We
implemented ARCE based on a shallow 3-stage pipeline
processor Z-scale and validated its security
effectiveness with code pointer attack vectors in RIPE.
It introduces less than 1 percent performance overhead
for benchmarks in C with 7.33 percent logic and 6.25
percent memory overhead. ARCE eliminates address space
waste and dependency on advanced hardware which makes
CPI practical even for systems with bare metal
applications.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ARCE; architecture-assisted run-time CPI on embedded
processors; architecture-assisted run-time metadata
management; code pointer attack vectors; code pointer
integrity; Code pointer integrity; control flow
protection technique; data flow analysis; embedded
processors; embedded systems; first level 2-bit
property metadata colors data; Hardware; indirect code
pointers; instruction extensions; instruction set
extensions; instruction sets; instruction-level memory
isolation; Integrated circuits; level boundary
metadata; Memory management; memory management unit
based address translation; meta data; Metadata;
microprocessor chips; MMU; multi-level metadata;
pipeline processing; Program processors; Registers;
resource-constrained embedded processors; RIPE;
security; Security; security of data; sensitive code
pointers; shallow 3-stage pipeline processor Z-scale;
software lookup tables; storage management; table
lookup",
}
@Article{Bhardwaj:2019:DOC,
author = "K. Bhardwaj and M. Havasi and Y. Yao and D. M. Brooks
and J. M. H. Lobato and G. Wei",
title = "Determining Optimal Coherency Interface for
Many-Accelerator {SoCs} Using {Bayesian} Optimization",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "119--123",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2910521",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Oct 1 10:18:16 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The modern system-on-chip (SoC) of the current
exascale computing era is complex. These SoCs not only
consist of several general-purpose processing cores but
also integrate many specialized hardware accelerators.
Three common coherency interfaces are used to integrate
the accelerators with the memory hierarchy:
non-coherent, coherent with the last-level cache (LLC),
and fully-coherent. However, using a single coherence
interface for all the accelerators in an SoC can lead
to significant overheads: in the non-coherent model,
accelerators directly access the main memory, which can
have a considerable performance penalty; whereas in the
LLC-coherent model, the accelerators access the LLC but
may suffer from performance bottleneck due to
contention between several accelerators; and the
fully-coherent model, which relies on private caches,
can incur non-trivial power/area overheads. Given the
limitations of each of these interfaces, this paper
proposes a novel performance-aware hybrid coherency
interface, where different accelerators use different
coherency models, decided at design time based on the
target applications so as to optimize the overall
system performance. A new Bayesian optimization based
framework is also proposed to determine the optimal
hybrid coherency interface, i.e., use machine learning
to select the best coherency model for each of the
accelerators in the SoC in terms of performance. For
image processing and classification workloads, the
proposed framework determined that a hybrid interface
achieves up to 23 percent better performance compared
to the other homogeneous interfaces, where all the
accelerators use a single coherency model.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bayes methods; Bayesian optimization; Coherence;
coherence protocols; Computational modeling; Hardware;
hardware accelerators; Optimization; Program
processors; Protocols; System-on-chip (SoC)",
}
@Article{Ansari:2019:CLO,
author = "Ali Ansari and Pejman Lotfi-Kamran and Hamid
Sarbazi-Azad",
title = "Code Layout Optimization for Near-Ideal Instruction
Cache",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "124--127",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2924429",
ISSN = "1556-6064",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Instruction cache misses are a significant source of
performance degradation in server workloads because of
their large instruction footprints and complex control
flow. Due to the importance of reducing the number of
instruction cache misses, there has been a myriad of
proposals for hardware instruction prefetchers in the
past two decades. While effectual, state-of-the-art
hardware instruction prefetchers either impose
considerable storage overhead or require significant
changes in the frontend of a processor. Unlike hardware
instruction prefetchers, code-layout optimization
techniques profile a program and then reorder the code
layout of the program to increase spatial locality, and
hence, reduce the number of instruction cache misses.
While an active area of research in the 1990s,
code-layout optimization techniques have largely been
neglected in the past decade. We evaluate the
suitability of code-layout optimization techniques for
modern server workloads and show that if we combine
these techniques with a simple next-line prefetcher,
they can significantly reduce the number of instruction
cache misses. Moreover, we propose a new code-layout
optimization algorithm and show that along with a
next-line prefetcher, it offers the same performance
improvement as the state-of-the-art hardware
instruction prefetcher, but with almost no hardware
overhead.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "basic-block reordering; Cache storage; code-layout
optimization; Encoding; Instruction cache miss;
instruction prefetcher; Instruction sets; Optimization;
Prefetching",
}
@Article{Ranganath:2019:SCC,
author = "Kiran Ranganath and AmirAli Abdolrashidi and Shuaiwen
Leon Song and Daniel Wong",
title = "Speeding up Collective Communications Through
Inter-{GPU} Re-Routing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "128--131",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2933842",
ISSN = "1556-6064",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In order to address the vast needs of disparate
domains, computing engines are becoming more
sophisticated and complex. A typical high-performance
computational engine is composed of several accelerator
units, in most cases GPUs, plus one or more CPU
controllers. All these components are becoming
increasingly interconnected to satisfy bandwidth and
latency tolerance demands from modern workloads. Due to
these constraints, solutions to efficiently
interconnect them or to systematically manage their
traffic-such as PCIe v3, NVLink v1 and v2 on the
hardware side, and NVIDIA Collective Communication
Library (NCCL) and AMD ROCM layer on the software
side-are becoming more commonplace inside HPC systems
and cloud data centers. However, as the number of
accelerators increases, workloads (especially machine
learning) might not be able to fully exploit the
computational substrate due to inefficient use of
hardware interconnects. Such scenarios can lead to
performance bottlenecks where high-bandwidth links are
not used by the underlying libraries and
under-performing links are overused. This work proposes
Workload Optimization Through Inter-GPU Re-routing
(WOTIR), which consists of enhanced NCCL-based
collective primitives that aim to boost bandwidth
utilization (through more efficient routing) and reduce
communication overhead. WOTIR targets GPUs with no
direct NVLink communication path (which leads to PCIe
communications) and instead re-routes communication
through intermediate GPUs to bridge NVLink segments and
avoid PCIe communications. Such a method allows the
maximum possible utilization of the NVLink bandwidth
between the GPUs without routing through the PCIe bus.
Using this method, we see a reduction of up to 34
percent in execution time for selected machine learning
workloads when non-optimal GPU allocations arise.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Collective communication; GPU; Graphics
processing units; interconnect; Interference; Machine
learning; Routing; Servers; Training data",
}
@Article{Stow:2019:PPM,
author = "Dylan Stow and Amin Farmahini-Farahani and Sudhanva
Gurumurthi and Michael Ignatowski and Yuan Xie",
title = "Power Profiling of Modern Die-Stacked Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "132--135",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2941715",
ISSN = "1556-6064",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Die-stacked memories that integrate multiple DRAM dies
into the processor package have reduced the interface
bottleneck and improved efficiency, but demands for
memory capacity and bandwidth remain unfulfilled.
Additionally, the introduction of memory into the
package further complicates heat removal. Memory power
is therefore becoming a key architectural concern. To
provide insight into these challenges, an architectural
power model for High Bandwidth Memory is developed,
validated, and used to provide detailed power profiles.
Based on the resulting power trends, power is projected
for potential future memory configurations with
increased bandwidth and capacity. The results suggest
that, without significant improvements in memory
technology or architecture, the power utilization of
in-package memories will continue to grow and limit the
system power budget.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Integrated circuits; Memory management; Power
measurement; Power system measurement; Random access
memory; random access memory; Three-dimensional
displays; three-dimensional integrated circuits",
}
@Article{Nabavinejad:2019:CDP,
author = "Seyed Morteza Nabavinejad and Hassan Hafez-Kolahi and
Sherief Reda",
title = "Coordinated {DVFS} and Precision Control for Deep
Neural Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "136--140",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2942020",
ISSN = "1556-6064",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Traditionally, DVFS has been the main mechanism to
trade-off performance and power. We observe that Deep
Neural Network (DNN) applications offer the possibility
to trade-off performance, power, and accuracy using
both DVFS and numerical precision levels. Our proposed
approach, Power-Inference accuracy Trading (PIT),
monitors the server's load, and accordingly adjusts the
precision of the DNN model and the DVFS setting of GPU
to trade-off the accuracy and power consumption with
response time. At high loads and tight request
arrivals, PIT leverages INT8-precision instructions of
GPU to dynamically change the precision of deployed DNN
models and boosts GPU frequency to execute the requests
faster at the expense of accuracy reduction and high
power consumption. However, when the requests' arrival
rate is relaxed and there is slack time for requests,
PIT deploys high precision version of models to improve
the accuracy and reduces GPU frequency to decrease
power consumption. We implement and deploy PIT on a
state-of-the-art server equipped with a Tesla P40 GPU.
Experimental results demonstrate that depending on the
load, PIT can improve response time up to 11 percent
compared to a job scheduler that uses only FP32
precision. It also improves the energy consumption by
up to 28 percent, while achieving around 99.5 percent of
the accuracy of sole FP32 precision.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "accuracy; Deep neural network; Graphics processing
units; hardware accelerator; Neural networks; power;
Power demand; response time; Runtime; Servers; Time
factors; Time-frequency analysis",
}
@Article{Lee:2019:ELM,
author = "Seunghak Lee and Nam Sung Kim and Daehoon Kim",
title = "Exploiting {OS}-Level Memory Offlining for {DRAM}
Power Management",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "141--144",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2942914",
ISSN = "1556-6064",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Power and energy consumed by main memory systems in
data-center servers have increased as the DRAM capacity
and bandwidth increase. Particularly, background power
accounts for a considerable fraction of the total DRAM
power consumption; the fraction will increase further
in the near future, especially when slowing-down
technology scaling forces us to provide necessary DRAM
capacity through plugging in more DRAM modules or
stacking more DRAM chips in a DRAM package. Although
current DRAM architecture supports low power states at
rank granularity that turn off some components during
idle periods, techniques to exploit memory-level
parallelism make the rank-granularity power state
become ineffective. Furthermore, the long wake-up
latency is one of the obstacles to adopting aggressive
power management (PM) with deep power-down states.
Tackling these limitations, we propose OffDIMM, a
software-assisted DRAM PM scheme that collaborates with
OS-level memory onlining/offlining. OffDIMM maps a
memory block in the address space of the OS to a
subarray group or groups of DRAM, and sets a deep
power-down state for the subarray group when offlining
the block. Through the dynamic OS-level memory
onlining/offlining based on the current memory usage,
our experimental results show OffDIMM reduces
background power by 24 percent on average without
notable performance overheads.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "DRAM; Energy consumption; Hardware; Linux; Memory
management; memory offlining; power management; Power
system management; Random access memory",
}
@Article{Marinakis:2019:PFI,
author = "Theodoros Marinakis and Iraklis Anagnostopoulos",
title = "Performance and Fairness Improvement on {CMPs}
Considering Bandwidth and Cache Utilization",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "1--4",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2944810",
ISSN = "1556-6064",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Chip multiprocessors (CMPs) have become dominant both
in server and embedded domain as they accommodate an
increasing amount of cores in order to satisfy the
workload demands. However, when applications run
concurrently, they compete for shared resources, such
as Last Level Cache (LLC) and main memory bandwidth.
Applications are affected in various ways by
contention, and uneven degradation makes the system
unreliable and the overall performance unpredictable.
The goal of this work is to improve performance by
sophisticated grouping that balances bandwidth and LLC
requirements, while at the same time providing a fair
execution environment by prioritizing applications that
experience the least accumulated progress. The proposed
scheduler achieves an average performance gain of 16
percent over the Linux scheduler and 6.3 percent over
another performance-oriented scheduler. Additionally,
it keeps unfairness very close to two fairness-oriented
schedulers.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Chip multiprocessors; contention-aware
scheduling; Degradation; fairness; Interference; Job
shop scheduling; Linux; performance; Quality of
service; Resource management",
}
@Article{Balaji:2019:FEW,
author = "Adarsha Balaji and Shihao Song and Anup Das and Nikil
Dutt and Jeff Krichmar and Nagarajan Kandasamy and
Francky Catthoor",
title = "A Framework to Explore Workload-Specific Performance
and Lifetime Trade-offs in Neuromorphic Computing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "149--152",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2951507",
ISSN = "1556-6064",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Neuromorphic hardware with non-volatile memory (NVM)
can implement machine learning workload in an
energy-efficient manner. Unfortunately, certain NVMs
such as phase change memory (PCM) require high voltages
for correct operation. These voltages are supplied from
an on-chip charge pump. If the charge pump is activated
too frequently, its internal CMOS devices do not
recover from stress, accelerating their aging and
leading to negative bias temperature instability (NBTI)
generated defects. Forcefully discharging the stressed
charge pump can lower the aging rate of its CMOS
devices, but makes the neuromorphic hardware
unavailable to perform computations while its charge
pump is being discharged. This negatively impacts
performance such as latency and accuracy of the machine
learning workload being executed. In this letter, we
propose a novel framework to exploit workload-specific
performance and lifetime trade-offs in neuromorphic
computing. Our framework first extracts the precise
times at which a charge pump in the hardware is
activated to support neural computations within a
workload. This timing information is then used with a
characterized NBTI reliability model to estimate the
charge pump's aging during the workload execution. We
use our framework to evaluate workload-specific
performance and reliability impacts of using (1)
different SNN mapping strategies and (2) different
charge pump discharge strategies. We show that our
framework can be used by system designers to explore
performance and reliability trade-offs early in the
design of neuromorphic hardware such that appropriate
reliability-oriented design margins can be set.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Aging; and inter-spike interval (ISI); Charge pumps;
Negative bias temperature instability; negative bias
temperature instability (NBTI); Neuromorphic computing;
Neuromorphics; non-volatile memory (NVM); phase-change
memory (PCM); spiking neural networks (SNNs); Synapses;
Thermal variables control; wear-out",
}
@Article{Jeon:2019:LAG,
author = "Hyeran Jeon and Hodjat Asghari Esfeden and Nael B.
Abu-Ghazaleh and Daniel Wong and Sindhuja Elango",
title = "Locality-Aware {GPU} Register File",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "153--156",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2959298",
ISSN = "1556-6064",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In many emerging applications such as deep learning,
large data sets are essential to generate reliable
solutions. In these big data workloads, memory latency
and bandwidth are the main performance bottlenecks. In
this article, we propose a locality-aware GPU register
file that enables data sharing for memory-intensive big
data workloads on GPUs without relying on small on-chip
memories. We exploit two types of data sharing patterns
commonly found from the big data workloads and have
warps opportunistically share data in physical
registers instead of issuing memory loads separately
and storing the same data redundantly in their
registers as well as small shared memory. With an
extended register file mapping mechanism, our proposed
design enables warps to share data by simply mapping to
the same physical registers or reconstructing from the
data in the register file already. The proposed sharing
not only reduces the memory transactions but also
further decreases the register file usage. The spared
registers make rooms for applying orthogonal
optimizations for energy and performance improvement.
Our evaluation on two deep learning workloads and
matrixMul shows that the proposed locality-aware GPU
register file achieves over $ 2 \times $ speedup and
saves register space up to 57 percent.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Big Data; convolution neural network; Deep
learning; GPU; Graphics processing units; Matrix
operations; Registers; System-on-chip",
}
@Article{Li:2019:PBP,
author = "Chen Li and Yifan Sun and Lingling Jin and Lingjie Xu
and Zheng Cao and Pengfei Fan and David Kaeli and Sheng
Ma and Yang Guo and Jun Yang",
title = "Priority-Based {PCIe} Scheduling for Multi-Tenant
Multi-{GPU} Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "157--160",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2955119",
ISSN = "1556-6064",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Multi-GPU systems are widely used in data centers to
provide significant speedups to compute-intensive
workloads such as deep neural network training.
However, limited PCIe bandwidth between the CPU and
multiple GPUs becomes a major performance bottleneck.
We observe that relying on a traditional
Round-Robin-based PCIe scheduling policy can result in
severe bandwidth competition and stall the execution of
multiple GPUs. In this article, we propose a
priority-based scheduling policy which aims to overlap
the data transfers and GPU execution for different
applications to alleviate this bandwidth contention. We
also propose a dynamic priority policy for semi-QoS
management that can help applications to meet QoS
requirements and improve overall multi-GPU system
throughput. Experimental results show that the system
throughput is improved by 7.6 percent on average using
our priority-based PCIe scheduling scheme as compared
with a Round-Robin-based PCIe scheduler. Leveraging
semi-QoS management can help to meet defined QoS goals,
while preserving application throughput.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Data transfer; Graphics processing units;
Multi-GPU; multi-tenant; PCIe scheduling; Quality of
service; Switches; Task analysis; Throughput",
}
@Article{Weng:2019:DMC,
author = "Jian Weng and Sihao Liu and Vidushi Dadu and Tony
Nowatzki",
title = "{DAEGEN}: a Modular Compiler for Exploring Decoupled
Spatial Accelerators",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "161--165",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2955456",
ISSN = "1556-6064",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Specialized hardware accelerators, particularly those
that are programmable and flexible to target multiple
problems in their domain, have proven to provide orders
of magnitude speedup and energy efficiency. However,
their design requires extensive manual effort, due to
the need for hardware-software codesign to balance the
degree and forms of specialization to the domains or
program behaviors of interest. This article provides
the first steps towards one approach for automating
much of these processes. The insight behind our work is
to recognize that decoupled spatial architectures both
define a rich design space with many tradeoffs for
different kinds of applications, and also can be
composed out of a simple set of well-defined
primitives. Therefore, we propose a modular accelerator
design framework, DAEGEN, a.k.a. Decoupled Access
Execution Accelerator Generator. This article defines an
initial compiler and architecture primitives, and we
discuss key challenges.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; Delays; design automation;
Hardware; hardware/software co-design; Kernel; Manuals;
Micromechanical devices; Reconfigurable accelerators;
spatial architectures; Synchronization",
}
@Article{Iliakis:2019:LIG,
author = "Konstantinos Iliakis and Sotirios Xydis and Dimitrios
Soudris",
title = "{LOOG}: Improving {GPU} Efficiency With Light-Weight
Out-Of-Order Execution",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "166--169",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2951161",
ISSN = "1556-6064",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "GPUs are one of the most prevalent platforms for
accelerating general-purpose workloads due to their
intuitive programming model, computing capacity, and
cost-effectiveness. GPUs rely on massive
multi-threading and fast context switching to overlap
computations with memory operations. Among the diverse
GPU workloads, there exists a class of kernels that
fail to maintain a sufficient number of active warps to
hide the latency of memory operations, and thus suffer
from frequent stalling. We observe that these kernels
will benefit from increased levels of Instruction-Level
Parallelism and we propose a novel architecture with
lightweight Out-Of-Order execution capability. To
minimize hardware overheads, we carefully design our
extension to highly re-use the existing
micro-architectural structures. We show that the
proposed architecture outperforms traditional platforms
by 15 to 46 percent on average for low occupancy
kernels, with an area overhead of 0.74 to 3.94 percent.
Finally, we prove the potential of our proposal as a
GPU u-arch alternative, by providing a 5 percent
speedup over a wide collection of 63 general-purpose
kernels with as little as 0.74 percent area overhead.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Copper; GPGPU; Graphics processing units; Kernel;
micro-architecture; Out of order; Out-of-Order
execution; Radio access technologies; Radio frequency;
Registers",
}
@Article{Matsuo:2019:IIF,
author = "Reoma Matsuo and Ryota Shioya and Hideki Ando",
title = "Improving the Instruction Fetch Throughput with
Dynamically Configuring the Fetch Pipeline",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "170--173",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2952592",
ISSN = "1556-6064",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Instruction cache misses are the critical performance
bottleneck in the execution of recent workloads such as
Web applications written in JavaScript and server
applications. Although various instruction prefetchers
have been proposed to reduce the misses, the
requirements for both high miss coverage and small
hardware cost are not satisfied. In this article, we
propose a novel method that improves the instruction
fetch throughput not by instruction prefetching but by
dynamically configuring the fetch pipeline structure.
Our scheme switches between the normal pipeline and a
newly introduced miss-assuming pipeline, which does not
degrade the fetch throughput even when L1 instruction
cache misses occur. Our method achieves high
instruction fetch throughput with simple hardware and
small cost unlike previously proposed prefetchers. Our
evaluation results using Web and database workloads
show that our method improves the performance by 16.6
percent and 8.6 percent on average, compared to that
with no prefetching and the state-of-the-art instruction
prefetcher, PIF, respectively, and achieves as much as
79.0 percent of the performance of the processor with a
perfect instruction cache.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Cache storage; Instruction fetch; pipeline
implementation; Pipelines; Prefetching; Servers;
Throughput",
}
@Article{Kommareddy:2019:CMS,
author = "Vamsee Reddy Kommareddy and Baogang Zhang and Fan Yao
and Rickard Ewetz and Amro Awad",
title = "Are Crossbar Memories Secure? {New} Security
Vulnerabilities in Crossbar Memories",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "174--177",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2952111",
ISSN = "1556-6064",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Memristors are emerging Non-Volatile Memories (NVMs)
that are promising for building future memory systems.
Unlike DRAM, memristors are non-volatile, i.e., they
can retain data after power loss. In contrast to DRAM
where each cell is associated with a pass transistor,
memristor cells can be implemented without such a
transistor, and hence enable high-density ReRAM
systems. Moreover, memristors leverage a unique
crossbar architecture to improve the density of memory
modules. Memristors have been considered to build
future data centers with both energy-efficiency and
high memory capacity goals. Surprisingly, we observe
that using memristors in multi-tenant environments,
e.g., cloud systems, entails new security
vulnerabilities. In particular, the crossbar contents
can severely affect the write latency of any data cells
within the same crossbar. With various memory
interleaving options (to optimize performance), a
single crossbar might be shared among several
applications/users from different security domains.
Therefore, such content-dependent latency can open a new
source of information leakage. In this article, we
describe the information leakage problem in memristor
crossbar arrays (MCAs), and discuss how they can be
potentially exploited from the application level. Our work
highlights the need for future research to mitigate
(and potentially eliminate) information leakage in
crossbar memories in future computing systems.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; Crossbar memory; Memristors;
Microprocessors; Nonvolatile memory; Random access
memory; ReRAM; Security; security",
}
@Article{Barber:2019:ISD,
author = "Kristin Barber and Anys Bacha and Li Zhou and Yinqian
Zhang and Radu Teodorescu",
title = "Isolating Speculative Data to Prevent Transient
Execution Attacks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "18",
number = "2",
pages = "178--181",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2916328",
ISSN = "1556-6064",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Hardware security has recently re-surfaced as a
first-order concern to the confidentiality protections
of computing systems. Meltdown and Spectre introduced a
new class of exploits that leverage transient state as
an attack surface and have revealed fundamental
security vulnerabilities of speculative execution in
high-performance processors. These attacks derive
benefit from the fact that programs may speculatively
execute instructions outside their legal control flows.
This insight is then utilized for gaining access to
restricted data and exfiltrating it by means of a
covert channel. This study presents a
microarchitectural mitigation technique for shielding
transient state from covert channels during speculative
execution. Unlike prior work that has focused on
closing individual covert channels used to leak
sensitive information, this approach prevents the use
of speculative data by downstream instructions until
doing so is determined to be safe. This prevents
transient execution attacks at a cost of 18 percent
average performance degradation.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "covert timing channels; Delays; Hardware security;
Law; Pipelines; Registers; Security; Transient
analysis; transient execution attacks",
}
@Article{Kang:2020:NPP,
author = "Ki-Dong Kang and Gyeongseo Park and Nam Sung Kim and
Daehoon Kim",
title = "Network Packet Processing Mode-Aware Power Management
for Data Center Servers",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2019.2926079",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Cavus:2020:EPP,
author = "Mustafa Cavus and Mohammed Shatnawi and Resit Sendag
and Augustus K. Uht",
title = "Exploring Prefetching, Pre-Execution and Branch
Outcome Streaming for In-Memory Database Lookups",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2019.2959982",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Bodduna:2020:BRS,
author = "Rahul Bodduna and Vinod Ganesan and Patanjali SLPSK
and Kamakoti Veezhinathan and Chester Rebeiro",
title = "{Brutus}: Refuting the Security Claims of the Cache
Timing Randomization Countermeasure Proposed in
{CEASER}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2964212",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Kim:2020:TSA,
author = "Minsub Kim and Jaeha Kung and Sungjin Lee",
title = "Towards Scalable Analytics with Inference-Enabled
Solid-State Drives",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "13--17",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2019.2930590",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Li:2020:CDE,
author = "Congmiao Li and Jean-Luc Gaudiot",
title = "Challenges in Detecting an Evasive Spectre",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "18--21",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2976069",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Yan:2020:CUG,
author = "Mingyu Yan and Zhaodong Chen and Lei Deng and Xiaochun
Ye and Zhimin Zhang and Dongrui Fan and Yuan Xie",
title = "Characterizing and Understanding {GCNs} on {GPU}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "22--25",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2970395",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Kumar:2020:PSM,
author = "Chanchal Kumar and Aayush Chaudhary and Shubham
Bhawalkar and Utkarsh Mathur and Saransh Jain and Adith
Vastrad and Eric Rotenberg",
title = "Post-Silicon Microarchitecture",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "26--29",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2978841",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Eyerman:2020:BOB,
author = "Stijn Eyerman and Wim Heirman and Sam Van den Steen
and Ibrahim Hur",
title = "Breaking In-Order Branch Miss Recovery",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "30--33",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2980277",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Liu:2020:STA,
author = "Zhi-Gang Liu and Paul N. Whatmough and Matthew
Mattina",
title = "Systolic Tensor Array: an Efficient Structured-Sparse
{GEMM} Accelerator for Mobile {CNN} Inference",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "34--37",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2979965",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Krishnan:2020:SLV,
author = "Srivatsan Krishnan and Zishen Wan and Kshitij Bhardwaj
and Paul Whatmough and Aleksandra Faust and Gu-Yeon Wei
and David Brooks and Vijay Janapa Reddi",
title = "The Sky Is Not the Limit: a Visual Performance Model
for Cyber-Physical Co-Design in Autonomous Machines",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "38--42",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2981022",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Michaud:2020:ETT,
author = "Pierre Michaud",
title = "Exploiting Thermal Transients With Deterministic Turbo
Clock Frequency",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "43--46",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2983920",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Chu:2020:HPD,
author = "Zhufei Chu and Huiming Tian and Zeqiang Li and Yinshui
Xia and Lunyao Wang",
title = "A High-Performance Design of Generalized Pipeline
Cellular Array",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "47--50",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2986197",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Zhu:2020:HIR,
author = "Lingjun Zhu and Lennart Bamberg and Anthony Agnesina
and Francky Catthoor and Dragomir Milojevic and Manu
Komalan and Julien Ryckaert and Alberto Garcia-Ortiz
and Sung Kyu Lim",
title = "Heterogeneous {$3$D} Integration for a {RISC-V} System
With {STT-MRAM}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "51--54",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2992644",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Mason:2020:UPI,
author = "Tony Mason and Thaleia Dimitra Doudali and Margo
Seltzer and Ada Gavrilovska",
title = "Unexpected Performance of {Intel Optane DC} Persistent
Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "55--58",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2987303",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Zhang:2020:AIG,
author = "Zhihui Zhang and Jingwen Leng and Lingxiao Ma and
Youshan Miao and Chao Li and Minyi Guo",
title = "Architectural Implications of Graph Neural Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "59--62",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2988991",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Sartor:2020:HHL,
author = "Anderson L. Sartor and Anish Krishnakumar and Samet E.
Arda and Umit Y. Ogras and Radu Marculescu",
title = "{HiLITE}: Hierarchical and Lightweight Imitation
Learning for Power Management of Embedded {SoCs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "63--67",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2992182",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Desai:2020:PAH,
author = "Harsh Desai and Brandon Lucia",
title = "A Power-Aware Heterogeneous Architecture Scaling Model
for Energy-Harvesting Computers",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "68--71",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2989440",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Lai:2020:TDB,
author = "Bo-Cheng Lai and Chun-Yen Chen and Yi-Da Hsin and
Bo-Yen Lin",
title = "A Two-Directional {BigData} Sorting Architecture on
{FPGAs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "72--75",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2993040",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Gu:2020:NTC,
author = "Peng Gu and Benjamin S. Lim and Wenqin Huangfu and
Krishan T. Malladi and Andrew Chang and Yuan Xie",
title = "{NMTSim}: Transaction-Command Based Simulator for New
Memory Technology Devices",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "76--79",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2995167",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Rezaei:2020:NNM,
author = "Seyyed Hossein SeyyedAghaei Rezaei and Mehdi
Modarressi and Rachata Ausavarungnirun and Mohammad
Sadrosadati and Onur Mutlu and Masoud Daneshtalab",
title = "{NoM}: Network-on-Memory for Inter-Bank Data Transfer
in Highly-Banked Memories",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "80--83",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2990599",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2020:IIC,
author = "Anonymous",
title = "2019 Index {{\booktitle{IEEE Computer Architecture
Letters}}} Vol. 18",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "1",
pages = "1--8",
month = jan # "\slash " # jun,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2964168",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Ros:2020:EIP,
author = "Alberto Ros and Alexandra Jimborean",
title = "The Entangling Instruction Prefetcher",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "84--87",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3002947",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Singh:2020:VLB,
author = "Rahul Singh and Gokul Subramanian Ravi and Mikko
Lipasti and Joshua San Miguel",
title = "Value Locality Based Approximation With {ODIN}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "88--91",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3002542",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Zhang:2020:FRP,
author = "Jie Zhang and Miryeong Kwon and Sanghyun Han and Nam
Sung Kim and Mahmut Kandemir and Myoungsoo Jung",
title = "{FastDrain}: Removing Page Victimization Overheads in
{NVMe} Storage Stack",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "92--96",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3005507",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Im:2020:PBA,
author = "Junsu Im and Hanbyeol Kim and Yumin Won and Jiho Oh
and Minjae Kim and Sungjin Lee",
title = "Probability-Based Address Translation for Flash
{SSDs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "97--100",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3006529",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Samara:2020:CDS,
author = "Ahmed Samara and James Tuck",
title = "The Case for Domain-Specialized Branch Predictors for
Graph-Processing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "101--104",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3005895",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Mirosanlou:2020:MED,
author = "Reza Mirosanlou and Danlu Guo and Mohamed Hassan and
Rodolfo Pellizzoni",
title = "{MCsim}: an Extensible {DRAM} Memory Controller
Simulator",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "105--109",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3008288",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Li:2020:DCA,
author = "Shang Li and Zhiyuan Yang and Dhiraj Reddy and Ankur
Srivastava and Bruce Jacob",
title = "{DRAMsim3}: a Cycle-Accurate, Thermal-Capable {DRAM}
Simulator",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "106--109",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.2973991",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Lee:2020:SFA,
author = "Joo Hwan Lee and Hui Zhang and Veronica Lagrange and
Praveen Krishnamoorthy and Xiaodong Zhao and Yang Seok
Ki",
title = "{SmartSSD}: {FPGA} Accelerated Near-Storage Data
Analytics on {SSD}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "110--113",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3009347",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Sutradhar:2020:PPP,
author = "Purab Ranjan Sutradhar and Mark Connolly and Sathwika
Bavikadi and Sai Manoj Pudukotai Dinakarrao and Mark A.
Indovina and Amlan Ganguly",
title = "{pPIM}: a Programmable Processor-in-Memory
Architecture With Precision-Scaling for Deep Learning",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "118--121",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3011643",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Choe:2020:SMP,
author = "Wonkyo Choe and Jonghyeon Kim and Jeongseob Ahn",
title = "A Study of Memory Placement on Hardware-Assisted
Tiered Memory Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "122--125",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3015613",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Lachtar:2020:CSA,
author = "Nada Lachtar and Abdulrahman Abu Elkhail and Anys
Bacha and Hafiz Malik",
title = "A Cross-Stack Approach Towards Defending Against
Cryptojacking",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "126--129",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3017457",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Golshan:2020:HPC,
author = "Fatemeh Golshan and Mohammad Bakhshalipour and Mehran
Shakerinava and Ali Ansari and Pejman Lotfi-Kamran and
Hamid Sarbazi-Azad",
title = "Harnessing Pairwise-Correlating Data Prefetching With
Runahead Metadata",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "130--133",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3019343",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Lazarev:2020:DTE,
author = "Nikita Lazarev and Neil Adit and Shaojie Xiang and
Zhiru Zhang and Christina Delimitrou",
title = "{Dagger}: Towards Efficient {RPCs} in Cloud
Microservices With Near-Memory Reconfigurable {NICs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "134--138",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3020064",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Jahanshahi:2020:GNC,
author = "Ali Jahanshahi and Hadi Zamani Sabzi and Chester Lau
and Daniel Wong",
title = "{GPU-NEST}: Characterizing Energy Efficiency of
Multi-{GPU} Inference Servers",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "139--142",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3023723",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Mikhailenko:2020:ASA,
author = "Darya Mikhailenko and Yujin Nakamoto and Ben Feinberg
and Engin Ipek",
title = "Adapting In Situ Accelerators for Sparsity with
Granular Matrix Reordering",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "143--146",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3031907",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Ishii:2020:RIP,
author = "Yasuo Ishii and Jaekyu Lee and Krishnendra Nathella
and Dam Sunwoo",
title = "Rebasing Instruction Prefetching: an Industry
Perspective",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "147--150",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3035068",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Newton:2020:PGP,
author = "Newton and Virendra Singh and Trevor E. Carlson",
title = "{PIM-GraphSCC}: {PIM}-Based Graph Processing Using
Graph's Community Structures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "151--154",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3039498",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Chowdhury:2020:VNM,
author = "Zamshed I. Chowdhury and S. Karen Khatamifard and
Zhaoyong Zheng and Tali Moreshet and R. Iris Bahar and
Ulya R. Karpuzcu",
title = "Voltage Noise Mitigation With Barrier Approximation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "155--158",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3040088",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Che:2020:LMA,
author = "Yuezhi Che and Yuanzhou Yang and Amro Awad and Rujia
Wang",
title = "A Lightweight Memory Access Pattern Obfuscation
Framework for {NVM}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "163--166",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3041484",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Sadredini:2020:ESP,
author = "Elaheh Sadredini and Reza Rahimi and Kevin Skadron",
title = "Enabling In-{SRAM} Pattern Processing With
Low-Overhead Reporting Architecture",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "167--170",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3042194",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Sharifi:2020:AAC,
author = "Ferdous Sharifi and Nezam Rohbani and Shaahin
Hessabi",
title = "Aging-Aware Context Switching in Multicore Processors
Based on Workload Classification",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "19",
number = "2",
pages = "159--162",
month = jul # "\slash " # dec,
year = "2020",
DOI = "https://doi.org/10.1109/LCA.2020.3040326",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2021:IIC,
author = "Anonymous",
title = "2020 Index {{\booktitle{IEEE Computer Architecture
Letters}}} Vol. 19",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "1--7",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2020.3048555",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Kwon:2021:FQM,
author = "Hyoukjun Kwon and Michael Pellauer and Angshuman
Parashar and Tushar Krishna",
title = "{Flexion}: a Quantitative Metric for Flexibility in
{DNN} Accelerators",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2020.3044607",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Kim:2021:TTR,
author = "Byeongho Kim and Jaehyun Park and Eojin Lee and Minsoo
Rhu and Jung Ho Ahn",
title = "{TRiM}: Tensor Reduction in Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2020.3042805",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Boran:2021:FGS,
author = "Nirmal Kumar Boran and Shubhankit Rathore and Meet
Udeshi and Virendra Singh",
title = "Fine-Grained Scheduling in Heterogeneous-{ISA}
Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2020.3045056",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Resch:2021:DLQ,
author = "Salonik Resch and Swamit Tannu and Ulya R. Karpuzcu
and Moinuddin Qureshi",
title = "A Day In the Life of a Quantum Error",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "13--16",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2020.3045628",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Shan:2021:ACP,
author = "Mohsin Shan and Omer Khan",
title = "Accelerating Concurrent Priority Scheduling Using
Adaptive in-Hardware Task Distribution in Multicores",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "17--21",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2020.3045670",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Perais:2021:CSS,
author = "Arthur Perais",
title = "A Case for Speculative Strength Reduction",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "22--25",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2020.3048694",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Navarro:2021:HSS,
author = "Marta Navarro and Lucia Pons and Julio Sahuquillo",
title = "{Hy-Sched}: a Simple Hyperthreading-Aware Thread to
Core Allocation Strategy",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "26--29",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3051393",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Alian:2021:IOI,
author = "Mohammad Alian and Jongmin Shin and Ki-Dong Kang and
Ren Wang and Alexandros Daglis and Daehoon Kim and Nam
Sung Kim",
title = "{IDIO}: Orchestrating Inbound Network Data on Server
Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "30--33",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2020.3044923",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Kim:2021:RSD,
author = "Hweesoo Kim and Sunjung Lee and Jaewan Choi and Jung
Ho Ahn",
title = "Row-Streaming Dataflow Using a Chaining Buffer and
Systolic Array+ Structure",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "34--37",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3054371",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Kasan:2021:CDB,
author = "Hans Kasan and John Kim",
title = "The Case for Dynamic Bias in Global Adaptive Routing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "38--41",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3061408",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Shah:2021:TDS,
author = "Parth Shah and Ranjal Gautham Shenoy and Vaidyanathan
Srinivasan and Pradip Bose and Alper Buyuktosunoglu",
title = "{TokenSmart}: Distributed, Scalable Power Management
in the Many-Core Era",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "42--45",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3064441",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Li:2021:RRA,
author = "Qian Li and Bin Li and Pietro Mercati and Ramesh
Illikkal and Charlie Tai and Michael Kishinevsky and
Christos Kozyrakis",
title = "{RAMBO}: Resource Allocation for Microservices Using
{Bayesian} Optimization",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "46--49",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3066142",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Kim:2021:ZCS,
author = "Sunghwan Kim and Gyusun Lee and Jiwon Woo and Jinkyu
Jeong",
title = "Zero-Copying {I/O} Stack for Low-Latency {SSDs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "50--53",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3064876",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Yu:2021:MDC,
author = "Chao Yu and Sihang Liu and Samira Khan",
title = "{MultiPIM}: a Detailed and Configurable Multi-Stack
Processing-In-Memory Simulator",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "54--57",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3061905",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Tan:2021:FQF,
author = "Tian Tan and Eriko Nurvitadhi and Aravind Dasu and
Martin Langhammer and Derek Chiou",
title = "{FlexScore}: Quantifying Flexibility",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "58--4",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3076413",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jul 8 12:08:28 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Sarkar:2021:DDA,
author = "Arindam Sarkar and Newton Singh and Varun Venkitaraman
and Virendra Singh",
title = "{DAM}: Deadblock Aware Migration Techniques for
{STT-RAM}-Based Hybrid Caches",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "62--4",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3071717",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jul 8 12:08:28 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Li:2021:HAG,
author = "Han Li and Mingyu Yan and Xiaocheng Yang and Lei Deng
and Wenming Li and Xiaochun Ye and Dongrui Fan and Yuan
Xie",
title = "Hardware Acceleration for {GCNs} via Bidirectional
Fusion",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "66--4",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3077956",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jul 8 12:08:28 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Jang:2021:DPT,
author = "Yongjoo Jang and Sejin Kim and Daehoon Kim and Sungjin
Lee and Jaeha Kung",
title = "Deep Partitioned Training From Near-Storage Computing
to {DNN} Accelerators",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "70--73",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3081752",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jul 8 12:08:28 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Resch:2021:CPC,
author = "Salonik Resch and Husrev Cilasun and Ulya R.
Karpuzcu",
title = "Cryogenic {PIM}: Challenges Opportunities",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "74--77",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3077536",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jul 8 12:08:28 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Heirman:2021:RRC,
author = "Wim Heirman and Stijn Eyerman and Kristof {Du Bois}
and Ibrahim Hur",
title = "{RIO}: {ROB}-Centric In-Order Modeling of Out-of-Order
Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "1",
pages = "78--81",
month = jan # "\slash " # jun,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3084365",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jul 8 12:08:28 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Amarnath:2021:HAS,
author = "Aporva Amarnath and Subhankar Pal and Hiwot Tadese
Kassa and Augusto Vega and Alper Buyuktosunoglu and
Hubertus Franke and John-David Wellman and Ronald
Dreslinski and Pradip Bose",
title = "Heterogeneity-Aware Scheduling on {SoCs} for
Autonomous Vehicles",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "82--85",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3085505",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Aug 10 15:14:44 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Wang:2021:WWP,
author = "Lei Wang and Xingwang Xiong and Jianfeng Zhan and
Wanling Gao and Xu Wen and Guoxin Kang and Fei Tang",
title = "{WPC}: Whole-Picture Workload Characterization Across
Intermediate Representation, {ISA}, and
Microarchitecture",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "86--89",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3087828",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Aug 10 15:14:44 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Eyerman:2021:MDT,
author = "Stijn Eyerman and Wim Heirman and Ibrahim Hur",
title = "Modeling {DRAM} Timing in Parallel Simulators With
Immediate-Response Memory Model",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "90--93",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3093075",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Aug 10 15:14:44 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Falahati:2021:DAC,
author = "Hajar Falahati and Masoud Peyro and Hossein Amini and
Mehran Taghian and Mohammad Sadrosadati and Pejman
Lotfi-Kamran and Hamid Sarbazi-Azad",
title = "Data-Aware Compression of Neural Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "94--97",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3096191",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Aug 10 15:14:44 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Wu:2021:GOD,
author = "Benjamin Wu and Trishita Tiwari and G. Edward Suh and
Aaron B. Wagner",
title = "Guessing Outputs of Dynamically Pruned {CNNs} Using
Memory Access Patterns",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "98--101",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3101505",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Yoo:2021:MBU,
author = "Mingi Yoo and Jaeyong Song and Jounghoo Lee and
Namhyung Kim and Youngsok Kim and Jinho Lee",
title = "Making a Better Use of Caches for {GCN} Accelerators
with Feature Slicing and Automatic Tile Morphing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "102--105",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3090954",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Hyun:2021:CAD,
author = "Bongjoon Hyun and Jiwon Lee and Minsoo Rhu",
title = "Characterization and Analysis of Deep Learning for
{3D} Point Cloud Analytics",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "106--109",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3099117",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Rucker:2021:CTB,
author = "Alexander Rucker and Muhammad Shahbaz and Kunle
Olukotun",
title = "Chopping off the Tail: Bounded Non-Determinism for
Real-Time Accelerators",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "110--113",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3102224",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Su:2021:EPA,
author = "Jiya Su and Linfeng He and Peng Jiang and Rujia Wang",
title = "Exploring {PIM} Architecture for High-Performance
Graph Pattern Mining",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "114--117",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3103665",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Lee:2021:UIN,
author = "Yunjae Lee and Youngeun Kwon and Minsoo Rhu",
title = "Understanding the Implication of Non-Volatile Memory
for Large-Scale Graph Neural Network Training",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "118--121",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3098943",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Munoz-Martinez:2021:SEC,
author = "Francisco Mu{\~n}oz-Mart{\'\i}nez and Jos{\'e} L.
Abell{\'a}n and Manuel E. Acacio and Tushar Krishna",
title = "{STONNE}: Enabling Cycle-Level Microarchitectural
Simulation for {DNN} Inference Accelerators",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "122--125",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3097253",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Shoghi:2021:SSQ,
author = "Nima Shoghi and Andrei Bersatti and Moinuddin Qureshi
and Hyesoon Kim",
title = "{SmaQ}: Smart Quantization for {DNN} Training by
Exploiting Value Clustering",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "126--129",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3108505",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Volos:2021:CRA,
author = "Haris Volos",
title = "The Case for Replication-Aware Memory-Error Protection
in Disaggregated Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "130--133",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3110439",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Asheim:2021:BXS,
author = "Truls Asheim and Boris Grot and Rakesh Kumar",
title = "{BTB-X}: a Storage-Effective {BTB} Organization",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "134--137",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3109945",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Kumar:2021:DDS,
author = "Pratik Kumar and Chavhan Sujeet Yashavant and
Biswabandan Panda",
title = "{DAMARU}: a Denial-of-Service Attack on Randomized
Last-Level Caches",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "138--141",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3112180",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Ghasemi:2021:MPE,
author = "Fatemeh Ghasemi and Magnus Jahre",
title = "Modeling Periodic Energy-Harvesting Computing
Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "142--145",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3117031",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Kalani:2021:ICB,
author = "Neelu Shivprakash Kalani and Biswabandan Panda",
title = "Instruction Criticality Based Energy-Efficient
Hardware Data Prefetching",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "146--149",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3117005",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Kim:2021:DSR,
author = "Jiho Kim and Myoungsoo Jung and John Kim",
title = "Decoupled {SSD}: Reducing Data Movement on
{NAND}-Based Flash {SSD}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "150--153",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3118688",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Lee:2021:LPM,
author = "Hyeon Gyu Lee and Minwook Kim and Juwon Lee and Eunji
Lee and Bryan S. Kim and Sungjin Lee and Yeseong Kim
and Sang Lyul Min and Jin-Soo Kim",
title = "Learned Performance Model for {SSD}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "154--157",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3120728",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Gurumurthi:2021:HRE,
author = "Sudhanva Gurumurthi and Kijun Lee and Munseon Jang and
Vilas Sridharan and Aaron Nygren and Yesin Ryu and
Kyomin Sohn and Taekyun Kim and Hoeju Chung",
title = "{HBM3 RAS}: Enhancing Resilience at Scale",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "158--161",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3117150",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Aimoniotis:2021:RBC,
author = "Pavlos Aimoniotis and Christos Sakalis and Magnus
Sj{\"a}lander and Stefanos Kaxiras",
title = "Reorder Buffer Contention: a Forward Speculative
Interference Attack for Speculation Invariant
Instructions",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "162--165",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3123408",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Nabavinejad:2021:BLB,
author = "Seyed Morteza Nabavinejad and Sherief Reda",
title = "{BayesTuner}: Leveraging {Bayesian} Optimization For
{DNN} Inference Configuration Selection",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "166--170",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3123695",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Ham:2021:NDP,
author = "Hyungkyu Ham and Hyunuk Cho and Minjae Kim and Jueon
Park and Jeongmin Hong and Hyojin Sung and Eunhyeok
Park and Euicheol Lim and Gwangsun Kim",
title = "Near-Data Processing in Memory Expander for {DNN}
Acceleration on {GPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "171--174",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3126450",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Liu:2021:SMS,
author = "Wenjie Liu and Wim Heirman and Stijn Eyerman and
Shoaib Akram and Lieven Eeckhout",
title = "Scale-Model Simulation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "20",
number = "2",
pages = "175--178",
month = jul # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/LCA.2021.3133112",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2022:IIC,
author = "Anonymous",
title = "2021 Index {{\booktitle{IEEE Computer Architecture
Letters}}} Vol. 20",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "1",
pages = "1--8",
month = jan # "\slash " # jun,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3141948",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Xie:2022:MSS,
author = "Xinfeng Xie and Peng Gu and Jiayi Huang and Yufei Ding
and Yuan Xie",
title = "{MPU-Sim}: a Simulator for In-{DRAM} Near-Bank
Processing Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2021.3135557",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Zou:2022:AGP,
author = "Mo Zou and Mingzhe Zhang and Rujia Wang and Xian-He
Sun and Xiaochun Ye and Dongrui Fan and Zhimin Tang",
title = "Accelerating Graph Processing With Lightweight
Learning-Based Data Reordering",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3151087",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Barber:2022:PSA,
author = "Kristin Barber and Moein Ghaniyoun and Yinqian Zhang
and Radu Teodorescu",
title = "A Pre-Silicon Approach to Discovering
Microarchitectural Vulnerabilities in Security Critical
Applications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3151256",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Lee:2022:MES,
author = "Dusol Lee and Duwon Hong and Wonil Choi and Jihong
Kim",
title = "{MQSim-E}: an Enterprise {SSD} Simulator",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "1",
pages = "13--16",
month = jan # "\slash " # jun,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3144773",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Lucas:2022:LHI,
author = "Benjamin J. Lucas and Ali Alwan and Marion Murzello
and Yazheng Tu and Pengzhou He and Andrew J. Schwartz
and David Guevara and Ujjwal Guin and Kyle Juretus and
Jiafeng Xie",
title = "Lightweight Hardware Implementation of Binary
Ring-{LWE} {PQC} Accelerator",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3160394",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Apr 14 17:00:32 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Shin:2022:RSA,
author = "Yongwon Shin and Juseong Park and Jeongmin Hong and
Hyojin Sung",
title = "Runtime Support for Accelerating {CNN} Models on
Digital {DRAM} Processing-in-Memory Hardware",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "33--36",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3182363",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Jin:2022:MPC,
author = "Hoyong Jin and Donghun Jeong and Taewon Park and Jong
Hwan Ko and Jungrae Kim",
title = "Multi-Prediction Compression: an Efficient and
Scalable Memory Compression Framework for {GP-GPU}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "37--40",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3177419",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Kokkinis:2022:DOC,
author = "Argyris Kokkinis and Dionysios Diamantopoulos and
Kostas Siozios",
title = "Dynamic Optimization of On-Chip Memories for {HLS}
Targeting Many-Accelerator Platforms",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "41--44",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3190048",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Yun:2022:GND,
author = "Sungmin Yun and Byeongho Kim and Jaehyun Park and
Hwayong Nam and Jung Ho Ahn and Eojin Lee",
title = "{GraNDe}: Near-Data Processing Architecture With
Adaptive Matrix Mapping for Graph Convolutional
Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "45--48",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3182387",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Ma:2022:FBA,
author = "Rui Ma and Evangelos Georganas and Alexander Heinecke
and Sergey Gribok and Andrew Boutros and Eriko
Nurvitadhi",
title = "{FPGA-Based} {AI} Smart {NICs} for Scalable
Distributed {AI} Training Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "49--52",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3189207",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Hameed:2022:DPA,
author = "Fazal Hameed and Asif Ali Khan and Sebastien Ollivier
and Alex K. Jones and Jeronimo Castrillon",
title = "{DNA} Pre-Alignment Filter Using Processing Near
Racetrack Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "53--56",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3194263",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Yang:2022:SEP,
author = "Ling Yang and Libo Huang and Run Yan and Nong Xiao and
Sheng Ma and Li Shen and Weixia Xu",
title = "Stride Equality Prediction for Value Speculation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "57--60",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3195411",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Hong:2022:OMC,
author = "Jeongmin Hong and Sungjun Cho and Gwangsun Kim",
title = "Overcoming Memory Capacity Wall of {GPUs} With
Heterogeneous Memory Stack",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "61--64",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3196932",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Piccolboni:2022:ASS,
author = "Luca Piccolboni and Davide Giri and Luca P. Carloni",
title = "Accelerators \& Security: The Socket Approach",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "65--68",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3179947",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Yan:2022:CUH,
author = "Mingyu Yan and Mo Zou and Xiaocheng Yang and Wenming
Li and Xiaochun Ye and Dongrui Fan and Yuan Xie",
title = "Characterizing and Understanding {HGNNs} on {GPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "69--72",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3198281",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Accetti:2022:SCE,
author = "Cecil Accetti and Rendong Ying and Peilin Liu",
title = "Structured Combinators for Efficient Graph Reduction",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "73--76",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3198844",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Omori:2022:OSH,
author = "Yu Omori and Keiji Kimura",
title = "Open-Source Hardware Memory Protection Engine
Integrated With {NVMM} Simulator",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "77--80",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3197777",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Kim:2022:CSD,
author = "Minjae Kim and Bryan S. Kim and Eunji Lee and Sungjin
Lee",
title = "A Case Study of a {DRAM-NVM} Hybrid Memory Allocator
for Key--Value Stores",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "81--84",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3197654",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Wang:2022:ISE,
author = "Zhengrong Wang and Christopher Liu and Tony Nowatzki",
title = "{Infinity Stream}: Enabling Transparent and Automated
In-Memory Computing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "85--88",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3203064",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Wu:2022:DCG,
author = "Lingxi Wu and Rasool Sharifi and Ashish Venkat and
Kevin Skadron",
title = "{DRAM-CAM}: General-Purpose Bit-Serial Exact Pattern
Matching",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "89--92",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3201168",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Resch:2022:VSQ,
author = "Salonik Resch and Ulya Karpuzcu",
title = "On Variable Strength Quantum {ECC}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "93--96",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3200204",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Salvesen:2022:LAR,
author = "Peter Salvesen and Magnus Jahre",
title = "{LMT}: Accurate and Resource-Scalable Slowdown
Prediction",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "97--100",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3203483",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Shin:2022:OOS,
author = "Gyeongcheol Shin and Junsoo Kim and Joo-Young Kim",
title = "{OpenMDS}: an Open-Source Shell Generation Framework
for High-Performance Design on {Xilinx} Multi-Die
{FPGAs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "101--104",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3202016",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Jalili:2022:MPD,
author = "Majid Jalili and Mattan Erez",
title = "Managing Prefetchers With Deep Reinforcement
Learning",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "105--108",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3210397",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Lenjani:2022:PAH,
author = "Marzieh Lenjani and Alif Ahmed and Kevin Skadron",
title = "{Pulley}: an Algorithm\slash Hardware Co-Optimization
for In-Memory Sorting",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "109--112",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3208255",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Sorting is an important kernel that requires many
passes over the data, where each pass imposes significant
data movement overhead. Processing in memory (PIM) can
reduce this data movement overhead while providing high
parallelism. The radix sorting algorithm is scalable
and can exploit PIM's parallelism. However, this
algorithm is inefficient on current PIM-based
accelerators for three reasons: (i) it requires a large
intermediate array per processing unit, wasting
capacity, (ii) it requires a prefix-sum operation across
all the large intermediate arrays, imposing performance
overhead, and (iii) it requires significant random
accesses, which are costly in PIM. In this paper, we
propose an algorithm and hardware co-optimization for
sorting that enables every group of processing elements
to cooperatively share and generate an intermediate
array, reducing the capacity overhead of intermediate
arrays and the performance overhead of the prefix-sum
operation. To prevent the shared array from becoming a
bottleneck due to random accesses, we eliminate random
accesses by adding a local sorting step to the radix
sort and providing efficient hardware support for
this step. On average, our hardware/algorithm
optimizations, Pulley, deliver a 20$ \times $ speedup
compared to Bonsai, an FPGA-based sorting accelerator,
and a 13$ \times $ speedup compared to IMC, an
in-logic-layer-based sorting accelerator.",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Zhu:2022:RBP,
author = "Yongye Zhu and Shijia Wei and Mohit Tiwari",
title = "Revisiting Browser Performance Benchmarking From an
Architectural Perspective",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "113--116",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3210483",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Gouk:2022:PHA,
author = "Donghyun Gouk and Seungkwan Kang and Miryeong Kwon and
Junhyeok Jang and Hyunkyu Choi and Sangwon Lee and
Myoungsoo Jung",
title = "{PreGNN}: Hardware Acceleration to Take Preprocessing
Off the Critical Path in Graph Neural Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "117--120",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3193256",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Wang:2022:CIR,
author = "Yinshen Wang and Wenming Li and Tianyu Liu and
Liangjiang Zhou and Bingnan Wang and Zhihua Fan and
Xiaochun Ye and Dongrui Fan and Chibiao Ding",
title = "Characterization and Implementation of Radar System
Applications on a Reconfigurable Dataflow
Architecture",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "121--124",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3215595",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Hou:2022:CUE,
author = "Xiaofeng Hou and Cheng Xu and Jiacheng Liu and Xuehan
Tang and Lingyu Sun and Chao Li and Kwang-Ting Cheng",
title = "Characterizing and Understanding End-to-End
Multi-Modal Neural Networks on {GPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "125--128",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3215718",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Nye:2022:SSS,
author = "Jared Nye and Omer Khan",
title = "{SSE}: Security Service Engines to Accelerate Enclave
Performance in Secure Multicore Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "129--132",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3210149",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Chacon:2022:HTT,
author = "Gino A. Chacon and Charles Williams and Johann
Knechtel and Ozgur Sinanoglu and Paul V. Gratz",
title = "Hardware {Trojan} Threats to Cache Coherence in Modern
{2.5D} Chiplet Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "133--136",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3216820",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Eeckhout:2022:FOM,
author = "Lieven Eeckhout",
title = "A First-Order Model to Assess Computer Architecture
Sustainability",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "137--140",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3217366",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Zhou:2022:LPL,
author = "Ranyang Zhou and Sepehr Tabrizchi and Arman Roohi and
Shaahin Angizi",
title = "{LT-PIM}: an {LUT-Based} {Processing-in-DRAM}
Architecture With {RowHammer} Self-Tracking",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "141--144",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3220084",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Park:2022:SML,
author = "Jongwon Park and Jinkyu Jeong",
title = "Speculative Multi-Level Access in {LSM} Tree-Based
{KV} Store",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "145--148",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3219808",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Fariborz:2022:MSB,
author = "Marjan Fariborz and Mahyar Samani and Terry O'Neill
and Jason Lowe-Power and S. J. Ben Yoo and Venkatesh
Akella",
title = "A Model for Scalable and Balanced Accelerators for
Graph Processing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "149--152",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3215489",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Huang:2022:EDC,
author = "Jianming Huang and Yu Hua",
title = "Ensuring Data Confidentiality in {eADR-Based} {NVM}
Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "153--156",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3225949",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Kim:2022:SSE,
author = "Sejin Kim and Jungwoo Kim and Yongjoo Jang and Jaeha
Kung and Sungjin Lee",
title = "{SEMS}: Scalable Embedding Memory System for
Accelerating Embedding-Based {DNNs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "21",
number = "2",
pages = "157--160",
month = jul # "\slash " # dec,
year = "2022",
DOI = "https://doi.org/10.1109/LCA.2022.3227560",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Jimenez:2023:LLC,
author = "Daniel A. Jim{\'e}nez and Elvira Teran and Paul V.
Gratz",
title = "Last-Level Cache Insertion and Promotion Policy in the
Presence of Aggressive Prefetching",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3242178",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Moon:2023:AAD,
author = "Yaebin Moon and Wanju Doh and Kwanhee Kyung and Eojin
Lee and Jung Ho Ahn",
title = "{ADT}: Aggressive Demotion and Promotion for Tiered
Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "1",
pages = "21--24",
month = jan # "\slash " # jun,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3236685",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Park:2023:CEE,
author = "Gyeongseo Park and Ki-Dong Kang and Minho Kim and
Daehoon Kim",
title = "{CoreNap}: Energy Efficient Core Allocation for
Latency-Critical Workloads",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2022.3227629",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Sim:2023:CCM,
author = "Joonseop Sim and Soohong Ahn and Taeyoung Ahn and
Seungyong Lee and Myunghyun Rhee and Jooyoung Kim and
Kwangsik Shin and Donguk Moon and Euiseok Kim and
Kyoung Park",
title = "Computational {CXL-Memory} Solution for Accelerating
Memory-Intensive Applications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2022.3226482",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Ringlein:2023:ACD,
author = "Burkhard Ringlein and Francois Abel and Dionysios
Diamantopoulos and Beat Weiss and Christoph Hagleitner
and Dietmar Fey",
title = "Advancing Compilation of {DNNs} for {FPGAs} Using
Operation Set Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2022.3227643",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Lee:2023:HHF,
author = "Seonho Lee and Ranggi Hwang and Jongse Park and Minsoo
Rhu",
title = "{HAMMER}: Hardware-Friendly Approximate Computing for
Self-Attention With Mean-Redistribution and
Linearization",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "1",
pages = "13--16",
month = jan # "\slash " # jun,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2022.3233832",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Bae:2023:ISF,
author = "Hanyeoreum Bae and Donghyun Gouk and Seungjun Lee and
Jiseon Kim and Sungjoon Koh and Jie Zhang and Myoungsoo
Jung",
title = "Intelligent {SSD} Firmware for Zero-Overhead
Journaling",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "1",
pages = "25--28",
month = jan # "\slash " # jun,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3243695",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Zhao:2023:RAL,
author = "Xia Zhao and Guangda Zhang and Lu Wang and Yangmei Li
and Yongjun Zhang",
title = "{RouteReplies}: Alleviating Long Latency in
Many-Chip-Module {GPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "1",
pages = "29--32",
month = jan # "\slash " # jun,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3255555",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Weston:2023:SLI,
author = "Kevin Weston and Farabi Mahmud and Vahid Janfaza and
Abdullah Muzahid",
title = "{SmartIndex}: Learning to Index Caches to Improve
Performance",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "1",
pages = "33--36",
month = jan # "\slash " # jun,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3264478",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Khoram:2023:EEB,
author = "Soroosh Khoram and Kyle Daruwalla and Mikko Lipasti",
title = "Energy-Efficient {Bayesian} Inference Using Bitstream
Computing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "1",
pages = "37--40",
month = jan # "\slash " # jun,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3238584",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Brana:2023:KSC,
author = "Jennifer Brana and Brian C. Schwedock and Yatin A.
Manerkar and Nathan Beckmann",
title = "{Kobold}: Simplified Cache Coherence for
Cache-Attached Accelerators",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "1",
pages = "41--44",
month = jan # "\slash " # jun,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3269399",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Jeon:2023:HAR,
author = "Kiseok Jeon and Junghee Lee and Bumsoo Kim and James
J. Kim",
title = "Hardware Accelerated Reusable {Merkle} Tree Generation
for Bitcoin Blockchain Headers",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "69--72",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3289515",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Lee:2023:CDC,
author = "Hwanjun Lee and Seunghak Lee and Yeji Jung and Daehoon
Kim",
title = "{T-CAT}: Dynamic Cache Allocation for Tiered Memory
Systems With Memory Interleaving",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "73--76",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3290197",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Jeong:2023:LLA,
author = "Ipoom Jeong and Jiaqi Lou and Yongseok Son and Yongjoo
Park and Yifan Yuan and Nam Sung Kim",
title = "{LADIO}: Leakage-Aware Direct {I/O} for
{I/O}-Intensive Workloads",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "77--80",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3290427",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Deshpande:2023:TPB,
author = "Chandana S. Deshpande and Arthur Perais and
Fr{\'e}d{\'e}ric P{\'e}trot",
title = "Toward Practical 128-Bit General Purpose
Microarchitectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "81--84",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3287762",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Intel introduced the 5-level paging mode to support a 57-bit
virtual address space in 2017. This, coupled with
paradigms in which backup storage can be accessed through
load and store instructions (e.g., non-volatile
memories), lets us envision a future in which a 64-bit
address space has become insufficient. In that event,
the straightforward solution would be to adopt a flat
128-bit address space. In this early-stage letter, we
conduct high-level experiments that lead us to suggest
a possible general-purpose processor microarchitecture
providing 128-bit support with limited hardware cost.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Tzenetopoulos:2023:DLD,
author = "Achilleas Tzenetopoulos and Dimosthenis Masouros and
Dimitrios Soudris and Sotirios Xydis",
title = "{DVFaaS}: Leveraging {DVFS} for {FaaS} Workflows",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "85--88",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3288089",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Nam:2023:XRD,
author = "Hwayong Nam and Seungmin Baek and Minbok Wi and
Michael Jaemin Kim and Jaehyun Park and Chihun Song and
Nam Sung Kim and Jung Ho Ahn",
title = "{X}-ray: Discovering {DRAM} Internal Structure and
Error Characteristics by Issuing Memory Commands",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "89--92",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3296153",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The demand for accurate information about the internal
structure and characteristics of DRAM has been on the
rise. Recent studies have explored the structure and
characteristics of DRAM to improve processing in
memory, enhance reliability, and mitigate a
vulnerability known as rowhammer. However, DRAM
manufacturers only disclose limited information through
official documents, making it difficult to find
specific information about actual DRAM devices. This
paper presents reliable findings on the internal
structure and characteristics of DRAM using
activate-induced bitflips (AIBs), retention time tests,
and row-copy operations. While previous studies have
attempted to understand the internal behaviors of DRAM
devices, they have only shown results without
identifying the causes or have analyzed DRAM modules
rather than individual chips. We first uncover the
size, structure, and operation of DRAM subarrays and
verify our findings on the characteristics of DRAM.
Then, we correct misunderstood information related to
AIBs and demonstrate experimental results supporting
the cause of rowhammer.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Nematallah:2023:ELS,
author = "Ahmed Nematallah and Chang Hyun Park and David
Black-Schaffer",
title = "Exploring the Latency Sensitivity of Cache Replacement
Policies",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "93--96",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3296251",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Mosquera:2023:GCC,
author = "Fernando Mosquera and Krishna Kavi and Gayatri Mehta
and Lizy John",
title = "Guard Cache: Creating Noisy Side-Channels",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "97--100",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3289710",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Mars:2023:JPP,
author = "Jason Mars and Yiping Kang and Roland Daynauth and
Baichuan Li and Ashish Mahendra and Krisztian Flautner
and Lingjia Tang",
title = "The {Jaseci} Programming Paradigm and Runtime Stack:
Building Scale-Out Production Applications Easy and
Fast",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "101--104",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3274038",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Hossain:2023:SDA,
author = "Naorin Hossain and Alper Buyuktosunoglu and John-David
Wellman and Pradip Bose and Margaret Martonosi",
title = "{SoCurity}: a Design Approach for Enhancing {SoC}
Security",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "105--108",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3301448",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Feng:2023:SOW,
author = "Justin Feng and Fatemeh Arkannezhad and Christopher
Ryu and Enoch Huang and Siddhant Gupta and Nader
Sehatbakhsh",
title = "Simulating Our Way to Safer Software: a Tale of
Integrating Microarchitecture Simulation and Leakage
Estimation Modeling",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "109--112",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3303913",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Wed Sep 13 17:35:03 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Choi:2023:UPP,
author = "Jaewan Choi and Jaehyun Park and Kwanhee Kyung and Nam
Sung Kim and Jung Ho Ahn",
title = "Unleashing the Potential of {PIM}: Accelerating Large
Batched Inference of Transformer-Based Generative
Models",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "113--116",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3305386",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "attention; Computational modeling; Context modeling;
Decoding; Matrix converters; Memory management;
processing-in-memory; Throughput; Transformer-based
generative model; Transformers",
}
@Article{Kim:2023:HAC,
author = "Yonghae Kim and Anurag Kar and Jaewon Lee and Jaekyu
Lee and Hyesoon Kim",
title = "Hardware-Assisted Code-Pointer Tagging for
Forward-Edge Control-Flow Integrity",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "117--120",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3306326",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Authentication; Benchmark testing; CFI; Codes; CPT;
Hardware; memory safety; Prototypes; RISC-V BOOM;
Software; Tagging",
}
@Article{Saileshwar:2023:MBM,
author = "Gururaj Saileshwar and Moinuddin Qureshi",
title = "The Mirage of Breaking {MIRAGE}: Analyzing the
Modeling Pitfalls in Emerging Attacks on {MIRAGE}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "121--124",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3297875",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Analytical models; Cache side-channel attacks;
Ciphers; Codes; Computer bugs; Indexing; randomized
caches; Security; Side-channel attacks",
}
@Article{Lo:2023:LLV,
author = "Yun-Chen Lo and Yu-Chih Tsai and Ren-Shuo Liu",
title = "{LV}: Latency-Versatile Floating-Point Engine for
High-Performance Deep Neural Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "125--128",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3287096",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Adders; Approximate computation; Artificial neural
networks; Clocks; Computer architecture; Electric
breakdown; Engines; floating point; latency-versatile
architecture; Registers",
}
@Article{Goudarzi:2023:SBP,
author = "Maziar Goudarzi and Reza Azimi and Julian Humecki and
Faizaan Rehman and Richard Zhang and Chirag Sethi and
Tanishq Bomman and Yuqi Yang",
title = "By-Software Branch Prediction in Loops",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "129--132",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3304613",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "C.0.b hardware/software interfaces; C.1.1.b pipeline
processors; C.1.5.a instruction fetch; Codes; D.3.4.b
compilers; Hardware; Monitoring; Optimization; Program
processors; Software; Target tracking",
}
@Article{Yun:2023:FPP,
author = "Yugyoung Yun and Eunhyeok Park",
title = "Fast Performance Prediction for Efficient Distributed
{DNN} Training",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "133--136",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3316452",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "3D parallelism; Costs; Distributed training; large
language model; Optimization; Parallel processing;
Performance evaluation; performance modeling; Tensors;
Throughput; Training",
}
@Article{Wu:2023:CUD,
author = "Meng Wu and Mingyu Yan and Xiaocheng Yang and Wenming
Li and Zhimin Zhang and Xiaochun Ye and Dongrui Fan",
title = "Characterizing and Understanding Defense Methods for
{GNNs} on {GPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "137--140",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3304638",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "defense; Electric breakdown; Estimation; execution
pattern; execution semantic; Graph neural networks;
Graphics processing units; Kernel; overhead;
Perturbation methods; Purification; Training",
}
@Article{Patel:2023:TIP,
author = "Pratyush Patel and Zibo Gong and Syeda Rizvi and Esha
Choukse and Pulkit Misra and Thomas Anderson and
Akshitha Sriraman",
title = "Towards Improved Power Management in Cloud {GPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "141--144",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3278652",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Clocks; Cloud computing; design for power delivery
limits; Graphics processing units; graphics processors;
Monitoring; Performance evaluation; Power management;
Power system management; servers; Servers; super (very
large) computers",
}
@Article{Zhang:2023:BPA,
author = "Shiqing Zhang and Mahmood Naderan-Tahan and Magnus
Jahre and Lieven Eeckhout",
title = "Balancing Performance Against Cost and Sustainability
in Multi-Chip-Module {GPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "145--148",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3313203",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Aggregates; Bandwidth; Costs; Graphics processing
units; Manufacturing; Sustainable development;
Switches",
}
@Article{Park:2023:DHP,
author = "Chanyoung Park and Chun-Yi Liu and Kyungtae Kang and
Mahmut Kandemir and Wonil Choi",
title = "Design of a High-Performance, High-Endurance Key-Value
{SSD} for Large-Key Workloads",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "149--152",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3282276",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Blogs; Data structures; Key-value SSD; large-key
workloads; Micromechanical devices; Performance
evaluation; Random access memory; Social networking
(online); Tail",
}
@Article{Liu:2023:ILG,
author = "Jie Liu and Zhongyuan Zhao and Zijian Ding and
Benjamin Brock and Hongbo Rong and Zhiru Zhang",
title = "An Intermediate Language for General Sparse Format
Customization",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "153--156",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3262610",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Codes; Compilers; Hardware; heterogeneous (hybrid)
systems; Indexes; Kernel; Layout; Metadata; sparse
linear algebra; specialized application languages;
Tensors",
}
@Article{Lee:2023:NPR,
author = "Seunghak Lee and Ki-Dong Kang and Gyeongseo Park and
Nam Sung Kim and Daehoon Kim",
title = "{NoHammer}: Preventing Row Hammer With Last-Level
Cache Management",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "157--160",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3320670",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Degradation; DRAM; Indexes; Last-level cache
management; Memory management; Proposals; Random access
memory; reliability; Reverse engineering; row hammer;
Threat modeling",
}
@Article{Escofet:2023:HQA,
author = "Pau Escofet and Anabel Ovide and Carmen G. Almudever
and Eduard Alarc{\'o}n and Sergi Abadal",
title = "{Hungarian} Qubit Assignment for Optimized Mapping of
Quantum Circuits on Multi-Core Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "161--164",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3318857",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; Computers; Costs; Logic gates;
Mapping of quantum algorithms; multi-core quantum
computing architectures; Partitioning algorithms;
Quantum computing; quantum computing; Qubit",
}
@Article{Lu:2023:FEA,
author = "Lingfei Lu and Yudi Qiu and Shiyan Yi and Yibo Fan",
title = "A Flexible Embedding-Aware Near Memory Processing
Architecture for Recommendation System",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "165--168",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3305668",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Computer architecture; data partition;
Fans; Kernel; near memory processing; Random access
memory; Recommendation system; Recommender systems;
Social networking (online)",
}
@Article{Li:2023:HFT,
author = "Hailong Li and Jaewan Choi and Yongsuk Kwon and Jung
Ho Ahn",
title = "A Hardware-Friendly Tiled Singular-Value
Decomposition-Based Matrix Multiplication for
Transformer-Based Models",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "169--172",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3323482",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computational modeling; GPU; Graphics processing
units; Kernel; Matrix decomposition; Natural language
processing; Task analysis; tiled singular vector
decomposition; Transformer-based model; Transformers",
}
@Article{Hastings:2023:ASR,
author = "Adam Hastings and Ryan Piersma and Simha
Sethumadhavan",
title = "Architectural Security Regulation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "173--176",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3327952",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Costs; Games; Government; Modeling techniques;
Regulation; Regulators; Safety; Security; security
regulation; support for security",
}
@Article{Trochatos:2023:QCT,
author = "Theodoros Trochatos and Chuanqi Xu and Sanjay
Deshpande and Yao Lu and Yongshan Ding and Jakub
Szefer",
title = "A Quantum Computer Trusted Execution Environment",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "177--180",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3325852",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Attenuation; Cloud computing; cloud computing;
Computer security; control pulses; Cryptography;
dilution refrigerator; Hardware; Logic gates;
obfuscation; Quantum computing; quantum computing;
Qubit; RF switches",
}
@Article{Wu:2023:RAI,
author = "Peiyun Wu and Trung Le and Zhichun Zhu and Zhao
Zhang",
title = "Redundant Array of Independent Memory Devices",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "181--184",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3334989",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Data transfer; Error correction codes; Layout; Memory
management; Memory systems; mini-rank; multi-bit
errors; Organizations; parity; Performance evaluation;
redundant array; Standards organizations",
}
@Article{Garcia-Mallen:2023:TAD,
author = "Jonathan Garcia-Mallen and Shuohao Ping and Alex
Miralles-Cordal and Ian Martin and Mukund Ramakrishnan
and Yipeng Huang",
title = "Towards an Accelerator for Differential and Algebraic
Equations Useful to Scientists",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "22",
number = "2",
pages = "185--188",
month = jul # "\slash " # dec,
year = "2023",
DOI = "https://doi.org/10.1109/LCA.2023.3332318",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Convergence; Differential equations; Field
programmable gate arrays; Hardware; Hyperbolic
equations; Iterative methods; iterative methods;
reconfigurable hardware; Registers; Scientific
computing",
}
@Article{Vieira:2024:GAP,
author = "Jo{\~a}o Vieira and Nuno Roma and Gabriel Falcao and
Pedro Tom{\'a}s",
title = "{gem5-accel}: a Pre-{RTL} Simulation Toolchain for
Accelerator Architecture Validation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2023.3329443",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "accelerator modeling; Central Processing Unit;
complete system emulation; Computer architecture;
Hardware acceleration; Kernel; Process control; Random
access memory; Registers; Simulation toolchain",
}
@Article{Gheibi-Fetrat:2024:TTF,
author = "Atiyeh Gheibi-Fetrat and Negar Akbarzadeh and Shaahin
Hessabi and Hamid Sarbazi-Azad",
title = "{Tulip}: Turn-Free Low-Power Network-on-Chip",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2023.3339646",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "area; Chip Multiprocessor (CMP); crossbar; Integrated
circuits; Mesh networks; Network topology;
Network-on-chip; Network-on-Chip (NoC); power
consumption; router; Routing; System recovery;
System-on-Chip (SoC); Topology",
}
@Article{Ueno:2024:ITB,
author = "Yosuke Ueno and Yuna Tomida and Teruo Tanimoto and
Masamitsu Tanaka and Yutaka Tabuchi and Koji Inoue and
Hiroshi Nakamura",
title = "Inter-Temperature Bandwidth Reduction in Cryogenic
{QAOA} Machines",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2023.3322700",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Computer architecture; cryogenic
electronics; Cryogenics; Logic gates; Quantum
computing; quantum computing; Qubit; qubit;
Superconducting cables; superconducting logic
circuits",
}
@Article{Kim:2024:FAD,
author = "Hyeseong Kim and Yunjae Lee and Minsoo Rhu",
title = "{FPGA}-Accelerated Data Preprocessing for Personalized
Recommendation Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "7--10",
month = jan # "\slash " # jun,
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2023.3336841",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Data models; Data preprocessing; data preprocessing;
Feature extraction; FPGA; Graphics processing units;
Personalized recommendation system; Servers;
Throughput; training; Training",
}
@Article{Peltekis:2024:DDM,
author = "Christodoulos Peltekis and Vasileios Titopoulos and
Chrysostomos Nicopoulos and Giorgos Dimitrakopoulos",
title = "{DeMM}: a Decoupled Matrix Multiplication Engine
Supporting Relaxed Structured Sparsity",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3355178",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computational modeling; Engines; Hardware; Indexes;
Machine learning accelerator; matrix-multiplication
engine; Organizations; Sparse matrices; structured
sparsity; Systolic arrays; systolic computation",
}
@Article{Corontzos:2024:DCD,
author = "Caden Corontzos and Eitan Frachtenberg",
title = "Direct-Coding {DNA} With Multilevel Parallelism",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "21--24",
month = jan # "\slash " # jun,
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3355109",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Decoding; DNA; DNA encoding; Encoding; Genomics;
Instruction sets; parallel architectures; Random access
memory; Throughput",
}
@Article{Ayanzadeh:2024:ERR,
author = "Ramin Ayanzadeh and Moinuddin Qureshi",
title = "Enhancing the Reach and Reliability of Quantum
Annealers by Pruning Longer Chains",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "25--28",
month = jan # "\slash " # jun,
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2023.3340030",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Adiabatic quantum computing; Annealing; Computers;
embedding; Hardware; power-law; quantum annealers;
Quantum annealing; Quantum circuit; Quantum computing;
Qubit",
}
@Article{Golden:2024:SVV,
author = "Courtney Golden and Dan Ilan and Caroline Huang and
Niansong Zhang and Zhiru Zhang and Christopher Batten",
title = "Supporting a Virtual Vector Instruction Set on a
Commercial Compute-in-{SRAM} Accelerator",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "29--32",
month = jan # "\slash " # jun,
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2023.3341389",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; hardware/software interfaces;
In-memory computing; Instruction sets; Latches;
Microarchitecture; Process control; Programming;
Registers",
}
@Article{Thomas:2024:BMT,
author = "Samuel Thomas and Kidus Workneh and Ange-Thierry
Ishimwe and Zack McKevitt and Phaedra Curlin and R.
Iris Bahar and Joseph Izraelevitz and Tamara Lehman",
title = "Baobab {Merkle} Tree for Efficient Secure Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "33--36",
month = jan # "\slash " # jun,
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3360709",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Encryption; encryption; Indexes;
integrity; Memory management; Metadata; Protocols;
secure memory; Security; System-on-chip",
}
@Article{Cho:2024:EEA,
author = "Minsik Cho and Keivan A. Vahid and Qichen Fu and
Saurabh Adya and Carlo C. {Del Mundo} and Mohammad
Rastegari and Devang Naik and Peter Zatloukal",
title = "{eDKM}: an Efficient and Accurate Train-Time Weight
Clustering for Large Language Models",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "37--40",
month = jan # "\slash " # jun,
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3363492",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "artificial intelligence; Complexity theory;
Computational and artificial intelligence; deep
learning; Graphics processing units; Indexes; learning
systems; machine learning; Memory; Optimization;
Sharding; Tensors",
}
@Article{Kim:2024:ADR,
author = "Yang-Gon Kim and Yun-Ki Han and Jae-Kang Shin and
Jun-Kyum Kim and Lee-Sup Kim",
title = "Accelerating Deep Reinforcement Learning via
Phase-Level Parallelism for Robotics Applications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "41--44",
month = jan # "\slash " # jun,
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2023.3341152",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Backpropagation; Computer systems organization;
Graphics processing units; Hardware; Legged locomotion;
mobile computing; neural nets; Reinforcement learning;
Robots; Training",
}
@Article{Yang:2024:JIJ,
author = "Yuxin Yang and Xiaoming Chen and Yinhe Han",
title = "{JANM-IK}: {Jacobian} Argumented {Nelder--Mead}
Algorithm for Inverse Kinematics and its Hardware
Acceleration",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "45--48",
month = jan # "\slash " # jun,
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3369940",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "accelerator; Convergence; End effectors; Field
programmable gate arrays; inverse kinematics; Jacobian;
Jacobian matrices; Kinematics; nelder-mead;
Perturbation methods; Robotics; Robots;
software-hardware co-design",
}
@Article{Hafezan:2024:IEE,
author = "Mohammad Hafezan and Ehsan Atoofian",
title = "Improving Energy-Efficiency of Capsule Networks on
Modern {GPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "49--52",
month = jan # "\slash " # jun,
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3365149",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "CapsNet; Computer architecture; energy-efficiency;
GPU; Graphics processing units; Hidden Markov models;
Instruction sets; Matrix converters; Registers; tensor
core; Vectors",
}
@Article{Nagabhiru:2024:AFP,
author = "Mahita Nagabhiru and Gregory T. Byrd",
title = "Achieving Forward Progress Guarantee in Small Hardware
Transactions",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "53--56",
month = jan # "\slash " # jun,
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3370992",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Atomics; Coherence; compare-and-swap; concurrency;
Data structures; forward progress; Hardware; hardware
transactional memory; Instruction sets; lock-free;
multi-word-compare-and-swap; multithreading;
non-blocking; Programming; Protocols; Software",
}
@Article{Ma:2024:PFA,
author = "Rui Ma and Jia-Ching Hsu and Ali Mansoorshahi and
Joseph Garvey and Michael Kinsner and Deshanand Singh
and Derek Chiou",
title = "{Primate}: a Framework to Automatically Generate Soft
Processors for Network Applications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "57--60",
month = jan # "\slash " # jun,
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3358839",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Codes; Design methodology; domain-specific
accelerators; Field programmable gate arrays;
flexibility; Libraries; programmability; Registers;
Software; Throughput; VLIW",
}
@Article{France:2024:RSA,
author = "Lo{\"\i}c France and Florent Bruguier and David Novo
and Maria Mushtaq and Pascal Benoit",
title = "Reducing the Silicon Area Overhead of Counter-Based
Rowhammer Mitigations",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "61--64",
month = jan # "\slash " # jun,
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2023.3328824",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Capacitors; Computer security; DRAM; Proposals; Random
access memory; rowhammer; Security; Silicon; Timing;
Transistors",
}
@Article{Yavits:2024:DCD,
author = "L. Yavits",
title = "{DRAMA}: Commodity {DRAM} Based Content Addressable
Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "65--68",
month = jan # "\slash " # jun,
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2023.3341830",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "CAM; DNA; DRAM; Hardware; Humanities; Random access
memory; Three-dimensional displays; Timing; Voltage",
}
@Article{Mishra:2024:ASA,
author = "Deepanjali Mishra and Konstantinos Kanellopoulos and
Ashish Panwar and Akshitha Sriraman and Vivek Seshadri
and Onur Mutlu and Todd C. Mowry",
title = "Address Scaling: Architectural Support for
Fine-Grained Thread-Safe Metadata Management",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "69--72",
month = jan # "\slash " # jun,
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3373760",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Complexity theory; Computer bugs; Data structures;
dynamic program monitoring tools; Hardware;
intermediate address space; Metadata; metadata
management; Monitoring; Synthetic aperture sonar;
Virtual memory",
}
@Article{Shin:2024:CMR,
author = "Changmin Shin and Taehee Kwon and Jaeyong Song and Jae
Hyung Ju and Frank Liu and Yeonkyu Choi and Jinho Lee",
title = "A Case for In-Memory Random Scatter--Gather for Fast
Graph Processing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "73--77",
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3376680",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 2 08:20:13 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accelerator architectures; Bandwidth; Computer
architecture; in-memory computing; memory architecture;
Memory management; parallel processing; Protocols;
Random access memory; random access memory; Random
sequences; Standards",
}
@Article{Eeckhout:2024:RPG,
author = "Lieven Eeckhout",
title = "{R.I.P.} Geomean Speedup Use Equal-Work (Or
Equal-Time) Harmonic Mean Speedup Instead",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "78--82",
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3361925",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 2 08:20:13 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Arithmetic; Average; Benchmark testing; Computer
architecture; Harmonic analysis; Measurement;
performance metrics; Research and development; speedup;
Workstations",
}
@Article{Jahshan:2024:MMB,
author = "Z. Jahshan and L. Yavits",
title = "{MajorK}: Majority Based kmer Matching in Commodity
{DRAM}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "83--86",
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3384259",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 2 08:20:13 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "$K$ mer matching; Bioinformatics; Computer
architecture; Databases; DNA; DRAM; genome
classification; Genomics; Microprocessors; Random
access memory",
}
@Article{Yi:2024:GSM,
author = "Shiyan Yi and Yudi Qiu and Lingfei Lu and Guohao Xu
and Yong Gong and Xiaoyang Zeng and Yibo Fan",
title = "{GATe}: Streamlining Memory Access and Communication
to Accelerate Graph Attention Network With Near-Memory
Processing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "87--90",
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3386734",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 2 08:20:13 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "DRAM; Fans; Graph attention network; Logic gates;
Mathematical models; near memory processing;
Optimization; Random access memory; Social networking
(online); Vectors",
}
@Article{Sasmal:2024:AMD,
author = "Mrinmay Sasmal and Tresa Joseph and Bindiya T. S.",
title = "Approximate Multiplier Design With {LFSR}-Based
Stochastic Sequence Generators for Edge {AI}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "91--94",
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3379002",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 2 08:20:13 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "approximate multiplier (AM); Artificial neural
networks; Computer architecture; Generators; Hardware;
linear feedback shift register (LFSR); Long short term
memory; long short term memory (LSTM); matrix vector
multiplier (MVM); Neural networks; Stochastic
processes; Streams",
}
@Article{Gohil:2024:IGM,
author = "Varun Gohil and Sundar Dev and Gaurang Upasani and
David Lo and Parthasarathy Ranganathan and Christina
Delimitrou",
title = "The Importance of Generalizability in Machine Learning
for Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "95--98",
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3384449",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 2 08:20:13 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Bayes methods; Bayesian neural networks;
Computational modeling; Data models; generalizability;
Internet; machine learning for systems; Predictive
models; Uncertainty; uncertainty estimation",
}
@Article{Agarwal:2024:UTU,
author = "Nikhil Agarwal and Mitchell Fream and Souradip Ghosh
and Brian C. Schwedock and Nathan Beckmann",
title = "{UDIR}: Towards a Unified Compiler Framework for
Reconfigurable Dataflow Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "99--103",
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2023.3342130",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 2 08:20:13 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Codes; Computer architecture; Dataflow; Hardware;
intermediate representation; Optimization; Program
processors; reconfigurable architectures; Semantics;
Synchronization",
}
@Article{Tsantikidou:2024:AEA,
author = "Kyriaki Tsantikidou and Nicolas Sklavos",
title = "An Area Efficient Architecture of a Novel Chaotic
System for High Randomness Security in e-Health",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "104--107",
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3387352",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu May 2 08:20:13 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Area efficient architecture; Chaotic communication;
chaotic system; Ciphers; Computer architecture;
e-health; Encryption; high randomness; key scheduling;
NIST; Protocols; Security; security; stream cipher",
}
@Article{Park:2024:DND,
author = "Yongmo Park and Subhankar Pal and Aporva Amarnath and
Karthik Swaminathan and Wei D. Lu and Alper
Buyuktosunoglu and Pradip Bose",
title = "{Dramaton}: a Near-{DRAM} Accelerator for Large Number
Theoretic Transforms",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "108--111",
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3381452",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Sat Aug 24 09:55:05 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Cryptography; Geometry; Hardware; hardware
accelerators; Layout; near-DRAM processing; number
theoretic transform; Parallel processing; Post-quantum
cryptography; Random access memory; Transforms",
}
@Article{Luo:2024:RMM,
author = "Haocong Luo and Yahya Can Tu{\u{g}}rul and F. Nisa
Bostanc{\i} and Ataberk Olgun and A. Giray
Ya{\u{g}}l{\i}k{\c{c}}{\i} and Onur Mutlu",
title = "{Ramulator 2.0}: a Modern, Modular, and Extensible
{DRAM} Simulator",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "112--116",
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2023.3333759",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Sat Aug 24 09:55:05 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "C++ languages; Codes; Computer architecture; computer
simulation systems engineering and theory; computers
and information processing; DRAM systems engineering
and theory; Extensibility; memory; memory architecture
computers and information processing; memory management
computers and information processing; modeling;
Organizations; Random access memory; random access
memory; scalability; simulation; Software architecture;
system analysis and design; system simulation systems
engineering and theory; Timing",
}
@Article{Kim:2024:EIA,
author = "Hyungyo Kim and Gaohan Ye and Nachuan Wang and Amir
Yazdanbakhsh and Nam Sung Kim",
title = "Exploiting {Intel Advanced Matrix Extensions (AMX)}
for Large Language Model Inference",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "117--120",
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3397747",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Sat Aug 24 09:55:05 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "advance matrix extensions; Arithmetic; Computational
modeling; cooperative heterogeneous computing; Data
models; Data transfer; Graphics processing units; Large
language models; Memory management; Throughput",
}
@Article{Li:2024:TLV,
author = "Tianzheng Li and Enfang Cui and Yuting Wu and Qian Wei
and Yue Gao",
title = "{TeleVM}: a Lightweight Virtual Machine for {RISC-V}
Architecture",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "121--124",
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3394835",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Sat Aug 24 09:55:05 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; Hardware; Hypervisor;
Registers; RISC-V; Security; serverless; Virtual
machine monitors; Virtual machining; Virtualization;
virtualization",
}
@Article{Qi:2024:AIG,
author = "Yingjie Qi and Jianlei Yang and Ao Zhou and Tong Qiao
and Chunming Hu",
title = "Architectural Implications of {GNN} Aggregation
Programming Abstractions",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "125--128",
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2023.3326170",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Sat Aug 24 09:55:05 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "characterization; execution patterns; Graph neural
networks; Graph neural networks (GNNs); Graphics
processing units; Indexes; Kernel; Organizations;
Programming; programming abstractions; Taxonomy",
}
@Article{Khan:2024:EML,
author = "Asif Ali Khan and Fazal Hameed and Taha Shahroodi and
Alex K. Jones and Jeronimo Castrillon",
title = "Efficient Memory Layout for Pre-Alignment Filtering of
Long {DNA} Reads Using Racetrack Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "129--132",
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3350701",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Sat Aug 24 09:55:05 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bioinformatics; DNA; Domain wall memory; Filtering;
Filtering algorithms; Genomics; Layout; near memory
computing; racetrack memory; sequence alignment;
Sequential analysis",
}
@Article{Maji:2024:SCP,
author = "Saurav Maji and Kyungmi Lee and Anantha P.
Chandrakasan",
title = "{SparseLeakyNets}: Classification Prediction Attack
Over Sparsity-Aware Embedded Neural Networks Using
Timing Side-Channel Information",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "133--136",
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2024.3397730",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Sat Aug 24 09:55:05 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Architectural attacks; Arrhythmia; Artificial neural
networks; classification prediction; Data mining;
Electrocardiography; Hardware; neural networks;
side-channel attacks; System-on-chip; Timing; timing
side-channel",
}
@Article{Rezaei:2024:SMD,
author = "Seyyed Hossein SeyyedAghaei Rezaei and Parham
Zilouchian Moghaddam and Mehdi Modarressi",
title = "Smart Memory: Deep Learning Acceleration in
{3D}-Stacked Memories",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "23",
number = "1",
pages = "137--141",
year = "2024",
DOI = "https://doi.org/10.1109/LCA.2023.3287976",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Sat Aug 24 09:55:05 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "3D-stacked memory; Artificial neural networks;
Bandwidth; Computer architecture; deep learning
accelerator; Memory management; Network-on-memory;
processing-in-memory; Random access memory; Switches;
Three-dimensional displays",
}