%%% TeX helper macros emitted ahead of the bibliography.  \circled, \pkg
%%% and \reg are defined only when the document has not already defined
%%% them; \TM is defined unconditionally.  \circled puts its argument in
%%% parentheses and \reg expands to \circled{R}.
@Preamble{
    "\input bibnames.sty"
  # "\ifx \undefined \circled \def \circled #1{(#1)} \fi"
  # "\ifx \undefined \pkg \def \pkg #1{{{\tt #1}}} \fi"
  # "\ifx \undefined \reg \def \reg {\circled{R}} \fi"
  # "\def \TM {${}^{\sc TM}$}"
}
%%% Contact details string; the entries in this file reference it from
%%% their acknowledgement field.
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                    \path|beebe@acm.org|,
                    \path|beebe@computer.org| (Internet),
                    URL: \path|https://www.math.utah.edu/~beebe/|"}
%%% Full journal name; the entries in this file use it as their journal
%%% field value.
@String{j-TRETS = {ACM Transactions on Reconfigurable Technology
                   and Systems (TRETS)}}
@Article{Buell:2008:I,
  author          = {Duncan Buell and Wayne Luk},
  title           = {Introduction},
  journal         = j-TRETS,
  volume          = {1},
  number          = {1},
  pages           = {1:1--1:??},
  month           = mar,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1331897.1331898},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Nov 4 17:12:41 MST 2008},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {1},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{DeHon:2008:GET,
  author          = {Andr{\'e} DeHon and Mike Hutton},
  title           = {Guest Editorial: {TRETS} Special Edition on the {15th
                     International Symposium on FPGAs}},
  journal         = j-TRETS,
  volume          = {1},
  number          = {1},
  pages           = {2:1--2:??},
  month           = mar,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1331897.1341292},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Nov 4 17:12:41 MST 2008},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {2},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Matsumoto:2008:SID,
  author          = {Yohei Matsumoto and Masakazu Hioki and Takashi
                     Kawanami and Hanpei Koike and Toshiyuki Tsutsumi and
                     Tadashi Nakagawa and Toshihiro Sekigawa},
  title           = {Suppression of Intrinsic Delay Variation in {FPGAs}
                     using Multiple Configurations},
  journal         = j-TRETS,
  volume          = {1},
  number          = {1},
  pages           = {3:1--3:??},
  month           = mar,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1331897.1331899},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Nov 4 17:12:41 MST 2008},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {A new method for improving the timing yield of
                     field-programmable gate array (FPGA) devices affected
                     by intrinsic within-die variation is proposed. The
                     timing variation is reduced by selecting an appropriate
                     configuration for each chip from a set of independent
                     configurations, the critical paths of which do not
                     share the same circuit resources on the FPGA. In this
                     article, the actual method used to generate independent
                     multiple configurations by simply repeating the routing
                     phase is shown, along with the results of Monte Carlo
                     simulation with 10,000 samples. One simulation result
                     showed that the standard deviations of maximum critical
                     path delays are reduced by 28\% and 49\% for 10\% and
                     30\% V$_{th}$ variations ($ \sigma / \mu $),
                     respectively, with 10 independent configurations.
                     Therefore, the proposed method is especially effective
                     for larger V$_{th}$ variation and is expected to be
                     useful for suppressing the performance variation of
                     FPGAs due to the future increase of parameter
                     variation. Another simulation result showed that the
                     effectiveness of the proposed technique was saturated
                     at the use of 10 or more configurations because of the
                     degradation of the quality of the configurations.
                     Therefore, the use of 10 or fewer configurations is
                     reasonable.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {3},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
  keywords        = {configuration; FPGA; timing yield; within-die
                     variation},
}
@Article{Sivaswamy:2008:SAP,
  author          = {Satish Sivaswamy and Kia Bazargan},
  title           = {Statistical Analysis and Process Variation-Aware
                     Routing and Skew Assignment for {FPGAs}},
  journal         = j-TRETS,
  volume          = {1},
  number          = {1},
  pages           = {4:1--4:??},
  month           = mar,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1331897.1331900},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Nov 4 17:12:41 MST 2008},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {With constant scaling of process technologies, chip
                     design is becoming increasingly difficult due to
                     process variations. The FPGA community has only
                     recently started focusing on the effects of variations.
                     In this work we present a statistical analysis to
                     compare the effects of variations on designs mapped to
                     FPGAs and ASICs. We also present CAD and architecture
                     techniques to mitigate the impact of variations. First
                     we present a variation-aware router that optimizes
                     statistical criticality. We then propose a modification
                     to the clock network to deliver programmable skews to
                     different flip-flops. Finally, we combine the two
                     techniques and the result is a 9x reduction in yield
                     loss that translates to a 12\% improvement in timing
                     yield. When the desired timing yield is set to 99\%,
                     our combined statistical routing and skew assignment
                     technique results in a delay improvement of about 10\%
                     over a purely deterministic approach.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {4},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
  keywords        = {routing; skew assignment; statistical timing
                     analysis},
}
%%% The title and abstract of the next entry use the \reg macro, which
%%% this file's Preamble defines when the document has not.
@Article{Lu:2008:DCR,
  author          = {Shih-Lien L. Lu and Peter Yiannacouras and Taeweon Suh
                     and Rolf Kassa and Michael Konow},
  title           = {A Desktop Computer with a Reconfigurable
                     {Pentium\reg}},
  journal         = j-TRETS,
  volume          = {1},
  number          = {1},
  pages           = {5:1--5:??},
  month           = mar,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1331897.1331901},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Nov 4 17:12:41 MST 2008},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Advancements in reconfigurable technologies,
                     specifically FPGAs, have yielded faster, more
                     power-efficient reconfigurable devices with enormous
                     capacities. In our work, we provide testament to the
                     impressive capacity of recent FPGAs by hosting a
                     complete Pentium$^{\reg }$ in a single FPGA chip. In
                     addition we demonstrate how FPGAs can be used for
                     microprocessor design space exploration while
                     overcoming the tension between simulation speed, model
                     accuracy, and model completeness found in traditional
                     software simulator environments. Specifically, we
                     perform preliminary experimentation/prototyping with an
                     original Socket 7 based desktop processor system with
                     typical hardware peripherals running modern operating
                     systems such as Fedora Core 4 and Windows XP; however
                     we have inserted a Xilinx Virtex-4 in place of the
                     processor that should sit in the motherboard and have
                     used the Virtex-4 to host a complete version of the
                     Pentium$^{\reg }$ microprocessor (which consumes less
                     than half its resources). We can therefore apply
                     architectural changes to the processor and evaluate
                     their effects on the complete desktop system. We use
                     this FPGA-based emulation system to conduct preliminary
                     architectural experiments including growing the branch
                     target buffer and the level 1 caches. In addition, we
                     experimented with interfacing hardware accelerators
                     such as DES and AES engines which resulted in a 27x
                     speedup.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {5},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
  keywords        = {accelerator; architecture; emulator; exploration;
                     FPGA; model; operating system; Pentium processor;
                     reconfigurable; simulator},
}
@Article{Feng:2008:DEI,
  author          = {Wenyi Feng and Sinan Kaptanoglu},
  title           = {Designing Efficient Input Interconnect Blocks for
                     {LUT} Clusters Using Counting and Entropy},
  journal         = j-TRETS,
  volume          = {1},
  number          = {1},
  pages           = {6:1--6:??},
  month           = mar,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1331897.1331902},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Nov 4 17:12:41 MST 2008},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {In a cluster-based FPGA, the interconnect from
                     external routing tracks and cluster feedbacks to the
                     LUT inputs consumes significant area, and no consensus
                     has emerged among different implementations (e.g.,
                     1-level or 2-level). In this paper, we model this
                     interconnect as a unified input interconnect block
                     (IIB). We identify three types of IIBs and develop
                     general combinatorial techniques to count the number of
                     distinct functional configurations for them. We use
                     entropy, defined as the logarithm of this count, to
                     estimate an IIB's routing flexibility. This enables us
                     to analytically evaluate different IIBs without the
                     customary time-consuming place and route experiments.
                     We show that both depopulated 1-level IIBs and
                     VPR-style 2-level IIBs achieve high routing flexibility
                     but lack area efficiency. We propose a novel class of
                     highly efficient, yet still simple, IIBs that use
                     substantially fewer switches with only a small
                     degradation in routing flexibility. Experimental
                     results verify the routability of these IIBs, and
                     confirm that entropy is a good predictor of
                     routability.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {6},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
  keywords        = {cluster; counting; entropy; FPGAs; interconnect; LUT;
                     PLDs},
}
@Article{Wilton:2008:SDO,
  author          = {Steven J. E. Wilton and Chun Hok Ho and Bradley
                     Quinton and Philip H. W. Leong and Wayne Luk},
  title           = {A Synthesizable Datapath-Oriented Embedded {FPGA}
                     Fabric for Silicon Debug Applications},
  journal         = j-TRETS,
  volume          = {1},
  number          = {1},
  pages           = {7:1--7:??},
  month           = mar,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1331897.1331903},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Nov 4 17:12:41 MST 2008},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {We present an architecture for a synthesizable
                     datapath-oriented FPGA core that can be used to provide
                     post-fabrication flexibility to an SoC. Our
                     architecture is optimized for bus-based operations and
                     employs a directional routing architecture, which
                     allows it to be synthesized using standard ASIC design
                     tools and flows. The primary motivation for this
                     architecture is to provide an efficient mechanism to
                     support on-chip debugging. The fabric can also be used
                     to implement other datapath-oriented circuits such as
                     those needed in signal processing and
                     computation-intensive applications. We evaluate our
                     architecture using a set of benchmark circuits and
                     compare it to previous fabrics in terms of area, speed,
                     and power.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {7},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
  keywords        = {Field programmable gate array; integrated circuit;
                     silicon debug; system-on-chip},
}
@Article{Guneysu:2008:SPH,
  author          = {Tim G{\"u}neysu and Christof Paar and Jan Pelzl},
  title           = {Special-Purpose Hardware for Solving the Elliptic
                     Curve Discrete Logarithm Problem},
  journal         = j-TRETS,
  volume          = {1},
  number          = {2},
  pages           = {8:1--8:??},
  month           = jun,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1371579.1371580},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Nov 4 17:12:42 MST 2008},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {The resistance against powerful index-calculus attacks
                     makes Elliptic Curve Cryptosystems (ECC) an interesting
                     alternative to conventional asymmetric cryptosystems,
                     like RSA. Operands in ECC require significantly less
                     bits at the same level of security, resulting in a
                     higher computational efficiency compared to RSA. With
                     growing computational capabilities and continuous
                     technological improvements over the years, however, the
                     question of the security of ECC against attacks based
                     on special-purpose hardware arises. In this context,
                     recently emerged low-cost FPGAs demand for attention in
                     the domain of hardware-based cryptanalysis: the
                     extraordinary efficiency of modern programmable
                     hardware devices allow for a low-budget implementation
                     of hardware-based ECC attacks---without the requirement
                     of the expensive development of ASICs.\par
                     With focus on the aspect of cost-efficiency, this
                     contribution presents and analyzes an FPGA-based
                     architecture of an attack against ECC over prime
                     fields. A multi-processing hardware architecture for
                     Pollard's Rho method is described. We provide results
                     on actually used key lengths of ECC (128 bits and
                     above) and estimate the expected runtime for a
                     successful attack.\par
                     As a first result, currently used elliptic curve
                     cryptosystems with a security of 160 bit and above turn
                     out to be infeasible to break with available
                     computational and financial resources. However, some of
                     the security standards proposed by the Standards for
                     Efficient Cryptography Group (SECG) become subject to
                     attacks based on low-cost FPGAs.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {8},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
  keywords        = {cryptanalysis; discrete logarithm; elliptic curve
                     cryptosystem; Pollard's rho},
}
@Article{Jacob:2008:MBA,
  author          = {Arpith Jacob and Joseph Lancaster and Jeremy Buhler
                     and Brandon Harris and Roger D. Chamberlain},
  title           = {{Mercury BLASTP}: Accelerating Protein Sequence
                     Alignment},
  journal         = j-TRETS,
  volume          = {1},
  number          = {2},
  pages           = {9:1--9:??},
  month           = jun,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1371579.1371581},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Nov 4 17:12:42 MST 2008},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Large-scale protein sequence comparison is an
                     important but compute-intensive task in molecular
                     biology. BLASTP is the most popular tool for
                     comparative analysis of protein sequences. In recent
                     years, an exponential increase in the size of protein
                     sequence databases has required either exponentially
                     more running time or a cluster of machines to keep
                     pace. To address this problem, we have designed and
                     built a high-performance FPGA-accelerated version of
                     BLASTP, {\em Mercury BLASTP}. In this article, we
                     describe the architecture of the portions of the
                     application that are accelerated in the FPGA, and we
                     also describe the integration of these FPGA-accelerated
                     portions with the existing BLASTP software. We have
                     implemented Mercury BLASTP on a commodity workstation
                     with two Xilinx Virtex-II 6000 FPGAs. We show that the
                     new design runs 11--15 times faster than software
                     BLASTP on a modern CPU while delivering close to 99\%
                     identical results.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {9},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
  keywords        = {bioinformatics; biological sequence alignment},
}
@Article{Sedcole:2008:PYM,
  author          = {Pete Sedcole and Peter Y. K. Cheung},
  title           = {Parametric Yield Modeling and Simulations of {FPGA}
                     Circuits Considering Within-Die Delay Variations},
  journal         = j-TRETS,
  volume          = {1},
  number          = {2},
  pages           = {10:1--10:??},
  month           = jun,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1371579.1371582},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Nov 4 17:12:42 MST 2008},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Variations in the semiconductor fabrication process
                     results in differences in parameters between
                     transistors on the same die, a problem exacerbated by
                     lithographic scaling. Field-Programmable Gate Arrays
                     may be able to compensate for within-die delay
                     variability, by judicious use of reconfigurability.
                     This article presents two strategies for compensating
                     within-die stochastic delay variability by using
                     reconfiguration: reconfiguring the entire FPGA, and
                     relocating subcircuits within an FPGA. Analytical
                     models for the theoretical bounds on the achievable
                     gains are derived for both strategies and compared to
                     models for worst-case design as well as statistical
                     static timing analysis (SSTA). All models are validated
                     by comparison to circuit-level Monte Carlo simulations.
                     It is demonstrated that significant improvements in
                     circuit yield and timing are possible using SSTA alone,
                     and these improvements can be enhanced by employing
                     reconfiguration-based techniques.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {10},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
  keywords        = {delay; FPGA; modeling; process variation;
                     reconfiguration; statistical theory; within-die
                     variability; yield},
}
@Article{Gorjiara:2008:MDC,
  author          = {Bita Gorjiara and Mehrdad Reshadi and Daniel Gajski},
  title           = {Merged Dictionary Code Compression for {FPGA}
                     Implementation of Custom Microcoded {PEs}},
  journal         = j-TRETS,
  volume          = {1},
  number          = {2},
  pages           = {11:1--11:??},
  month           = jun,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1371579.1371583},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Nov 4 17:12:42 MST 2008},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Horizontal Microcoded Architecture (HMA) is a paradigm
                     for designing programmable high-performance processing
                     elements (PEs). However, it suffers from large code
                     size, which can be addressed by compression. In this
                     article, we study the code size of one of the new
                     HMA-based technologies called No-Instruction-Set
                     Computer (NISC). We show that NISC code size can be
                     several times larger than a typical RISC processor, and
                     we propose several low-overhead dictionary-based code
                     compression techniques to reduce its code size. Our
                     compression algorithm leverages the knowledge of
                     ``don't care'' values in the control words and can
                     reduce the code size by 3.3 times, on average. Despite
                     such good results, as shown in this article, these
                     compression techniques lead to poor FPGA
                     implementations because they require many on-chip RAMs.
                     To address this issue, we introduce an FPGA-aware
                     dictionary-based technique that uses the dual-port
                     feature of on-chip RAMs to reduce the number of
                     utilized block RAMs by half. Additionally, we propose
                     cascading two-levels of dictionaries for code size and
                     block RAM reduction of large programs. For an MP3
                     application, a merged, cascaded, three-dictionary
                     implementation reduces the number of utilized block
                     RAMs by 4.3 times (76\%) compared to a NISC without
                     compression. This corresponds to 20\% additional
                     savings over the best single level dictionary-based
                     compression.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {11},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
  keywords        = {dictionary based compression; FPGA; memory
                     optimization; microcoded architectures;
                     no-instruction-set computer},
}
@Article{Thomas:2008:MGR,
  author          = {David B. Thomas and Wayne Luk},
  title           = {Multivariate {Gaussian} Random Number Generation
                     Targeting Reconfigurable Hardware},
  journal         = j-TRETS,
  volume          = {1},
  number          = {2},
  pages           = {12:1--12:??},
  month           = jun,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1371579.1371584},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Nov 4 17:12:42 MST 2008},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/prng.bib;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {The multivariate Gaussian distribution is often used
                     to model correlations between stochastic time-series,
                     and can be used to explore the effect of these
                     correlations across $N$ time-series in Monte-Carlo
                     simulations. However, generating random correlated
                     vectors is an $ O(N^2) $ process, and quickly becomes a
                     computational bottleneck in software simulations. This
                     article presents an efficient method for generating
                     vectors in parallel hardware, using $N$ parallel
                     pipelined components to generate a new vector every $N$
                     cycles. This method maps well to the embedded block
                     RAMs and multipliers in contemporary FPGAs,
                     particularly as extensive testing shows that the
                     limited bit-width arithmetic does not reduce the
                     statistical quality of the generated vectors. An
                     implementation of the architecture in the Virtex-4
                     architecture achieves a 500MHz clock-rate, and can
                     support vector lengths up to 512 in the largest
                     devices. The combination of a high clock-rate and
                     parallelism provides a significant performance
                     advantage over conventional processors, with an
                     xc4vsx55 device at 500MHz providing a 200 times speedup
                     over an Opteron 2.6GHz using an AMD optimised BLAS
                     package. In a case study in Delta-Gamma Value-at-Risk,
                     an RC2000 accelerator card using an xc4vsx55 at 400MHz
                     is 26 times faster than a quad Opteron 2.6GHz SMP.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {12},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
  keywords        = {FPGA; multivariate Gaussian distribution; random
                     numbers},
}
@Article{Lamoureux:2008:TBP,
  author          = {Julien Lamoureux and Steven J. E. Wilton},
  title           = {On the trade-off between power and flexibility of
                     {FPGA} clock networks},
  journal         = j-TRETS,
  volume          = {1},
  number          = {3},
  pages           = {13:1--13:??},
  month           = sep,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1391732.1391733},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Nov 4 17:12:44 MST 2008},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {FPGA clock networks consume a significant amount of
                     power, since they toggle every clock cycle and must be
                     flexible enough to implement the clocks for a wide
                     range of different applications. The efficiency of FPGA
                     clock networks can be improved by reducing this
                     flexibility; however, reducing the flexibility
                     introduces stricter constraints during the clustering
                     and placement stages of the FPGA CAD flow. These
                     constraints can reduce the overall efficiency of the
                     final implementation. This article examines the
                     trade-off between the power consumption and flexibility
                     of FPGA clock networks.\par
                     Specifically, this article makes three contributions.
                     First, it presents a new parameterized clock-network
                     framework for describing and comparing FPGA clock
                     networks. Second, it describes new clock-aware
                     placement techniques that are needed to find a legal
                     placement satisfying the constraints imposed by the
                     clock network. Finally, it performs an empirical study
                     to examine the trade-off between the power consumption
                     of the clock network and the impact of the CAD
                     constraints for a number of different clock networks
                     with varying amounts of flexibility.\par
                     The results show that the techniques used to produce a
                     legal placement can have a significant influence on
                     power and the ability of the placer to find a legal
                     solution. On average, circuits placed using the most
                     effective techniques dissipate 5\% less overall energy
                     and are significantly more likely to be legal than
                     circuits placed using other techniques. Moreover, the
                     results show that the architecture of the clock network
                     is also important. On average, FPGAs with an efficient
                     clock network are up to 14.6\% more energy efficient
                     compared to other FPGAs.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {13},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
  keywords        = {clock distribution networks; clock-aware placement;
                     FPGA; low-power design},
}
@Article{Slogsnat:2008:OSH,
  author          = {David Slogsnat and Alexander Giese and Mondrian
                     N{\"u}ssle and Ulrich Br{\"u}ning},
  title           = {An open-source {HyperTransport} core},
  journal         = j-TRETS,
  volume          = {1},
  number          = {3},
  pages           = {14:1--14:??},
  month           = sep,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1391732.1391734},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Nov 4 17:12:44 MST 2008},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {This article presents the design of a generic
                     HyperTransport (HT) core. HyperTransport is a
                     packet-based interconnect technology for low-latency,
                     high-bandwidth point-to-point connections. It is
                     specially optimized to achieve a very low latency. The
                     core has been verified in system using an FPGA. This
                     exhaustive verification and the generic design allow
                     the mapping to both ASICs and FPGAs. The implementation
                     described in this work supports a 16-bit link width, as
                     used by Opteron processors. On a Xilinx Virtex-4 FX60,
                     the core supports a link frequency of 400 MHz DDR and
                     offers a maximum bidirectional bandwidth of 3.2GB/s.
                     The in-system verification has been performed using a
                     custom FPGA board that has been plugged into a
                     HyperTransport extension connector (HTX) of a standard
                     Opteron-based motherboard. HTX slots in Opteron-based
                     motherboards allow very high-bandwidth, low-latency
                     communication, since the HTX device is directly
                     connected to one of the HyperTransport links of the
                     processor. Performance analysis shows a unidirectional
                     payload bandwidth of 1.4GB/s and a read latency of 180
                     ns. The HT core in combination with the HTX board is an
                     ideal base for prototyping systems and implementing
                     FPGA coprocessors. The HT core is available as open
                     source.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {14},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
  keywords        = {FPGA; HTX; HyperTransport; prototyping; RTL},
}
@Article{Beeckler:2008:PGR,
  author          = {John S. Beeckler and Warren J. Gross},
  title           = {Particle graphics on reconfigurable hardware},
  journal         = j-TRETS,
  volume          = {1},
  number          = {3},
  pages           = {15:1--15:??},
  month           = sep,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1391732.1391735},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Nov 4 17:12:44 MST 2008},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Particle graphics simulations are well suited for
                     modeling complex phenomena such as water, cloth,
                     explosions, fire, smoke, and clouds. They are normally
                     realized in software as part of an interactive graphics
                     application. The computational complexity of particle
                     graphics simulations restricts the number of particles
                     that can be updated in software at interactive frame
                     rates. This article presents the design and
                     implementation of a hardware particle graphics engine
                     for accelerating real-time particle graphics
                     simulations. We explore the design process,
                     implementation issues, and limitations of using
                     field-programmable gate arrays (FPGAs) for the
                     acceleration of particle graphics. The FPGA particle
                     engine processes million-particle systems at a rate
                     from 47 to 112 million particles per second, which
                     represents one to two orders of magnitude speedup over
                     a 2.8 GHz CPU. Using three FPGAs, a maximum sustained
                     performance of 112 million particles per second was
                     achieved.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {15},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
  keywords        = {FPGAs; particle systems; reconfigurable computing;
                     special-purpose architectures},
}
%%% The math-mode plus sign in the title of the next entry is enclosed
%%% in braces so that bibliography styles which recase titles treat it
%%% as a protected unit.
@Article{Grant:2008:PMS,
  author          = {David Grant and Guy Lemieux},
  title           = {Perturb {$+$} mutate: Semisynthetic circuit generation
                     for incremental placement and routing},
  journal         = j-TRETS,
  volume          = {1},
  number          = {3},
  pages           = {16:1--16:??},
  month           = sep,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1391732.1391736},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Nov 4 17:12:44 MST 2008},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {CAD tool designers are always searching for more
                     benchmark circuits to stress their software. In this
                     article we present a heuristic method to generate
                     benchmark circuits specially suited for incremental
                     place-and-route tools. The method removes part of a
                     real circuit and replaces it with an altered version of
                     the same circuit to mimic an incremental design change.
                     The alteration consists of two steps: {\em mutate\/}
                     followed by {\em perturb}. The perturb step exactly
                     preserves as many circuit characteristics as possible.
                     While perturbing, reproduction of interconnect
                     locality, a characteristic that is difficult to measure
                     reliably or reproduce exactly, is controlled using a
                     new technique, {\em ancestor depth control\/} (ADC).
                     Perturbing with ADC produces circuits with postrouting
                     properties that match the best techniques known
                     to-date. The mutate step produces targetted mutations
                     resulting in controlled changes to specific circuit
                     properties (while keeping other properties constant).
                     We demonstrate one targetted mutation heuristic, scale,
                     to significantly change circuit size with little change
                     to other circuit characteristics. The method is simple
                     enough for inclusion in a CAD tool directly, and fast
                     enough for use in on-the-fly benchmark generation.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {16},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
  keywords        = {automated development tools; design automation; graph
                     algorithms; hardware-supporting software; place and
                     route; testing},
}
@Article{Hsiung:2008:PSB,
author = "Pao-Ann Hsiung and Chao-Sheng Lin and Chih-Feng Liao",
title = "{Perfecto}: a {SystemC}-based design-space exploration
framework for dynamically reconfigurable
architectures",
journal = j-TRETS,
volume = "1",
number = "3",
pages = "17:1--17:??",
month = sep,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1391732.1391737",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Nov 4 17:12:44 MST 2008",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "To cope with increasing demands for higher
computational power and greater system flexibility,
dynamically and partially reconfigurable logic has
started to play an important role in embedded systems
and systems-on-chip (SoC). However, when using
traditional design methods and tools, it is difficult
to estimate or analyze the performance impact of
including such reconfigurable logic devices into a
system design. In this work, we present a system-level
framework, called Perfecto, which is able to perform
rapid exploration of different reconfigurable design
alternatives and to detect system performance
bottlenecks. This framework is based on the popular
IEEE standard system-level design language SystemC,
which is supported by most EDA and ESL tools. Given an
architecture model and an application model, Perfecto
uses SystemC {\em transaction-level models\/} (TLMs) to
simulate the system design alternatives automatically.
Different hardware-software copartitioning,
coscheduling, and placement algorithms can be embedded
into the framework for analysis; thus, Perfecto can
also be used to design the algorithms to be used in an
operating system for reconfigurable systems.
Applications to a simple illustration example and a
network security system have shown how Perfecto helps a
designer make intelligent partition decisions, optimize
system performance, and evaluate task placements.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "17",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "design-space exploration; partitioning; performance
evaluation; placement; reconfigurable systems;
scheduling",
}
@Article{Chin:2009:SDM,
author = "Scott Y. L. Chin and Steven J. E. Wilton",
title = "Static and Dynamic Memory Footprint Reduction for
{FPGA} Routing Algorithms",
journal = j-TRETS,
volume = "1",
number = "4",
pages = "18:1--18:??",
month = jan,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1462586.1462587",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 1 18:15:01 MDT 2009",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "This article presents techniques to reduce the static
and dynamic memory requirements of routing algorithms
that target field-programmable gate arrays. During
routing, memory is required to store both architectural
data and temporary routing data. The architectural data
is static, and provides a representation of the
physical routing resources and programmable connections
on the device. We show that by taking advantage of the
regularity in FPGAs, we can reduce the amount of
information that must be explicitly represented,
leading to significant memory savings. The temporary
routing data is dynamic, and contains scoring
parameters and traceback information for each routing
resource in the FPGA. By studying the lifespan of the
temporary routing data objects, we develop several
memory management schemes to reduce this component. To
make our proposals concrete, we applied them to the
routing algorithm in VPR and empirically quantified the
impact on runtime memory footprint, and place and route
time.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "18",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "CAD; FPGA; memory; routing; scalability",
}
@Article{Xu:2009:FAR,
author = "Ning-Yi Xu and Xiong-Fei Cai and Rui Gao and Lei Zhang
and Feng-Hsiung Hsu",
title = "{FPGA} Acceleration of {RankBoost} in {Web} Search
Engines",
journal = j-TRETS,
volume = "1",
number = "4",
pages = "19:1--19:??",
month = jan,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1462586.1462588",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 1 18:15:01 MDT 2009",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Search relevance is a key measurement for the
usefulness of search engines. Shift of search relevance
among search engines can easily change a search
company's market cap by tens of billions of dollars.
With the ever-increasing scale of the Web, machine
learning technologies have become important tools to
improve search relevance ranking. RankBoost is a
promising algorithm in this area, but it is not widely
used due to its long training time. To reduce the
computation time for RankBoost, we designed an
FPGA-based accelerator system and its upgraded version.
The accelerator, plugged into a commodity PC, increased
the training speed on MSN search engine data up to
1800x compared to the original software implementation
on a server. The proposed accelerator has been
successfully used by researchers in the search
relevance ranking.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "19",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "FPGA; hardware acceleration",
}
@Article{Patterson:2009:STP,
author = "C. D. Patterson and S. W. Ellingson and B. S. Martin
and K. Deshpande and J. H. Simonetti and M. Kavic and
S. E. Cutchin",
title = "Searching for Transient Pulses with the {ETA} Radio
Telescope",
journal = j-TRETS,
volume = "1",
number = "4",
pages = "20:1--20:??",
month = jan,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1462586.1462589",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 1 18:15:01 MDT 2009",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Array-based, direct-sampling radio telescopes have
computational and communication requirements unsuited
to conventional computer and cluster architectures.
Synchronization must be strictly maintained across a
large number of parallel data streams, from A/D
conversion, through operations such as beamforming, to
dataset recording. FPGAs supporting multigigabit serial
I/O are ideally suited to this application. We describe
a recently-constructed radio telescope called ETA
having all-sky observing capability for detecting low
frequency pulses from transient events such as gamma
ray bursts and primordial black hole explosions.
Signals from 24 dipole antennas are processed by a
tiered arrangement of 28 commercial FPGA boards and 4
PCs with FPGA-based data acquisition cards, connected
with custom I/O adapter boards supporting InfiniBand
and LVDS physical links. ETA is designed for unattended
operation, allowing configuration and recording to be
controlled remotely.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "20",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "Direct sampling radio telescope array; FPGA cluster
computing; RFI mitigation; signal dedispersion",
}
@Article{El-Araby:2009:EPR,
author = "Esam El-Araby and Ivan Gonzalez and Tarek El-Ghazawi",
title = "Exploiting Partial Runtime Reconfiguration for
High-Performance Reconfigurable Computing",
journal = j-TRETS,
volume = "1",
number = "4",
pages = "21:1--21:??",
month = jan,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1462586.1462590",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 1 18:15:01 MDT 2009",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Runtime Reconfiguration (RTR) has been traditionally
utilized as a means for exploiting the flexibility of
High-Performance Reconfigurable Computers (HPRCs).
However, the RTR feature comes with the cost of high
configuration overhead which might negatively impact
the overall performance. Currently, modern FPGAs have
more advanced mechanisms for reducing the configuration
overheads, particularly Partial Runtime Reconfiguration
(PRTR). It has been perceived that PRTR on HPRC systems
can be the trend for improving the performance. In this
work, we will investigate the potential of PRTR on HPRC
by formally analyzing the execution model and
experimentally verifying our analytical findings by
enabling PRTR for the first time, to the best of our
knowledge, on one of the current HPRC systems, Cray
XD1. Our approach is general and can be applied to any
of the available HPRC systems. The paper will conclude
with recommendations and conditions, based on our
conceptual and experimental work, for the optimal
utilization of PRTR as well as possible future usage in
HPRC.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "21",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "dynamic partial reconfiguration; field programmable
gate arrays (FPGA); High performance computing;
reconfigurable computing",
}
@Article{Holland:2009:RRA,
author = "Brian Holland and Karthik Nagarajan and Alan D.
George",
title = "{RAT}: {RC} Amenability Test for Rapid Performance
Prediction",
journal = j-TRETS,
volume = "1",
number = "4",
pages = "22:1--22:??",
month = jan,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1462586.1462591",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 1 18:15:01 MDT 2009",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "While the promise of achieving speedup and additional
benefits such as high performance per watt with FPGAs
continues to expand, chief among the challenges with
the emerging paradigm of reconfigurable computing is
the complexity in application design and
implementation. Before a lengthy development effort is
undertaken to map a given application to hardware, it
is important that a high-level parallel algorithm
crafted for that application first be analyzed relative
to the target platform, so as to ascertain the
likelihood of success in terms of potential speedup.
This article presents the RC Amenability Test, or RAT,
a methodology and model developed for this purpose,
supporting rapid exploration and prediction of
strategic design tradeoffs during the formulation stage
of application development.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "22",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "formulation methodology; FPGA; performance prediction;
reconfigurable computing; strategic design
methodology",
}
@Article{Murtaza:2009:CBB,
author = "S. Murtaza and A. G. Hoekstra and P. M. A. Sloot",
title = "Compute Bound and {I/O} Bound Cellular Automata
Simulations on {FPGA} Logic",
journal = j-TRETS,
volume = "1",
number = "4",
pages = "23:1--23:??",
month = jan,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1462586.1462592",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 1 18:15:01 MDT 2009",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "FPGA-based computation engines have been used as
Cellular Automata accelerators in the scientific
community for some time now. With the recent
availability of more advanced FPGA logic it becomes
necessary to better understand the mapping of Cellular
Automata to these systems. There are many trade-offs to
consider when mapping a Cellular Automata algorithm
from an abstract system to the physical implementation
using FPGA logic. The trade-offs include both the
available FPGA resources and the Cellular Automata
algorithm's execution time. The most important aspect
is to fully understand the behavior of the specified CA
algorithm in terms of its execution times which are
either compute bound or I/O bound. In this article, we
present a methodology to categorize a specified CA
algorithm as compute bound or I/O bound. We take
the methodology further by presenting rigorous analysis
for each of the two cases identifying the various
parameters that control the mapping process and are
defined both by the Cellular Automata algorithm and the
given FPGA hardware specifications. This methodology
helps to predict the performance of running Cellular
Automata algorithms on specific FPGA hardware and to
determine optimal values for the various parameters
that control the mapping process. The model is
validated for both compute and I/O bound
two-dimensional Cellular Automata algorithms. We find
that our model predictions are accurate within 7\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "23",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "cellular automata; FPGA-based hardware accelerator;
High-performance computing; lattice Boltzmann
simulations",
}
@Article{Bouganis:2009:SOF,
author = "Christos-S. Bouganis and Sung-Boem Park and George A.
Constantinides and Peter Y. K. Cheung",
title = "Synthesis and Optimization of {$2$D} Filter Designs
for Heterogeneous {FPGAs}",
journal = j-TRETS,
volume = "1",
number = "4",
pages = "24:1--24:??",
month = jan,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1462586.1462593",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 1 18:15:01 MDT 2009",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Many image processing applications require fast
convolution of an image with one or more 2D filters.
Field-Programmable Gate Arrays (FPGAs) are often used
to achieve this goal due to their fine grain
parallelism and reconfigurability. However, the
heterogeneous nature of modern reconfigurable devices
is not usually considered during design optimization.
This article proposes an algorithm that explores the
space of possible implementation architectures of 2D
filters, targeting the minimization of the required
area, by optimizing the usage of the different
components in a heterogeneous device. This is achieved
by exploring the heterogeneous nature of modern
reconfigurable devices using a Singular Value
Decomposition based algorithm, which provides an
efficient mapping of filter's implementation
requirements to the heterogeneous components of modern
FPGAs. In the case of multiple 2D filters, the proposed
algorithm also exploits any redundancy that exists
within each filter and between different filters in the
set, leading to designs with minimized area.
Experiments with real filter sets from computer vision
applications demonstrate an average of up to 38\%
reduction in the required area.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "24",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "2D filter design; FPGA; reconfigurable logic; Singular
Value Decomposition",
}
@Article{Schaumont:2009:GEI,
author = "Patrick R. Schaumont and Alex K. Jones and Steve
Trimberger",
title = "{Guest Editors}' Introduction to Security in
Reconfigurable Systems Design",
journal = j-TRETS,
volume = "2",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1502781.1502782",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 1 18:15:27 MDT 2009",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "This special issue on Security in Reconfigurable
Systems Design reports on recent research results in
the design and implementation of trustworthy
reconfigurable systems. Five articles cover topics
including power-efficient implementation of public-key
cryptography, side-channel analysis of electromagnetic
radiation, side-channel resistant design, design of
robust unclonable functions on an FPGA, and Trojan
detection in an FPGA bitstream.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "1",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "physically unclonable function; side-channel resistant
design; Trojan; Trustworthy design",
}
@Article{Keller:2009:ECC,
author = "Maurice Keller and Andrew Byrne and William P.
Marnane",
title = "Elliptic Curve Cryptography on {FPGA} for Low-Power
Applications",
journal = j-TRETS,
volume = "2",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1502781.1502783",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 1 18:15:27 MDT 2009",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Elliptic curve cryptography has generated a lot of
research interest due to its ability to provide greater
security per bit compared to public key systems such as
RSA. The designer of an elliptic curve hardware
accelerator is faced with many choices at design time,
each of which can impact the performance of the
accelerator in different ways. There are many examples
in the literature of how these design choices can
affect the area and/or speed of an elliptic curve
hardware accelerator. The effect of design choices on
power and energy consumption in elliptic curve hardware
has been less well studied. This article studies the
effect of design choices on the power and energy
consumption of an FPGA-based reconfigurable elliptic
curve hardware accelerator. A reconfigurable processor
has been used for different system parameters and the
power and energy consumption measured. The power and
energy results are presented and compared.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "2",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "Cryptography; elliptic curves; FPGA; low-power",
}
@Article{McEvoy:2009:IWH,
author = "Robert P. McEvoy and Colin C. Murphy and William P.
Marnane and Michael Tunstall",
title = "Isolated {WDDL}: a Hiding Countermeasure for
Differential Power Analysis on {FPGAs}",
journal = j-TRETS,
volume = "2",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1502781.1502784",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 1 18:15:27 MDT 2009",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Security protocols are frequently accelerated by
implementing the underlying cryptographic functions in
reconfigurable hardware. However, unprotected hardware
implementations are susceptible to side-channel
attacks, and Differential Power Analysis (DPA) has been
shown to be especially powerful. In this work, we
evaluate and compare the effectiveness of common hiding
countermeasures against DPA in FPGA-based designs,
using the Whirlpool hash function as a case study. In
particular, we develop a new design flow called
Isolated WDDL (IWDDL). In contrast with previous works,
IWDDL isolates the direct and complementary circuit
paths, and also provides DPA resistance in the Hamming
distance power model. The analysis is supported using
actual implementation results.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "3",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "DPA; FPGA; secure logic; Side-channel attacks;
Whirlpool",
}
@Article{Sauvage:2009:ERF,
author = "Laurent Sauvage and Sylvain Guilley and Yves Mathieu",
title = "Electromagnetic Radiations of {FPGAs}: High Spatial
Resolution Cartography and Attack on a Cryptographic
Module",
journal = j-TRETS,
volume = "2",
number = "1",
pages = "4:1--4:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1502781.1502785",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 1 18:15:27 MDT 2009",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Since the first announcement of a Side Channel
Analysis (SCA) about ten years ago, considerable
research has been devoted to studying these attacks on
Application Specific Integrated Circuits (ASICs), such
as smart cards or TPMs. In this article, we compare
power-line attacks with ElectroMagnetic (EM) attacks,
specifically targeting Field Programmable Gate Array
devices (FPGAs), as they are becoming widely used for
sensitive applications involving cryptography.\par
We show experimentally that ElectroMagnetic Analysis
(EMA) is always faster than the historical Differential
Power Analysis (DPA) in retrieving keys of symmetric
ciphers. In addition, these analyses prove to be very
convenient to conduct, as they are totally
non-invasive.\par
Research reports indicate that EMA can be conducted
globally, typically with macroscopic home-made coils
circling the device under attack, with fair results.
However, as accurate professional EM antennas are now
becoming more accessible, it has become commonplace to
carry out EM analyses locally.\par
Cartography has been carried out by optical means on
circuits realized with technology greater than 250
nanometers. Nonetheless, for deep submicron
technologies, the feature size of devices that are
spied upon is too small to be visible with photographic
techniques. In addition, the presence of the 6+
metallization layers obviously prevents a direct
observation of the layout. Therefore, EM imaging is
emerging as a relevant means to discover the underlying
device structure.\par
In this article, we present the first images of
deep-submicron FPGAs. The resolution is not as accurate
as photographic pictures: we notably compare the layout
of toy design examples placed at the four corners of
the FPGAs with the EM images we collected. We observe
that EM imaging has the advantage of revealing active
regions, which can be useful in locating a particular
processor (visible while active---invisible when
inactive).\par
In the context of EM attacks, we stress that the exact
localization of the cryptographic target is not
necessary: the coarse resolution we obtain is
sufficient. We note that the EM imaging does not reveal
the exact layout of the FPGA, but instead directly
guides the attacker towards the areas which are leaking
the most. We achieve attacks with an accurate sensor,
both far from (namely on an SMC capacitor on the board)
and close to (namely directly over the FPGA) the
encryption co-processor. As compared to the previously
published attacks, we report a successful attack on a
DES module in fewer than 6,300 measurements, which is
currently the best cracking performance against this
encryption algorithm implemented in FPGAs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "4",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "cartography; DPA; EMA; FPGA; SCA; security",
}
@Article{Majzoobi:2009:TDI,
author = "Mehrdad Majzoobi and Farinaz Koushanfar and Miodrag
Potkonjak",
title = "Techniques for Design and Implementation of Secure
Reconfigurable {PUFs}",
journal = j-TRETS,
volume = "2",
number = "1",
pages = "5:1--5:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1502781.1502786",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 1 18:15:27 MDT 2009",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Physically unclonable functions (PUFs) provide a basis
for many security and digital rights management
protocols. PUF-based security approaches have numerous
comparative strengths with respect to traditional
cryptography-based techniques, including resilience
against physical and side channel attacks and
suitability for lightweight protocols. However,
classical delay-based PUF structures have a number of
drawbacks including susceptibility to guessing, reverse
engineering, and emulation attacks, as well as
sensitivity to operational and environmental
variations.\par
To address these limitations, we have developed a new
set of techniques for FPGA-based PUF design and
implementation. We demonstrate how reconfigurability
can be exploited to eliminate the stated PUF
limitations. We also show how FPGA-based PUFs can be
used for privacy protection. Furthermore,
reconfigurability enables the introduction of new
techniques for PUF testing. The effectiveness of all
the proposed techniques is validated using extensive
implementations, simulations, and statistical
analysis.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "5",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "hardware security; physically unclonable functions;
process variation; Reconfigurable systems",
}
@Article{Dutt:2009:TBD,
author = "Shantanu Dutt and Li Li",
title = "Trust-Based Design and Check of {FPGA} Circuits Using
Two-Level Randomized {ECC} Structures",
journal = j-TRETS,
volume = "2",
number = "1",
pages = "6:1--6:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1502781.1508209",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 1 18:15:27 MDT 2009",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "A novel trust-based design method for FPGA circuits
that uses error-correcting code (ECC) structures for
detecting design tampers (changes, deletion of existing
logic, and addition of extradesign logic-like Trojans)
is proposed in this article. We determine ECC-based CLB
(configuration logic block) parity groups and embed the
check CLBs for each parity group in the FPGA circuit.
During a trust-checking phase, a Test-Pattern Generator
(TPG) and an Output Response Analyzer (ORA), configured
in the FPGA, are used to check that each parity group
of CLB outputs produces the expected parities. We use
two levels of randomization to thwart attempts by an
adversary to discover the parity groups and inject
tampers that mask each other, or to tamper with the TPG
and ORA so that design tampers remain undetected: (a)
randomization of the mapping of the ECC parity groups
to the CLB array; (b) randomization within each parity
group of odd and even parities for different input
combinations (classically, all ECC parity groups have
even parities across all inputs). These randomizations
along with the error-detecting property of the
underlying ECC lead to design tampers being uncovered
with very high probabilities, as we show both
analytically and empirically. We also classify
different CLB function structures and impose a parity
group selection in which only similarly structured
functions are randomly selected to be in the same
parity group in order to minimize check function
complexity. Using the 2D code as our underlying ECC and
its 2-level randomization, our experiments with
inserting 1--10 circuit CLB tampers and 1--5 extraneous
logic CLBs in two medium-size circuits and a RISC
processor circuit implemented on a Xilinx Spartan-3
FPGA show promising results of 100\% tamper detection
and 0\% false alarms, obtained at a hardware overhead
of only 7--10\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "6",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "Error-correcting codes; FPGAs; masking probability;
parity groups; parity randomization; trust checking;
trust-based design",
}
@Article{Amano:2009:GEI,
author = "Hideharu Amano and Tadao Nakamura",
title = "Guest editors' introduction: {ICFPT 2007}",
journal = j-TRETS,
volume = "2",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1534916.1534917",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:50 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "7",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Zhao:2009:TMB,
author = "Weisheng Zhao and Eric Belhaire and Claude Chappert
and Bernard Dieny and Guillaume Prenat",
title = "{TAS-MRAM}-Based Low-Power High-Speed Runtime
Reconfiguration {(RTR) FPGA}",
journal = j-TRETS,
volume = "2",
number = "2",
pages = "8:1--8:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1534916.1534918",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:50 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "8",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Koch:2009:HDT,
author = "Dirk Koch and Christian Beckhoff and J{\"u}rgen
Teich",
title = "Hardware Decompression Techniques for {FPGA}-Based
Embedded Systems",
journal = j-TRETS,
volume = "2",
number = "2",
pages = "9:1--9:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1534916.1534919",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:50 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "9",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Wong:2009:SMC,
author = "Justin S. J. Wong and Pete Sedcole and Peter Y. K.
Cheung",
title = "Self-Measurement of Combinatorial Circuit Delays in
{FPGAs}",
journal = j-TRETS,
volume = "2",
number = "2",
pages = "10:1--10:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1534916.1534920",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:50 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "10",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Seetharaman:2009:ASF,
author = "G. Seetharaman and B. Venkataramani",
title = "Automation Schemes for {FPGA} Implementation of
Wave-Pipelined Circuits",
journal = j-TRETS,
volume = "2",
number = "2",
pages = "11:1--11:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1534916.1534921",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:50 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "11",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Yu:2009:VPS,
author = "Jason Yu and Christopher Eagleston and Christopher
Han-Yu Chou and Maxime Perreault and Guy Lemieux",
title = "Vector Processing as a Soft Processor Accelerator",
journal = j-TRETS,
volume = "2",
number = "2",
pages = "12:1--12:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1534916.1534922",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:50 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "12",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Cevrero:2009:FPC,
author = "Alessandro Cevrero and Panagiotis Athanasopoulos and
Hadi Parandeh-Afshar and Ajay K. Verma and Hosein Seyed
Attarzadeh Niaki and Chrysostomos Nicopoulos and Frank
K. Gurkaynak and Philip Brisk and Yusuf Leblebici and
Paolo Ienne",
title = "Field Programmable Compressor Trees: Acceleration of
Multi-Input Addition on {FPGAs}",
journal = j-TRETS,
volume = "2",
number = "2",
pages = "13:1--13:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1534916.1534923",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:50 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "13",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Jang:2009:WFT,
author = "Stephen Jang and Billy Chan and Kevin Chung and Alan
Mishchenko",
title = "{WireMap}: {FPGA} Technology Mapping for Improved
Routability and Enhanced {LUT} Merging",
journal = j-TRETS,
volume = "2",
number = "2",
pages = "14:1--14:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1534916.1534924",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:50 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "14",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Chung:2009:PTS,
author = "Eric S. Chung and Michael K. Papamichael and Eriko
Nurvitadhi and James C. Hoe and Ken Mai and Babak
Falsafi",
title = "{ProtoFlex}: Towards Scalable, Full-System
Multiprocessor Simulations Using {FPGAs}",
journal = j-TRETS,
volume = "2",
number = "2",
pages = "15:1--15:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1534916.1534925",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:50 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "15",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Pellauer:2009:PNP,
author = "Michael Pellauer and Muralidaran Vijayaraghavan and
Michael Adler and Arvind and Joel Emer",
title = "{A}-Port Networks: Preserving the Timed Behavior of
Synchronous Systems for Modeling on {FPGAs}",
journal = j-TRETS,
volume = "2",
number = "3",
pages = "16:1--16:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1575774.1575775",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:54 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "16",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Cong:2009:FBH,
author = "Jason Cong and Yi Zou",
title = "{FPGA}-Based Hardware Acceleration of Lithographic
Aerial Image Simulation",
journal = j-TRETS,
volume = "2",
number = "3",
pages = "17:1--17:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1575774.1575776",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:54 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "17",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Ahmed:2009:PTV,
author = "Taneem Ahmed and Paul D. Kundarewich and Jason H.
Anderson",
title = "Packing Techniques for {Virtex-5 FPGAs}",
journal = j-TRETS,
volume = "2",
number = "3",
pages = "18:1--18:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1575774.1575777",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:54 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "18",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Parandeh-Afshar:2009:FLC,
author = "Hadi Parandeh-Afshar and Philip Brisk and Paolo
Ienne",
title = "An {FPGA} Logic Cell and Carry Chain Configurable as a
6:2 or 7:2 Compressor",
journal = j-TRETS,
volume = "2",
number = "3",
pages = "19:1--19:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1575774.1575778",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:54 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "19",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Compton:2009:ISI,
author = "Katherine Compton and Roger Woods and Christos
Bouganis and Pedro Diniz",
title = "Introduction to the Special Issue {ARC'08}",
journal = j-TRETS,
volume = "2",
number = "4",
pages = "20:1--20:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1575779.1575780",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:56 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "20",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Jin:2009:ERA,
author = "Qiwei Jin and David B. Thomas and Wayne Luk and
Benjamin Cope",
title = "Exploring Reconfigurable Architectures for Tree-Based
Option Pricing Models",
journal = j-TRETS,
volume = "2",
number = "4",
pages = "21:1--21:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1575779.1575781",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:56 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "21",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Angelopoulou:2009:RRT,
author = "Maria E. Angelopoulou and Christos-Savvas Bouganis and
Peter Y. K. Cheung and George A. Constantinides",
title = "Robust Real-Time Super-Resolution on {FPGA} and an
Application to Video Enhancement",
journal = j-TRETS,
volume = "2",
number = "4",
pages = "22:1--22:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1575779.1575782",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:56 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "22",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Lo:2009:SOC,
author = "Chia-Tien Dan Lo and Yi-Gang Tai",
title = "Space Optimization on Counters for {FPGA}-Based {Perl}
Compatible Regular Expressions",
journal = j-TRETS,
volume = "2",
number = "4",
pages = "23:1--23:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1575779.1575783",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:56 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "23",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Vassiliadis:2009:ADF,
author = "Nikolaos Vassiliadis and George Theodoridis and
Spiridon Nikolaidis",
title = "An Application Development Framework for {ARISE}
Reconfigurable Processors",
journal = j-TRETS,
volume = "2",
number = "4",
pages = "24:1--24:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1575779.1575784",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:56 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "24",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Dragomir:2009:OLU,
author = "Ozana Silvia Dragomir and Todor Stefanov and Koen
Bertels",
title = "Optimal Loop Unrolling and Shifting for Reconfigurable
Architectures",
journal = j-TRETS,
volume = "2",
number = "4",
pages = "25:1--25:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1575779.1575785",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:56 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "25",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Underwood:2009:SSL,
author = "Keith D. Underwood and K. Scott Hemmert and Craig D.
Ulmer",
title = "From Silicon to Science: The Long Road to Production
Reconfigurable Supercomputing",
journal = j-TRETS,
volume = "2",
number = "4",
pages = "26:1--26:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1575779.1575786",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:46:56 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "26",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Roldao:2010:HTF,
author = "Antonio Roldao and George A. Constantinides",
title = "A High Throughput {FPGA}-Based Floating Point
Conjugate Gradient Implementation for Dense Matrices",
journal = j-TRETS,
volume = "3",
number = "1",
pages = "1:1--1:??",
month = jan,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1661438.1661439",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:47:03 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "1",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Dubois:2010:SMV,
author = "David Dubois and Andrew Dubois and Thomas Boorman and
Carolyn Connor and Steve Poole",
title = "Sparse Matrix-Vector Multiplication on a
Reconfigurable Supercomputer with Application",
journal = j-TRETS,
volume = "3",
number = "1",
pages = "2:1--2:??",
month = jan,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1661438.1661440",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:47:03 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "2",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Drimer:2010:DBP,
author = "Saar Drimer and Tim G{\"u}neysu and Christof Paar",
title = "{DSPs}, {BRAMs}, and a Pinch of Logic: Extended
Recipes for {AES} on {FPGAs}",
journal = j-TRETS,
volume = "3",
number = "1",
pages = "3:1--3:??",
month = jan,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1661438.1661441",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:47:03 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "3",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Koh:2010:CMP,
author = "Shannon Koh and Oliver Diessel",
title = "Configuration Merging in Point-to-Point Networks for
Module-Based {FPGA} Reconfiguration",
journal = j-TRETS,
volume = "3",
number = "1",
pages = "4:1--4:??",
month = jan,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1661438.1661442",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:47:03 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "4",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Curreri:2010:PAF,
author = "John Curreri and Seth Koehler and Alan D. George and
Brian Holland and Rafael Garcia",
title = "Performance Analysis Framework for High-Level Language
Applications in Reconfigurable Computing",
journal = j-TRETS,
volume = "3",
number = "1",
pages = "5:1--5:??",
month = jan,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1661438.1661443",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Mar 16 09:47:03 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "5",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Bodily:2010:CSI,
author = "John Bodily and Brent Nelson and Zhaoyi Wei and
Dah-Jye Lee and Jeff Chase",
title = "A Comparison Study on Implementing Optical Flow and
Digital Communications on {FPGAs} and {GPUs}",
journal = j-TRETS,
volume = "3",
number = "2",
pages = "6:1--6:??",
month = may,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1754386.1754387",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 22 16:00:33 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "FPGA devices have often found use as
higher-performance alternatives to programmable
processors for implementing computations. Applications
successfully implemented on FPGAs typically contain
high levels of parallelism and often use simple
statically scheduled control and modest arithmetic.
Recently introduced computing devices such as
coarse-grain reconfigurable arrays, multi-core
processors, and graphical processing units promise to
significantly change the computational landscape and
take advantage of many of the same application
characteristics that fit well on FPGAs. One real-time
computing task, optical flow, is difficult to apply in
robotic vision applications because of its high
computational and data rate requirements, and so is a
good candidate for implementation on FPGAs and other
custom computing architectures. This article reports on
a series of experiments mapping a collection of
different algorithms onto both an FPGA and a GPU. For
two different optical flow algorithms the GPU had
better performance, while for a set of digital comm
MIMO computations, they had similar performance. In all
cases the FPGA implementations required 10x the
development time. Finally, a discussion of the two
technology's characteristics is given to show they
achieve high performance in different ways.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "6",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "Digital communications; FPGA; GPU; optical flow;
reconfigurable computing",
}
@Article{Papadopoulos:2010:TRM,
author = "Konstantinos Papadopoulos and Ioannis Papaefstathiou",
title = "{Titan-R}: a Multigigabit Reconfigurable Combined
Compression\slash Decompression Unit",
journal = j-TRETS,
volume = "3",
number = "2",
pages = "7:1--7:??",
month = may,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1754386.1754388",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 22 16:00:33 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Data compression techniques can alleviate bandwidth
problems in even multigigabit networks and are
especially useful when combined with encryption. This
article demonstrates a reconfigurable hardware
compressor/decompressor core, the Titan-R, which can
compress/decompress data streams at 8.5 Gb/sec, making
it the fastest reconfigurable such device ever
proposed; the presented full-duplex implementation
allows for fully symmetric compression and
decompression rates at 8.5 Gbps each. Its compression
algorithm is a variation of the most widely used and
efficient such scheme, the Lempel--Ziv (LZ) algorithm
that uses part of the previous input stream as the
dictionary. In order to support this high network
throughput, the Titan-R utilizes a very fine-grained
pipeline and takes advantage of the high bandwidth
provided by the distributed on-chip RAMs of
state-of-the-art FPGAs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "7",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "data compression; FPGA; hardware algorithms;
networking; parallel processing; reconfigurable
computing; Stream processing",
}
@Article{Badrignans:2010:SSA,
author = "Beno{\^\i}t Badrignans and David Champagne and Reouven
Elbaz and Catherine Gebotys and Lionel Torres",
title = "{SARFUM}: Security Architecture for Remote {FPGA}
Update and Monitoring",
journal = j-TRETS,
volume = "3",
number = "2",
pages = "8:1--8:??",
month = may,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1754386.1754389",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 22 16:00:33 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Remote update of hardware platforms or embedded
systems is a convenient service enabled by Field
Programmable Gate Array (FPGA)-based systems. This
service is often essential in applications like
space-based FPGA systems or set-top boxes. However,
having the source of the update be remote from the FPGA
system opens the door to a set of attacks that may
challenge the confidentiality and integrity of the FPGA
configuration, the bitstream. Existing schemes propose
to encrypt and authenticate the bitstream to thwart
these attacks. However, we show that they do not
prevent the replay of old bitstream versions, and thus
give adversaries an opportunity for downgrading the
system. In this article, we propose a new architecture
called SARFUM that, in addition to ensuring bitstream
confidentiality and integrity, precludes the replay of
old bitstreams. SARFUM also includes a protocol for the
system designer to remotely monitor the running
configuration of the FPGA. Following our presentation
and analysis of the security protocols, we propose an
example of implementation with the CCM (Counter with
CBC-MAC) authenticated encryption standard. We also
evaluate the impact of our architecture on the
configuration time for different FPGA devices.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "8",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "authenticated encryption; bitstream security; FPGA;
replay attack; security protocol; system downgrade",
}
@Article{Yoo:2010:IRR,
author = "Sang-Kyung Yoo and Deniz Karakoyunlu and Berk Birand
and Berk Sunar",
title = "Improving the Robustness of Ring Oscillator {TRNGs}",
journal = j-TRETS,
volume = "3",
number = "2",
pages = "9:1--9:??",
month = may,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1754386.1754390",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 22 16:00:33 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "A ring oscillator-based true-random number generator
design (Rings design) was introduced in Sunar et al.
[2007]. The design was rigorously analyzed under a
simple mathematical model and its performance
characteristics were established. In this article we
focus on the practical aspects of the Rings design on a
reconfigurable logic platform and determine their
implications on the earlier analysis framework. We make
recommendations for avoiding pitfalls in real-life
implementations by considering ring interaction,
transistor-level effects, narrow signal rejection,
transmission line attenuation, and sampler bias.
Furthermore, we present experimental results showing
that changing operating conditions such as the power
supply voltage or the operating temperature may affect
the output quality when the signal is subsampled.
Hence, an attacker may shift the operating point via a
simple noninvasive influence and easily bias the TRNG
output. Finally, we propose modifications to the design
which significantly improve its robustness against
attacks, alleviate implementation-related problems, and
simultaneously improve its area, throughput, and power
performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "9",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "cryptography; Oscillator rings; true random number
generators",
}
@Article{Huffmire:2010:SPR,
author = "Ted Huffmire and Timothy Levin and Thuy Nguyen and
Cynthia Irvine and Brett Brotherton and Gang Wang and
Timothy Sherwood and Ryan Kastner",
title = "Security Primitives for Reconfigurable Hardware-Based
Systems",
journal = j-TRETS,
volume = "3",
number = "2",
pages = "10:1--10:??",
month = may,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1754386.1754391",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 22 16:00:33 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Computing systems designed using reconfigurable
hardware are increasingly composed using a number of
different Intellectual Property (IP) cores, which are
often provided by third-party vendors that may have
different levels of trust. Unlike traditional software
where hardware resources are mediated using an
operating system, IP cores have fine-grain control over
the underlying reconfigurable hardware. To address this
problem, the embedded systems community requires novel
security primitives that address the realities of
modern reconfigurable hardware. In this work, we
propose security primitives using ideas centered around
the notion of ``moats and drawbridges.'' The primitives
encompass four design properties: logical isolation,
interconnect traceability, secure reconfigurable
broadcast, and configuration scrubbing. Each of these
is a fundamental operation with easily understood
formal properties, yet they map cleanly and efficiently
to a wide variety of reconfigurable devices. We
carefully quantify the required overheads of the
security techniques on modern FPGA architectures across
a number of different applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "10",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "Advanced Encryption Standard (AES); controlled
sharing; enforcement mechanisms; execution monitors;
Field Programmable Gate Arrays (FPGAs); hardware
security; isolation; memory protection; reference
monitors; security policies; security primitives;
separation; static analysis; Systems-on-a-Chip (SoCs)",
}
@Article{Hemmert:2010:FEF,
author = "K. Scott Hemmert and Keith D. Underwood",
title = "Fast, Efficient Floating-Point Adders and Multipliers
for {FPGAs}",
journal = j-TRETS,
volume = "3",
number = "3",
pages = "11:1--11:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1839480.1839481",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Oct 8 18:26:34 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Floating-point applications are a growing trend in the
FPGA community. As such, it has become critical to
create floating-point units optimized for standard FPGA
technology. Unfortunately, the FPGA design space is
very different from the VLSI design space; thus,
optimizations for FPGAs can differ significantly from
optimizations for VLSI. In particular, the FPGA
environment constrains the design space such that only
limited parallelism can be effectively exploited to
reduce latency. Obtaining the right balances between
clock speed, latency, and area in FPGAs can be
particularly challenging. This article presents
implementation details for an IEEE-754 standard
floating-point adder and multiplier for FPGAs. The
designs presented here enable a Xilinx Virtex4 FPGA
(-11 speed grade) to achieve 270 MHz IEEE compliant
double precision floating-point performance with a
9-stage adder pipeline and 14-stage multiplier
pipeline. The area requirement is approximately 500
slices for the adder and under 750 slices for the
multiplier.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "11",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "floating point; FPGA; HPC; reconfigurable computing",
}
@Article{Sghaier:2010:IAT,
author = "Ahmad Sghaier and Shawki Areibi and Robert Dony",
title = "Implementation Approaches Trade-Offs for {WiMax OFDM}
Functions on Reconfigurable Platforms",
journal = j-TRETS,
volume = "3",
number = "3",
pages = "12:1--12:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1839480.1839482",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Oct 8 18:26:34 MDT 2010",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "This work investigates several approaches for
implementing the OFDM functions of the fixed-WiMax
standard on reconfigurable platforms. In the first
phase, a custom RTL approach, using VHDL, is
investigated. The approach shows the capability of a
medium-size FPGA to accommodate the OFDM functions of a
fixed-WiMax transceiver with only 50\% occupation rate.
In the second phase, a high-level approach based on the
AccelDSP tool is used and compared to the custom RTL
approach. The approach presents an easy flow to
transfer MATLAB floating-point code into synthesizable
cores. The AccelDSP approach shows an area overhead of
10\%, while allowing early architectural exploration
and accelerating the design time by a factor of two.
However, the performance figure obtained is almost 1/4
of that obtained in the custom RTL approach. In the
third phase, the Tensilica Xtensa configurable
processor is targeted, which presents remarkable
figures in terms of power, area, and design time.
Comparing the three approaches indicates that the
custom RTL approach has the lead in terms of
performance. However, both the AccelDSP and the
Tensilica Xtensa approaches show fast design time and
early architectural exploration capability. In terms of
power, the obtained estimation results show that the
configurable Xtensa processor approach has the lead,
where approximately the total power consumed is about
12--15 times less than those results obtained by the
other two approaches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "12",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
keywords = "AccelDSP; ASIP; custom RTL; FPGA; Tensilica; WiMax",
}
@Article{Smith:2010:AFA,
  author =       {Alastair M. Smith and George A. Constantinides and
                 Peter Y. K. Cheung},
  title =        {An Automated Flow for Arithmetic Component Generation
                 in Field-Programmable Gate Arrays},
  journal =      j-TRETS,
  volume =       {3},
  number =       {3},
  pages =        {13:1--13:??},
  month =        sep,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1839480.1839483},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Fri Oct 8 18:26:34 MDT 2010},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {State-of-the-art configurable logic platforms, such as
                 Field-Programmable Gate Arrays (FPGAs), consist of a
                 heterogeneous mixture of different component types.
                 Compared to traditional homogeneous configurable
                 platforms, heterogeneity provides speed and density
                 advantages. This is due to the replacement of
                 inefficient programmable logic and routing with
                 specialized logic and fixed interconnect in components
                 such as memories, embedded processor units, and fused
                 arithmetic units. Given the increasing complexity of
                 these components, this article introduces a method to
                 automatically propose and explore the benefits of
                 different types of fused arithmetic units. The methods
                 are based on common subgraph extraction techniques,
                 meaning that it is possible to explore different
                 subcircuits that occur frequently across a set of
                 benchmarks. A quantitative analysis is performed of the
                 various fused arithmetic circuits identified by our
                 tool, which are then automatically synthesized to an
                 ASIC process, providing a study of the speed and area
                 benefits of the components. The results of this study
                 provide bounds on the performance of heterogeneous
                 FPGAs: by incorporating coarse-grain components which
                 match the specific needs of a set of benchmarks we show
                 that significant improvements in circuit speed and area
                 can be made.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {13},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
  keywords =     {common subgraph; FPGA; reconfigurable logic},
}
@Article{Moscola:2010:HAR,
  author =       {James Moscola and Ron K. Cytron and Young H. Cho},
  title =        {Hardware-Accelerated {RNA} Secondary-Structure
                 Alignment},
  journal =      j-TRETS,
  volume =       {3},
  number =       {3},
  pages =        {14:1--14:??},
  month =        sep,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1839480.1839484},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Fri Oct 8 18:26:34 MDT 2010},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {The search for homologous RNA molecules---sequences of
                 RNA that might behave similarly due to similarity in
                 their physical (secondary) structure---is currently a
                 computationally intensive task. Moreover, RNA sequences
                 are populating genome databases at a pace unmatched by
                 gains in standard processor performance. While software
                 tools such as Infernal can efficiently find homologies
                 among RNA families and genome databases of modest size,
                 the continuous advent of new RNA families and the
                 explosive growth in volume of RNA sequences necessitate
                 a faster approach.\par
                 This work introduces two different architectures for
                 accelerating the task of finding homologous RNA
                 molecules in a genome database. The first architecture
                 takes advantage of the tree-like configuration of the
                 covariance models used to represent the consensus
                 secondary structure of an RNA family and converts it
                 directly into a highly-pipelined processing engine.
                 Results for this architecture show a 24$ \times $
                 speedup over Infernal when processing a small RNA
                 model. It is estimated that the architecture could
                 potentially offer several thousands of times speedup
                 over Infernal on larger models, provided that there are
                 sufficient hardware resources available.\par
                 The second architecture is introduced to address the
                 steep resource requirements of the first architecture.
                 It utilizes a uniform array of processing elements and
                 schedules all of the computations required to scan for
                 an RNA homolog onto those processing elements. The
                 estimated speedup for this architecture over the
                 Infernal software package ranges from just under 20$
                 \times $ to over 2,350$ \times $.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {14},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
  keywords =     {bioinformatics; RNA; secondary-structure alignment},
}
@Article{Ben-Asher:2010:RMC,
  author =       {Yosi Ben-Asher and Danny Meisler and Nadav Rotem},
  title =        {Reducing Memory Constraints in Modulo Scheduling
                 Synthesis for {FPGAs}},
  journal =      j-TRETS,
  volume =       {3},
  number =       {3},
  pages =        {15:1--15:??},
  month =        sep,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1839480.1839485},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Fri Oct 8 18:26:34 MDT 2010},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {In High-Level Synthesis (HLS), extracting parallelism
                 in order to create small and fast circuits is the main
                 advantage of HLS over software execution. Modulo
                 Scheduling (MS) is a technique in which a loop is
                 parallelized by overlapping different parts of
                 successive iterations. This ability to extract
                 parallelism makes MS an attractive synthesis technique
                 for loop acceleration. In this work we consider two
                 problems involved in the use of MS which are central
                 when targeting FPGAs. Current MS scheduling techniques
                 sacrifice execution times in order to meet resource and
                 delay constraints. Let ``ideal'' execution times be the
                 ones that could have been obtained by MS had we ignored
                 resource and delay constraints. Here we pose the
                 opposite problem, which is more suitable for HLS,
                 namely, how to reduce resource constraints without
                 sacrificing the ideal execution time. We focus on
                 reducing the number of memory ports used by the MS
                 synthesis, which we believe is a crucial resource for
                 HLS. In addition to reducing the number of memory ports
                 we consider the need to develop MS techniques that are
                 fast enough to allow interactive synthesis times and
                 repeated applications of the MS to explore different
                 possibilities of synthesizing the circuits. Current
                 solutions for MS synthesis that can handle memory
                 constraints are too slow to support interactive
                 synthesis. We formalize the problem of reducing the
                 number of parallel memory references in every row of
                 the kernel by a novel combinatorial setting. The
                 proposed technique is based on inserting dummy
                 operations in the kernel and by doing so, performing
                 modulo-shift operations such that the maximal number of
                 parallel memory references in a row is reduced.
                 Experimental results suggest improved execution times
                 for the synthesized circuit. The synthesis takes only a
                 few seconds even for large-size loops.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {15},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
  keywords =     {FPGA; high-level synthesis; memory optimizations;
                 modulo-scheduling},
}
@Article{Wang:2010:VVP,
  author =       {Xiaojun Wang and Miriam Leeser},
  title =        {{VFloat}: a Variable Precision Fixed- and
                 Floating-Point Library for Reconfigurable Hardware},
  journal =      j-TRETS,
  volume =       {3},
  number =       {3},
  pages =        {16:1--16:??},
  month =        sep,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1839480.1839486},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Fri Oct 8 18:26:34 MDT 2010},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {Optimal reconfigurable hardware implementations may
                 require the use of arbitrary floating-point formats
                 that do not necessarily conform to IEEE specified
                 sizes. We present a variable precision floating-point
                 library (VFloat) that supports general floating-point
                 formats including IEEE standard formats. Most
                 previously published floating-point formats for use
                 with reconfigurable hardware are subsets of our format.
                 Custom datapaths with optimal bitwidths for each
                 operation can be built using the variable precision
                 hardware modules in the VFloat library, enabling a
                 higher level of parallelism. The VFloat library
                 includes three types of hardware modules for format
                 control, arithmetic operations, and conversions between
                 fixed-point and floating-point formats. The format
                 conversions allow for hybrid fixed- and floating-point
                 operations in a single design. This gives the designer
                 control over a large number of design possibilities
                 including format as well as number range within the
                 same application. In this article, we give an overview
                 of the components in the VFloat library and demonstrate
                 their use in an implementation of the K-means
                 clustering algorithm applied to multispectral satellite
                 images.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {16},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
  keywords =     {clustering; floating-point; reconfigurable hardware},
}
@Article{Purnaprajna:2010:RRM,
  author =       {Madhura Purnaprajna and Mario Porrmann and Ulrich
                 Rueckert and Michael Hussmann and Michael Thies and Uwe
                 Kastens},
  title =        {Runtime Reconfiguration of Multiprocessors Based on
                 Compile-Time Analysis},
  journal =      j-TRETS,
  volume =       {3},
  number =       {3},
  pages =        {17:1--17:??},
  month =        sep,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1839480.1839487},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Fri Oct 8 18:26:34 MDT 2010},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {In multiprocessors, performance improvement is
                 typically achieved by exploring parallelism with fixed
                 granularities, such as instruction-level, task-level,
                 or data-level parallelism. We introduce a new
                 reconfiguration mechanism that facilitates variations
                 in these granularities in order to optimize resource
                 utilization in addition to performance improvements.
                 Our reconfigurable multiprocessor QuadroCore combines
                 the advantages of reconfigurability and parallel
                 processing. In this article, a unified
                 hardware-software approach for the design of our
                 QuadroCore is presented. This design flow is enabled
                 via compiler-driven reconfiguration which matches
                 application-specific characteristics to a fixed set of
                 architectural variations. A special reconfiguration
                 mechanism has been developed that alters the
                 architecture within a single clock cycle.\par
                 The QuadroCore has been implemented on Xilinx XC2V6000
                 for functional validation and on UMC's 90nm standard
                 cell technology for performance estimation. A diverse
                 set of applications have been mapped onto the
                 reconfigurable multiprocessor to meet orthogonal
                 performance characteristics in terms of time and power.
                 Speedup measurements show a 2--11 times performance
                 increase in comparison to a single processor.
                 Additionally, the reconfiguration scheme has been
                 applied to save power in data-parallel applications.
                 Gate-level simulations have been performed to measure
                 the power-performance trade-offs for two
                 computationally complex applications. The power reports
                 confirm that introducing this scheme of reconfiguration
                 results in power savings in the range of 15--24\%.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {17},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
  keywords =     {compilation for multiprocessors; reconfigurable
                 multiprocessors},
}
@Article{Banerjee:2010:BMA,
  author =       {Sudarshan Banerjee and Elaheh Bozorgzadeh and Juanjo
                 Noguera and Nikil Dutt},
  title =        {Bandwidth Management in Application Mapping for
                 Dynamically Reconfigurable Architectures},
  journal =      j-TRETS,
  volume =       {3},
  number =       {3},
  pages =        {18:1--18:??},
  month =        sep,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1839480.1839488},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Fri Oct 8 18:26:34 MDT 2010},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {Partial dynamic reconfiguration (often referred to as
                 partial RTR) enables true on-demand computing. In an
                 on-demand computing environment, a dynamically invoked
                 application is assigned resources such as data
                 bandwidth, configurable logic. The limited logic
                 resources are customized during application execution
                 by exploiting partial RTR. In this article, we propose
                 an approach that maximizes application performance when
                 available bandwidth and logic resources are limited.
                 Our proposed approach is based on theoretical
                 principles of minimizing application schedule length
                 under bandwidth and logic resource constraints. It
                 includes detailed microarchitectural considerations on
                 a commercially popular reconfigurable device, and it
                 exploits partial RTR very effectively by utilizing
                 data-parallelism property of common image-processing
                 applications. We present extensive application case
                 studies on a cycle-accurate simulation platform that
                 includes detailed resource considerations of the Xilinx
                 Virtex XC2V3000. Our experimental results demonstrate
                 that applying our proposed approach to common
                 image-filtering applications leads to 15--20\%
                 performance gain in scenarios with limited bandwidth,
                 when compared to prior work that also exploits
                 data-parallelism with RTR but includes simpler
                 bandwidth considerations. Last but not the least, we
                 also demonstrate how our proposed theoretical
                 principles can be directly applied to solve related
                 problems such as minimizing schedule length under logic
                 resource and power constraints.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {18},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
  keywords =     {bandwidth; partial RTR; scheduling},
}
@Article{Williams:2010:CFR,
  author =       {Jason Williams and Chris Massie and Alan D. George and
                 Justin Richardson and Kunal Gosrani and Herman Lam},
  title =        {Characterization of Fixed and Reconfigurable
                 Multi-Core Devices for Application Acceleration},
  journal =      j-TRETS,
  volume =       {3},
  number =       {4},
  pages =        {19:1--19:??},
  month =        nov,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1862648.1862649},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 23 11:26:33 MST 2010},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {19},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Huang:2010:RCA,
  author =       {Miaoqing Huang and Vikram K. Narayana and Harald
                 Simmler and Olivier Serres and Tarek El-Ghazawi},
  title =        {Reconfiguration and Communication-Aware Task
                 Scheduling for High-Performance Reconfigurable
                 Computing},
  journal =      j-TRETS,
  volume =       {3},
  number =       {4},
  pages =        {20:1--20:??},
  month =        nov,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1862648.1862650},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 23 11:26:33 MST 2010},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {20},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Sano:2010:FAB,
  author =       {Kentaro Sano and Wang Luzhou and Yoshiaki Hatsuda and
                 Takanori Iizuka and Satoru Yamamoto},
  title =        {{FPGA}-Array with Bandwidth-Reduction Mechanism for
                 Scalable and Power-Efficient Numerical Simulations
                 Based on Finite Difference Methods},
  journal =      j-TRETS,
  volume =       {3},
  number =       {4},
  pages =        {21:1--21:??},
  month =        nov,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1862648.1862651},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 23 11:26:33 MST 2010},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {21},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Saldana:2010:MPM,
  author =       {Manuel Salda{\~n}a and Arun Patel and Christopher
                 Madill and Daniel Nunes and Danyao Wang and Paul Chow
                 and Ralph Wittig and Henry Styles and Andrew Putnam},
  title =        {{MPI} as a Programming Model for High-Performance
                 Reconfigurable Computers},
  journal =      j-TRETS,
  volume =       {3},
  number =       {4},
  pages =        {22:1--22:??},
  month =        nov,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1862648.1862652},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 23 11:26:33 MST 2010},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {22},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Chiu:2010:MDS,
  author =       {Matt Chiu and Martin C. Herbordt},
  title =        {Molecular Dynamics Simulations on High-Performance
                 Reconfigurable Computing Systems},
  journal =      j-TRETS,
  volume =       {3},
  number =       {4},
  pages =        {23:1--23:??},
  month =        nov,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1862648.1862653},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 23 11:26:33 MST 2010},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {23},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Montone:2010:PFD,
  author =       {Alessio Montone and Marco D. Santambrogio and
                 Donatella Sciuto and Seda Ogrenci Memik},
  title =        {Placement and Floorplanning in Dynamically
                 Reconfigurable {FPGAs}},
  journal =      j-TRETS,
  volume =       {3},
  number =       {4},
  pages =        {24:1--24:??},
  month =        nov,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1862648.1862654},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 23 11:26:33 MST 2010},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {24},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Reardon:2010:SFR,
  author =       {Casey Reardon and Eric Grobelny and Alan D. George and
                 Gongyu Wang},
  title =        {A Simulation Framework for Rapid Analysis of
                 Reconfigurable Computing Systems},
  journal =      j-TRETS,
  volume =       {3},
  number =       {4},
  pages =        {25:1--25:??},
  month =        nov,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1862648.1862655},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 23 11:26:33 MST 2010},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {25},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Tian:2010:HPQ,
  author =       {Xiang Tian and Khaled Benkrid},
  title =        {High-Performance Quasi-{Monte Carlo} Financial
                 Simulation: {FPGA} vs.\ {GPP} vs.\ {GPU}},
  journal =      j-TRETS,
  volume =       {3},
  number =       {4},
  pages =        {26:1--26:??},
  month =        nov,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1862648.1862656},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 23 11:26:33 MST 2010},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {26},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Woods:2010:GEA,
  author =       {Roger Woods and J{\"u}rgen Becker and Peter Athanas
                 and Fearghal Morgan},
  title =        {Guest Editorial {ARC 2009}},
  journal =      j-TRETS,
  volume =       {4},
  number =       {1},
  pages =        {1:1--1:??},
  month =        dec,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1857927.1857928},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Wed Jan 26 14:58:50 MST 2011},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {1},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Saiprasert:2010:OHA,
  author =       {Chalermpol Saiprasert and Christos-S. Bouganis and
                 George A. Constantinides},
  title =        {An Optimized Hardware Architecture of a Multivariate
                 {Gaussian} Random Number Generator},
  journal =      j-TRETS,
  volume =       {4},
  number =       {1},
  pages =        {2:1--2:??},
  month =        dec,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1857927.1857929},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Wed Jan 26 14:58:50 MST 2011},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {Monte Carlo simulation is one of the most widely used
                 techniques for computationally intensive simulations in
                 mathematical analysis and modeling. A multivariate
                 Gaussian random number generator is one of the main
                 building blocks of such a system. Field Programmable
                 Gate Arrays (FPGAs) are gaining increased popularity as
                 an alternative means to the traditional general purpose
                 processors targeting the acceleration of the
                 computationally expensive random number generator
                 block. This article presents a novel approach for
                 mapping a multivariate Gaussian random number generator
                 onto an FPGA by optimizing the computational path in
                 terms of hardware resource usage subject to an
                 acceptable error in the approximation of the
                 distribution of interest. The proposed approach is
                 based on the eigenvalue decomposition algorithm which
                 leads to a design with different precision requirements
                 in the computational paths. An analysis on the impact
                 of the error due to truncation/rounding operation along
                 the computational path is performed and an analytical
                 expression of the error inserted into the system is
                 presented. Based on the error analysis, three
                 algorithms that optimize the resource utilization and
                 at the same time minimize the error in the output of
                 the system are presented and compared. Experimental
                 results reveal that the hardware resource usage on an
                 FPGA as well as the error in the approximation of the
                 distribution of interest are significantly reduced by
                 the use of the optimization techniques introduced in
                 the proposed approach.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {2},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Kahoul:2010:EHA,
  author =       {Asma Kahoul and Alastair M. Smith and George A.
                 Constantinides and Peter Y. K. Cheung},
  title =        {Efficient Heterogeneous Architecture Floorplan
                 Optimization Using Analytical Methods},
  journal =      j-TRETS,
  volume =       {4},
  number =       {1},
  pages =        {3:1--3:??},
  month =        dec,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1857927.1857930},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Wed Jan 26 14:58:50 MST 2011},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {3},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Kepa:2010:DAS,
  author =       {K. Kepa and F. Morgan and K. Ko{\'s}ciuszkiewicz and
                 L. Braun and M. H{\"u}bner and J. Becker},
  title =        {Design Assurance Strategy and Toolset for Partially
                 Reconfigurable {FPGA} Systems},
  journal =      j-TRETS,
  volume =       {4},
  number =       {1},
  pages =        {4:1--4:??},
  month =        dec,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1857927.1857931},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Wed Jan 26 14:58:50 MST 2011},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {4},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Inoue:2010:VGL,
  author =       {Kazuki Inoue and Qian Zhao and Yasuhiro Okamoto and
                 Hiroki Yosho and Motoki Amagasaki and Masahiro Iida and
                 Toshinori Sueyoshi},
  title =        {A Variable-Grain Logic Cell and Routing Architecture
                 for a Reconfigurable {IP} Core},
  journal =      j-TRETS,
  volume =       {4},
  number =       {1},
  pages =        {5:1--5:??},
  month =        dec,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1857927.1857932},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Wed Jan 26 14:58:50 MST 2011},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {5},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Guo:2010:OSC,
  author =       {Xu Guo and Patrick Schaumont},
  title =        {Optimized System-on-Chip Integration of a Programmable
                 {ECC} Coprocessor},
  journal =      j-TRETS,
  volume =       {4},
  number =       {1},
  pages =        {6:1--6:??},
  month =        dec,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1857927.1857933},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Wed Jan 26 14:58:50 MST 2011},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {6},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Sterpone:2010:NTD,
  author =       {Luca Sterpone},
  title =        {A New Timing Driven Placement Algorithm for Dependable
                 Circuits on {SRAM}-based {FPGAs}},
  journal =      j-TRETS,
  volume =       {4},
  number =       {1},
  pages =        {7:1--7:??},
  month =        dec,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1857927.1857934},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Wed Jan 26 14:58:50 MST 2011},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {7},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Lanuzza:2010:ESR,
  author =       {M. Lanuzza and P. Zicari and F. Frustaci and S. Perri
                 and P. Corsonello},
  title =        {Exploiting Self-Reconfiguration Capability to Improve
                 {SRAM}-based {FPGA} Robustness in Space and Avionics
                 Applications},
  journal =      j-TRETS,
  volume =       {4},
  number =       {1},
  pages =        {8:1--8:??},
  month =        dec,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1857927.1857935},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Wed Jan 26 14:58:50 MST 2011},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {8},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Hsiung:2010:SPH,
  author =       {Pao-Ann Hsiung and Chun-Hsian Huang and Jih-Sheng Shen
                 and Chen-Chi Chiang},
  title =        {Scheduling and Placement of Hardware\slash Software
                 Real-Time Relocatable Tasks in Dynamically Partially
                 Reconfigurable Systems},
  journal =      j-TRETS,
  volume =       {4},
  number =       {1},
  pages =        {9:1--9:??},
  month =        dec,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1857927.1857936},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Wed Jan 26 14:58:50 MST 2011},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {9},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Kanazawa:2010:ASL,
  author =       {Kenji Kanazawa and Tsutomu Maruyama},
  title =        {An Approach for Solving Large {SAT} Problems on
                 {FPGA}},
  journal =      j-TRETS,
  volume =       {4},
  number =       {1},
  pages =        {10:1--10:??},
  month =        dec,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1857927.1857937},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Wed Jan 26 14:58:50 MST 2011},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {10},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Lu:2010:ERD,
  author =       {Yingxi Lu and Maire O'Neill and John McCanny},
  title =        {Evaluation of Random Delay Insertion against {DPA} on
                 {FPGAs}},
  journal =      j-TRETS,
  volume =       {4},
  number =       {1},
  pages =        {11:1--11:??},
  month =        dec,
  year =         {2010},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1857927.1857938},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Wed Jan 26 14:58:50 MST 2011},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {11},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Bergeron:2011:LTF,
  author =       {Etienne Bergeron and Louis-David Perron and Marc
                 Feeley and Jean Pierre David},
  title =        {Logarithmic-Time {FPGA} Bitstream Analysis: a Step
                 Towards {JIT} Hardware Compilation},
  journal =      j-TRETS,
  volume =       {4},
  number =       {2},
  pages =        {12:1--12:??},
  month =        may,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1968502.1968503},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Jun 7 18:34:54 MDT 2011},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {12},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Vaidya:2011:NMC,
author = "Pranav Vaidya and Jaehwan John Lee",
title = "A Novel Multicontext Coarse-Grained Reconfigurable
Architecture {(CGRA)} For Accelerating Column-Oriented
Databases",
journal = j-TRETS,
volume = "4",
number = "2",
pages = "13:1--13:??",
month = may,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1968502.1968504",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 7 18:34:54 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "13",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{ONeill:2011:SPM,
author = "Shane O'Neill and Roger Francis Woods and Alan James
Marshall and Qi Zhang",
title = "A Scalable and Programmable Modular Traffic Manager
Architecture",
journal = j-TRETS,
volume = "4",
number = "2",
pages = "14:1--14:??",
month = may,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1968502.1968505",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 7 18:34:54 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "14",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Nakajima:2011:FOR,
author = "Mao Nakajima and Minoru Watanabe",
title = "Fast Optical Reconfiguration of a Nine-Context {DORGA}
Using a Speed Adjustment Control",
journal = j-TRETS,
volume = "4",
number = "2",
pages = "15:1--15:??",
month = may,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1968502.1968506",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 7 18:34:54 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "15",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Tai:2011:POA,
author = "Tzu-Chiang Tai and Yen-Tai Lai",
title = "A Performance-Oriented Algorithm with Consideration on
Communication Cost for Dynamically Reconfigurable
{FPGA} Partitioning",
journal = j-TRETS,
volume = "4",
number = "2",
pages = "16:1--16:??",
month = may,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1968502.1968507",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 7 18:34:54 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "16",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Demertzi:2011:DSO,
author = "Melina Demertzi and Pedro C. Diniz and Mary W. Hall
and Anna C. Gilbert and Yi Wang",
title = "Domain-Specific Optimization of Signal Recognition
Targeting {FPGAs}",
journal = j-TRETS,
volume = "4",
number = "2",
pages = "17:1--17:??",
month = may,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1968502.1968508",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 7 18:34:54 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "17",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Galuzzi:2011:ISE,
author = "Carlo Galuzzi and Koen Bertels",
title = "The Instruction-Set Extension Problem: a Survey",
journal = j-TRETS,
volume = "4",
number = "2",
pages = "18:1--18:28",
month = may,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1968502.1968509",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 7 18:34:54 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "18",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Rupnow:2011:SAD,
author = "Kyle Rupnow and Keith D. Underwood and Katherine
Compton",
title = "Scientific Application Demands on a Reconfigurable
Functional Unit Interface",
journal = j-TRETS,
volume = "4",
number = "2",
pages = "19:1--19:??",
month = may,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1968502.1968510",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 7 18:34:54 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "19",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Kaganov:2011:FAM,
author = "Alexander Kaganov and Asif Lakhany and Paul Chow",
title = "{FPGA} Acceleration of {MultiFactor CDO} Pricing",
journal = j-TRETS,
volume = "4",
number = "2",
pages = "20:1--20:??",
month = may,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1968502.1968511",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 7 18:34:54 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "20",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Labrecque:2011:ASS,
author = "Martin Labrecque and Mark C. Jeffrey and J. Gregory
Steffan",
title = "Application-specific signatures for transactional
memory in soft processors",
journal = j-TRETS,
volume = "4",
number = "3",
pages = "21:1--21:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000832.2000833",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Aug 30 08:13:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "21",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Boland:2011:OMB,
author = "David Boland and George A. Constantinides",
title = "Optimizing memory bandwidth use and performance for
matrix-vector multiplication in iterative methods",
journal = j-TRETS,
volume = "4",
number = "3",
pages = "22:1--22:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000832.2000834",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Aug 30 08:13:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "22",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Glaser:2011:TFT,
author = "Johann Glaser and Markus Damm and Jan Haase and
Christoph Grimm",
title = "{TR-FSM}: Transition-Based reconfigurable finite state
machine",
journal = j-TRETS,
volume = "4",
number = "3",
pages = "23:1--23:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000832.2000835",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Aug 30 08:13:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "23",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Parvez:2011:ASF,
author = "Husain Parvez and Zied Marrakchi and Alp Kilic and
Habib Mehrez",
title = "Application-Specific {FPGA} using heterogeneous logic
blocks",
journal = j-TRETS,
volume = "4",
number = "3",
pages = "24:1--24:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000832.2000836",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Aug 30 08:13:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "24",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Yan:2011:FBA,
author = "Jing Yan and Ning-Yi Xu and Xiong-Fei Cai and Rui Gao
and Yu Wang and Rong Luo and Feng-Hsiung Hsu",
title = "An {FPGA}-based accelerator for {LambdaRank} in {Web}
search engines",
journal = j-TRETS,
volume = "4",
number = "3",
pages = "25:1--25:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000832.2000837",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Aug 30 08:13:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "In modern Web search engines, Neural Network
(NN)-based learning to rank algorithms is intensively
used to increase the quality of search results.
LambdaRank is one such algorithm. However, it is hard
to be efficiently accelerated by computer clusters or
GPUs, because: (i) the cost function for the ranking
problem is much more complex than that of traditional
Back-Propagation (BP) NNs, and (ii) no coarse-grained
parallelism exists in the algorithm. This article
presents an FPGA-based accelerator solution to provide
high computing performance with low power consumption.
A compact deep pipeline is proposed to handle the
complex computing in the batch updating. The area
scales linearly with the number of hidden nodes in the
algorithm. We also carefully design a data format to
enable streaming consumption of the training data from
the host computer. The accelerator shows up to 15.3X
(with PCIe x4) and 23.9X (with PCIe x8) speedup
compared with the pure software implementation on
datasets from a commercial search engine.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "25",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Aggarwal:2011:SMP,
author = "Vikas Aggarwal and Alan D. George and Changil Yoon and
Kishore Yalamanchili and Herman Lam",
title = "{SHMEM+}: a multilevel-{PGAS} programming model for
reconfigurable supercomputing",
journal = j-TRETS,
volume = "4",
number = "3",
pages = "26:1--26:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000832.2000838",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Aug 30 08:13:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "26",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Holland:2011:AMM,
author = "Brian Holland and Alan D. George and Herman Lam and
Melissa C. Smith",
title = "An analytical model for multilevel performance
prediction of Multi-{FPGA} systems",
journal = j-TRETS,
volume = "4",
number = "3",
pages = "27:1--27:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000832.2000839",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Aug 30 08:13:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "27",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Shannon:2011:LRH,
author = "Lesley Shannon and Paul Chow",
title = "Leveraging reconfigurability in the hardware\slash
software codesign process",
journal = j-TRETS,
volume = "4",
number = "3",
pages = "28:1--28:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000832.2000840",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Aug 30 08:13:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "28",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Nava:2011:ADR,
author = "Federico Nava and Donatella Sciuto and Marco Domenico
Santambrogio and Stefan Herbrechtsmeier and Mario
Porrmann and Ulf Witkowski and Ulrich Rueckert",
title = "Applying dynamic reconfiguration in the mobile
robotics domain: a case study on computer vision
algorithms",
journal = j-TRETS,
volume = "4",
number = "3",
pages = "29:1--29:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000832.2000841",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Aug 30 08:13:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "29",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Koehler:2011:PAB,
author = "Seth Koehler and Greg Stitt and Alan D. George",
title = "Platform-aware bottleneck detection for reconfigurable
computing applications",
journal = j-TRETS,
volume = "4",
number = "3",
pages = "30:1--30:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000832.2000842",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Aug 30 08:13:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "30",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Cheung:2011:ISS,
author = "Peter Y. K. Cheung",
title = "Introduction to special section {FPGA 2009}",
journal = j-TRETS,
volume = "4",
number = "4",
pages = "31:1--31:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2068716.2068717",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Mar 16 16:20:35 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "31",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Luu:2011:VFC,
author = "Jason Luu and Ian Kuon and Peter Jamieson and Ted
Campbell and Andy Ye and Wei Mark Fang and Kenneth Kent
and Jonathan Rose",
title = "{VPR 5.0}: {FPGA CAD} and architecture exploration
tools with single-driver routing, heterogeneity and
process scaling",
journal = j-TRETS,
volume = "4",
number = "4",
pages = "32:1--32:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2068716.2068718",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Mar 16 16:20:35 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "The VPR toolset has been widely used in FPGA
architecture and CAD research, but has not evolved over
the past decade. This article describes and illustrates
the use of a new version of the toolset that includes
four new features: first, it supports a broad range of
single-driver routing architectures, which have
superior architectural and electrical properties over
the prior multidriver approach (and which is now
employed in the majority of FPGAs sold). Second, it can
now model, for placement and routing a heterogeneous
selection of hard logic blocks. This is a key (but not
final) step toward the inclusion of blocks such as
memory and multipliers. Third, we provide optimized
electrical models for a wide range of architectures in
different process technologies, including a range of
area-delay trade-offs for each single architecture.
Finally, to maintain robustness and support future
development the release includes a set of regression
tests for the software. To illustrate the use of the
new features, we explore several architectural issues:
the FPGA area efficiency versus logic block
granularity, the effect of single-driver routing, and a
simple use of the heterogeneity to explore the impact
of hard multipliers on wiring track count.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "32",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Rubin:2011:CYO,
author = "Raphael Rubin and Andr{\'e} DeHon",
title = "Choose-your-own-adventure routing: Lightweight
load-time defect avoidance",
journal = j-TRETS,
volume = "4",
number = "4",
pages = "33:1--33:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2068716.2068719",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Mar 16 16:20:35 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Aggressive scaling increases the number of devices we
can integrate per square millimeter but makes it
increasingly difficult to guarantee that each device
fabricated has the intended operational
characteristics. Without careful mitigation, component
yield rates will fall, potentially negating the
economic benefits of scaling. The fine-grained
reconfigurability inherent in FPGAs is a powerful tool
that can allow us to drop the stringent requirement
that every device be fabricated perfectly in order for
a component to be useful. To exploit inherent FPGA
reconfigurability while avoiding full CAD mapping, we
propose lightweight techniques compatible with the
current single bitstream model that can avoid defective
devices, reducing yield loss at high defect rates. In
particular, by embedding testing operations and
alternative path configurations into the bitstream,
each FPGA can avoid defects by making only simple,
greedy decisions at bitstream load time. With 20\%
additional tracks above the minimum routable channel
width, routes can tolerate 0.01\% switch and wire
defect rates, raising yield from essentially 0\% to
near 100\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "33",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Mishchenko:2011:SDC,
author = "Alan Mishchenko and Robert Brayton and Jie-Hong R.
Jiang and Stephen Jang",
title = "Scalable don't-care-based logic optimization and
resynthesis",
journal = j-TRETS,
volume = "4",
number = "4",
pages = "34:1--34:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2068716.2068720",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Mar 16 16:20:35 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "We describe an optimization method for combinational
and sequential logic networks, with emphasis on
scalability. The proposed resynthesis (a) is capable of
substantial logic restructuring, (b) is customizable to
solve a variety of optimization tasks, and (c) has
reasonable runtime on industrial designs. The approach
uses don't-cares computed for a window surrounding a
node and can take into account external don't-cares
(e.g., unreachable states). It uses a SAT solver for
all aspects of Boolean manipulation: computing
don't-cares for a node in the window, and deriving a
new Boolean function of the node after resubstitution.
Experimental results on 6-input LUT networks after a
high effort synthesis show substantial reductions in
area and delay. When applied to 20 large academic
benchmarks, the LUT counts and logic levels are reduced
by 45.0\% and 12.2\%, respectively. The longest runtime
for synthesis and mapping is about two minutes. When
applied to a set of 14 industrial benchmarks ranging up
to 83K 6-LUTs, the LUT counts and logic levels are
reduced by 11.8\% and 16.5\%, respectively. The longest
runtime is about 30 minutes.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "34",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Kennings:2011:FTM,
author = "Andrew Kennings and Kristofer Vorwerk and Arun Kundu
and Val Pevzner and Andy Fox",
title = "{FPGA} technology mapping with encoded libraries and
staged priority cuts",
journal = j-TRETS,
volume = "4",
number = "4",
pages = "35:1--35:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2068716.2068721",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Mar 16 16:20:35 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Technology mapping is an important step in the FPGA
CAD flow in which a network of simple gates is
converted into a network of logic blocks. This article
considers enhancements to a traditional LUT-based
mapping algorithm for an FPGA comprised of logic blocks
which implement only a subset of functions of up to k
variables; specifically, the logic block is a partial
LUT, but it possesses more inputs than a typical LUT.
An analysis of the logic block is presented, and
techniques for postmapping area recovery and
timing-driven buffer insertion are also described.
Numerical results are put forth which substantiate the
efficacy of the proposed methods using real circuits
mapped to a commercial FPGA architecture.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "35",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Papadimitriou:2011:PPR,
author = "Kyprianos Papadimitriou and Apostolos Dollas and Scott
Hauck",
title = "Performance of partial reconfiguration in {FPGA}
systems: a survey and a cost model",
journal = j-TRETS,
volume = "4",
number = "4",
pages = "36:1--36:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2068716.2068722",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Mar 16 16:20:35 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Fine-grain reconfigurable devices suffer from the time
needed to load the configuration bitstream. Even for
small bitstreams in partially reconfigurable FPGAs this
time cannot be neglected. In this article we survey the
performance of the factors that contribute to the
reconfiguration speed. Then, we study an FPGA-based
system architecture and with real experiments we
produce a cost model of Partial Reconfiguration (PR).
This model is introduced to calculate the expected
reconfiguration time and throughput. In order to
develop a realistic model we take into account all the
physical components that participate in the
reconfiguration process. We analyze the parameters that
affect the generality of the model and the adjustments
needed per system for error-free evaluation. We verify
it with real measurements, and then we employ it to
evaluate existing systems presented in previous
publications. The percentage error of the cost model
when comparing its results with the actual values of
those publications varies from 36\% to 63\%, whereas
existing works report differences up to two orders of
magnitude. Present work enables a user to evaluate PR
and decide whether it is suitable for a certain
application prior entering the complex PR design
flow.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "36",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Chen:2011:EDL,
author = "Xiaoheng Chen and Venkatesh Akella",
title = "Exploiting data-level parallelism for energy-efficient
implementation of {LDPC} decoders and {DCT} on an
{FPGA}",
journal = j-TRETS,
volume = "4",
number = "4",
pages = "37:1--37:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2068716.2068723",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Mar 16 16:20:35 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "We explore the use of Data-Level Parallelism (DLP) as
a way of improving the energy efficiency and power
consumption involved in running applications on an
FPGA. We show that static power consumption is a
significant fraction of the overall power consumption
in an FPGA and that it does not change significantly
even as the area required by an architecture increases,
because of the dominance of interconnect in an FPGA. We
show that the degree of DLP can be used in conjunction
with frequency scaling to reduce the overall power
consumption.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "37",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Easwaran:2011:NLB,
author = "Lakshmi Easwaran and Ali Akoglu",
title = "Net-length-based routability-driven power-aware
clustering",
journal = j-TRETS,
volume = "4",
number = "4",
pages = "38:1--38:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2068716.2068724",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Mar 16 16:20:35 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "The state-of-the-art power-aware clustering tool,
P-T-VPack, achieves energy reduction by localizing nets
with high switching activity at the expense of channel
width and area. In this study, we employ predicted
individual postplacement net length information during
clustering and prioritize longer nets. This approach
targets the capacitance factor for energy reduction,
and prioritizes longer nets for channel width and area
reduction. We first introduce a new clustering
strategy, W-T-VPack, which replaces the switching
activity in P-T-VPack with a net length factor. We
obtain a 9.87\% energy reduction over T-VPack (3.78\%
increase over P-T-VPack), while at the same time
completely eliminating P-T-VPack's channel width and
area overhead. We then introduce W-P-T-VPack, which
combines switching activity and net length factors.
W-P-T-VPack achieves 14.26\% energy reduction (0.31\%
increase over P-T-VPack), while further improving
channel width by up to 12.87\% for different cluster
sizes. We investigate the energy performance of
routability (channel width)-driven clustering
algorithms, and show that W-T-VPack consistently
outperforms T-RPack and iRAC by at least 11.23\% and
9.07\%, respectively. We conclude that net-length-based
clustering is an effective method to concurrently
target energy and channel width.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "38",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Parandeh-Afshar:2011:CTS,
author = "Hadi Parandeh-Afshar and Arkosnato Neogy and Philip
Brisk and Paolo Ienne",
title = "Compressor tree synthesis on commercial
high-performance {FPGAs}",
journal = j-TRETS,
volume = "4",
number = "4",
pages = "39:1--39:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2068716.2068725",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Mar 16 16:20:35 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Compressor trees are a class of circuits that
generalizes multioperand addition and the partial
product reduction trees of parallel multipliers using
carry-save arithmetic. Compressor trees naturally occur
in many DSP applications, such as FIR filters, and, in
the more general case, their use can be maximized
through the application of high-level transformations
to arithmetically intensive data flow graphs. Due to
the presence of carry-chains, it has long been thought
that trees of 2- or 3-input carry-propagate adders are
more efficient than compressor trees for FPGA
synthesis; however, this is not the case. This article
presents a heuristic for FPGA synthesis of compressor
trees that outperforms adder trees and exploits
carry-chains when possible. The experimental results
show that, on average, the use of compressor trees can
reduce critical path delay by 33\% and 45\%
respectively, compared to adder trees synthesized on
the Xilinx Virtex-5 and Altera Stratix III FPGAs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "39",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Inoue:2011:TCD,
author = "Hiroaki Inoue and Junya Yamada and Hideyuki Yoneda and
Katsumi Togawa and Masato Motomura and Koichiro
Furuta",
title = "Test compression for dynamically reconfigurable
processors",
journal = j-TRETS,
volume = "4",
number = "4",
pages = "40:1--40:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2068716.2068726",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Mar 16 16:20:35 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "We present the world's first test compression
technique that features automation of compression rules
for test time reduction on dynamically reconfigurable
processors. Evaluations on an actual 40-nm product show
that our technique achieves a 2.7 times compression
ratio for original configuration information (better
than does GZIP), the peak decompression bandwidth of
1.6 GB/s, and 2.7 times shorter test times.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "40",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Zick:2012:LCS,
  author =       "Kenneth M. Zick and John P. Hayes",
  title =        "Low-cost sensing with ring oscillator arrays for
                 healthier reconfigurable systems",
  journal =      j-TRETS,
  volume =       "5",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133352.2133353",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 20 12:12:48 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Electronic systems on a chip increasingly suffer from
                 component variation, voltage noise, thermal hotspots,
                 and other subtle physical phenomena. Systems with
                 reconfigurability have unique opportunities for
                 adapting to such effects. Required, however, are
                 low-cost, fine-grained methods for sensing physical
                 parameters. This article presents powerful, novel
                 approaches to online sensing, including methods for
                 designing compact reconfigurable sensors, low-cost
                 threshold detection, and several enhanced measurement
                 procedures. Together, the approaches help enable
                 systems to autonomously uncover a wealth of physical
                 information. A highly efficient counter and improved
                 ring oscillator are introduced, enabling an entire
                 sensor node in just 8 Virtex-5 LUTs. We describe how
                 variations can be measured in delay, temperature,
                 switching-induced IR drop, and leakage-induced IR drop.
                 We demonstrate the proposed approach with an
                 experimental system based on a Virtex-5, instrumented
                 with over 100 sensors at an overhead of only 1.3\%.
                 Results from thermally controlled experiments provide
                 some surprising insights and illustrate the utility of
                 the approach. Online sensing can help open the door to
                 physically adaptive computing, including fine-grained
                 power, reliability, and health management schemes for
                 systems on a chip.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Michail:2012:EHT,
  author =       "Harris E. Michail and George S. Athanasiou and Vasilis
                 Kelefouras and George Theodoridis and Costas E.
                 Goutis",
  title =        "On the exploitation of a high-throughput {SHA-256
                 FPGA} design for {HMAC}",
  journal =      j-TRETS,
  volume =       "5",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133352.2133354",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 20 12:12:48 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "High-throughput and area-efficient designs of hash
                 functions and corresponding mechanisms for Message
                 Authentication Codes (MACs) are in high demand due to
                 new security protocols that have arisen and call for
                 security services in every transmitted data packet. For
                 instance, IPv6 incorporates the IPSec protocol for
                 secure data transmission. However, the IPSec's
                 performance bottleneck is the HMAC mechanism which is
                 responsible for authenticating the transmitted data.
                 HMAC's performance bottleneck in its turn is the
                 underlying hash function. In this article a
                 high-throughput and small-size SHA-256 hash function
                 FPGA design and the corresponding HMAC FPGA design is
                 presented. Advanced optimization techniques have been
                 deployed leading to a SHA-256 hashing core which
                 performs more than 30\% better, compared to the next
                 better design. This improvement is achieved both in
                 terms of throughput as well as in terms of
                 throughput/area cost factor. It is the first reported
                 SHA-256 hashing core that exceeds 11Gbps (after place
                 and route in Xilinx Virtex 6 board).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Olivares:2012:RAV,
  author =       "Joaqu{\'\i}n Olivares",
  title =        "Reconfigurable architecture for {VBSME} with variable
                 pixel precision",
  journal =      j-TRETS,
  volume =       "5",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133352.2133355",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 20 12:12:48 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Current video coding standards, e.g. MPEG-4 H.264/AVC,
                 include Variable Block Size Motion Estimation, in this
                 paper, this process is implemented by a reconfigurable
                 architecture based on Signed Digit arithmetic. Bit
                 serial computation is applied to reconfigure pixel
                 precision. The reconfigurable architectural model is
                 extremely simple to reconfigure. Pixel truncation is
                 used to speed up computation saving up 23.5\% of clock
                 cycles for 4-bit precision. This design allows to
                 process all motion vectors of a block in just one
                 iteration. This system has been implemented in FPGA,
                 and HDTVp results are presented. Main characteristics,
                 of this architecture are: very reduced cost, high
                 performance, and reconfigurable pixel precision, these
                 features could be useful in mobile devices.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Siozios:2012:NFE,
  author =       "Kostas Siozios and Vasilis F. Pavlidis and Dimitrios
                 Soudris",
  title =        "A novel framework for exploring {$3$-D} {FPGAs} with
                 heterogeneous interconnect fabric",
  journal =      j-TRETS,
  volume =       "5",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133352.2133356",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 20 12:12:48 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "A heterogeneous interconnect architecture can be a
                 useful approach for the design of 3-D FPGAs. A
                 methodology to investigate heterogeneous
                 interconnection schemes for 3-D FPGAs under different
                 3-D fabrication technologies is proposed. Application
                 of the proposed methodology on benchmark circuits
                 demonstrates an improvement in delay, power
                 consumption, and total wire-length of approximately
                 41\%, 32\%, and 36\%, respectively, as compared to 2-D
                 FPGAs. These improvements are additional to reducing
                 the number of interlayer connections. The fewer
                 interlayer connections are traded off for a higher
                 yield. An area model to evaluate this trade-off is
                 presented. Results indicate that a heterogeneous 3-D
                 FPGA requires 37\% less area as compared to a
                 homogeneous 3-D FPGA. Consequently, the heterogeneous
                 FPGAs can exhibit a higher manufacturing yield. A
                 design toolset is also developed to support the design
                 and exploration of various performance metrics for the
                 proposed 3-D FPGAs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Takano:2012:DAA,
  author =       "Shigeyuki Takano",
  title =        "Design and analysis of adaptive processor",
  journal =      j-TRETS,
  volume =       "5",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133352.2133357",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 20 12:12:48 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "A new computation model called CACHE (Cache
                 Architecture for Configurable Hardware Engine) is
                 proposed in this paper. This model does not require a
                 dedicated host processor and its software to harness
                 the reconfiguration. Autonomous reconfiguration is
                 performed within a working-set of application
                 datapaths. The CACHE model has lots of side effects;
                 caching, resource allocation and assignment, placement
                 and routing, and defragmentation, with a processing
                 array itself and a special register called a
                 working-set register file. The model aims to reduce
                 three major workloads: (1) the processor and
                 application design workload, (2) runtime resource
                 management and scheduling workload, and (3)
                 reconfiguration workload. In order to reduce these
                 workloads, processor architecture is definitely
                 different from traditional computing model and its
                 microprocessor architecture. There are three major
                 ideas to construct the computing system: (1) an on-chip
                 working-set model mainly in order to control load and
                 store of streams, namely to control traffics
                 introducing overheads, (2) an on-chip deadlock
                 properties model mainly in order to manage resources
                 and to continuously configure datapaths corresponding
                 to a working-set window, (3) a cache memory technique
                 to work for these models, the mechanism is equivalent
                 to the working-set window, and the cache memory's
                 procedure is equivalent to resource request,
                 acquirement, and release of deadlock properties. The
                 first model focuses onto streaming applications, for
                 example vector and matrix operations, filters, and so
                 on, which takes coarser grained operations such as
                 integer operations of C-language. Regarding performance
                 compared with DSPs, that comes from constant throughput
                 across different scale of the applications. In
                 addition, extended model, we call Instant model that
                 automatically generates instance of a datapath,
                 outperforms the DSPs. This paper shows its computation
                 model, architecture, low-level design, and analyses
                 about basic characteristics of the execution.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Zhang:2012:PSF,
  author =       "Wei Zhang and Vaughn Betz and Jonathan Rose",
  title =        "Portable and scalable {FPGA}-based acceleration of a
                 direct linear system solver",
  journal =      j-TRETS,
  volume =       "5",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133352.2133358",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 20 12:12:48 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "FPGAs have the potential to serve as a platform for
                 accelerating many computations including scientific
                 applications. However, the large development cost and
                 short life span for FPGA designs have limited their
                 adoption by the scientific computing community.
                 FPGA-based scientific computing and many kinds of
                 embedded computing could become more practical if there
                 were hardware libraries that were portable to any
                 FPGA-based system with performance that scaled with the
                 size of the FPGA. To illustrate this idea we have
                 implemented one common super-computing library
                 function: the LU factorization method for solving
                 systems of linear equations. This paper describes a
                 method for making the design both portable and scalable
                 that should be illustrative if such libraries are to be
                 built in the future. The design is a software-based
                 generator that leverages both the flexibility of a
                 software programming language and the parameters
                 inherent in an hardware description language. The
                 generator accepts parameters that describe the FPGA
                 capacity and external memory capabilities. We compare
                 the performance of our engine executing on the largest
                 FPGA available at the time of this work (an Altera
                 Stratix III 3S340) to a single processor core
                 fabricated in the same 65nm IC process running a highly
                 optimized software implementation from the processor
                 vendor. For single precision matrices on the order of $
                 10{,}000 \times 10{,}000 $ elements, the FPGA
                 implementation is 2.2 times faster and the energy
                 dissipated per useful GFLOP operation is a factor of 5
                 times less. For double precision, the FPGA
                 implementation is 1.7 times faster and 3.5 times more
                 energy efficient.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Aggarwal:2012:SFT,
  author =       "Vikas Aggarwal and Greg Stitt and Alan George and
                 Changil Yoon",
  title =        "{SCF}: a Framework for Task-Level Coordination in
                 Reconfigurable, Heterogeneous Systems",
  journal =      j-TRETS,
  volume =       "5",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2209285.2209286",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:43 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Heterogeneous computing systems comprised of
                 accelerators such as FPGAs, GPUs, and manycore
                 processors coupled with standard microprocessors are
                 becoming an increasingly popular solution for future
                 computing systems due to their higher performance and
                 energy efficiency. Although programming languages and
                 tools are evolving to simplify device-level design,
                 programming such systems is still difficult and
                 time-consuming largely due to system-wide challenges
                 involving communication between heterogeneous devices,
                 which currently require ad hoc solutions. Most
                 communication frameworks and APIs which have dominated
                 parallel application development for decades were
                 developed for homogeneous systems, and hence cannot be
                 directly employed for hybrid systems. To solve this
                 problem, this article presents the System Coordination
                 Framework (SCF), which employs message passing to
                 transparently enable communication between tasks
                 described using different programming tools (and
                 languages), and running on heterogeneous processing
                 devices of systems from domains ranging from embedded
                 systems to High-Performance Computing (HPC) systems. By
                 hiding low-level architectural details of the
                 underlying communication from an application designer,
                 SCF can improve application development productivity,
                 provide higher levels of application portability, and
                 offer rapid design-space exploration of different
                 task/device mappings. In addition, SCF enables custom
                 communication synthesis that exploits mechanisms
                 specific to different devices and platforms, which can
                 provide performance improvements over generic solutions
                 employed previously. Our results indicate a performance
                 improvement of 28$ \times $ and 682$ \times $ by
                 employing FPGA devices for two applications presented
                 in this article, while simultaneously improving the
                 developer productivity by approximately 2.5 to 5 times
                 by using SCF.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Fekete:2012:DDR,
  author =       "S{\'a}ndor P. Fekete and Tom Kamphans and Nils Schweer
                 and Christopher Tessars and Jan C. van der Veen and
                 Josef Angermeier and Dirk Koch and J{\"u}rgen Teich",
  title =        "Dynamic Defragmentation of Reconfigurable Devices",
  journal =      j-TRETS,
  volume =       "5",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2209285.2209287",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:43 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "We propose a new method for defragmenting the module
                 layout of a reconfigurable device, enabled by a novel
                 approach for dealing with communication needs between
                 relocated modules and with inhomogeneities found in
                 commonly used FPGAs. Our method is based on dynamic
                 relocation of module positions during runtime, with
                 only very little reconfiguration overhead; the
                 objective is to maximize the length of contiguous free
                 space that is available for new modules. We describe a
                 number of algorithmic aspects of good defragmentation,
                 and present an optimization method based on tabu
                 search. Experimental results indicate that we can
                 improve the quality of module layout by roughly 50\%
                 over the static layout. Among other benefits, this
                 improvement avoids unnecessary rejections of modules.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Cheng:2012:STP,
  author =       "Lerong Cheng and Wenyao Xu and Fang Gong and Yan Lin
                 and Ho-Yan Wong and Lei He",
  title =        "Statistical Timing and Power Optimization of
                 Architecture and Device for {FPGAs}",
  journal =      j-TRETS,
  volume =       "5",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2209285.2209288",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:43 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Process variation in nanometer technology is becoming
                 an important issue for cutting-edge FPGAs with a
                 multimillion gate capacity. Considering both die-to-die
                 and within-die variations in effective channel length,
                 threshold voltage, and gate oxide thickness, we first
                 develop closed-form models of chip-level FPGA leakage
                 and timing variations. Experiments show that the mean
                 and standard deviation computed by our models are
                 within 3\% from those computed by Monte Carlo
                 simulation. We also observe that the leakage and timing
                 variations can be up to 3X and 1.9X, respectively. We
                 then derive analytical yield models considering both
                 leakage and timing variations, and use such models to
                 evaluate the performance of FPGA device and
                 architecture considering process variations. Compared
                 to the baseline, which uses the VPR architecture and
                 device setup based on the ITRS roadmap, device and
                 architecture tuning improves leakage yield by 10.4\%,
                 timing yield by 5.7\%, and leakage and timing combined
                 yield by 9.4\%. We also observe that LUT size of 4
                 gives the highest leakage yield, LUT size of 7 gives
                 the highest timing yield, but LUT size of 5 achieves
                 the maximum leakage and timing combined yield. To the
                 best of our knowledge, this is the first in-depth study
                 on FPGA architecture and device coevaluation
                 considering process variation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Martin:2012:CPA,
  author =       "Kevin Martin and Christophe Wolinski and Krzysztof
                 Kuchcinski and Antoine Floch and Fran{\c{c}}ois
                 Charot",
  title =        "Constraint Programming Approach to Reconfigurable
                 Processor Extension Generation and Application
                 Compilation",
  journal =      j-TRETS,
  volume =       "5",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2209285.2209289",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:43 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In this article, we present a constraint programming
                 approach for solving hard design problems present when
                 automatically designing specialized processor
                 extensions. Specifically, we discuss our approach for
                 automatic selection and synthesis of processor
                 extensions as well as efficient application compilation
                 for these newly generated extensions. The discussed
                 approach is implemented in our integrated design
                 framework, IFPEC, built using Constraint Programming
                 (CP). In our framework, custom instructions,
                 implemented as processor extensions, are defined as
                 computational patterns and represented as graphs. This,
                 along with the graph representation of an application,
                 provides a way to use our CP framework equipped with
                 subgraph isomorphism and connected component
                 constraints for identification of processor extensions
                 as well as their selection, application scheduling,
                 binding, and routing. All design steps assume
                 architectures composed of runtime reconfigurable cells,
                 implementing selected extensions, tightly connected to
                 a processor. An advantage of our approach is the
                 possibility of combining different heterogeneous
                 constraints to represent and solve all our design
                 problems. Moreover, the flexibility and expressiveness
                 of the CP framework makes it possible to solve
                 simultaneously extension selection, application
                 scheduling, and binding and improve the quality of the
                 generated results. The article is largely illustrated
                 with experimental results.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Hubner:2012:ISI,
  author =       "Michael H{\"u}bner",
  title =        "Introduction to the Special Issue on {ReCoSoC 2011}",
  journal =      j-TRETS,
  volume =       "5",
  number =       "3",
  pages =        "11:1--11:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2362374.2362375",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:44 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Shield:2012:ACC,
  author =       "John Shield and Jean-Philippe Diguet and Guy Gogniat",
  title =        "Asymmetric Cache Coherency: Policy Modifications to
                 Improve Multicore Performance",
  journal =      j-TRETS,
  volume =       "5",
  number =       "3",
  pages =        "12:1--12:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2362374.2362376",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:44 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Asymmetric coherency is a new optimization method for
                 coherency policies to support nonuniform workloads in
                 multicore processors. Asymmetric coherency assists in
                 load balancing a workload and this is applicable to SoC
                 multicores where the applications are not evenly spread
                 among the processors and customization of the coherency
                 is possible. Asymmetric coherency is a policy change,
                 and consequently our designs require little or no
                 additional hardware over an existing system. We explore
                 two different types of asymmetric coherency policies.
                 Our bus-based asymmetric coherency policy, generated a
                 60\% coherency cost reduction (reduction of latencies
                 due to coherency messages) for nonshared data. Our
                 directory-based asymmetric coherency policy, showed up
                 to a 5.8\% execution time improvement and up to a 22\%
                 improvement in average memory latency for the parallel
                 benchmarks Sha, using a statically allocated asymmetry.
                 Dynamically allocated asymmetry was found to generate
                 further improvements in access latency, increasing the
                 effectiveness of asymmetric coherency by up to 73.8\%
                 when compared to the static asymmetric solution.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Thielmann:2012:MLH,
  author =       "Benjamin Thielmann and Jens Huthmann and Andreas
                 Koch",
  title =        "Memory Latency Hiding by Load Value Speculation for
                 Reconfigurable Computers",
  journal =      j-TRETS,
  volume =       "5",
  number =       "3",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2362374.2362377",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:44 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Load value speculation has long been proposed as a
                 method to hide the latency of memory accesses. It has
                 seen very limited use in actual processors, often due
                 to the high overhead of reexecuting misspeculated
                 computations. We present PreCoRe, a framework capable
                 of generating application-specific microarchitectures
                 supporting load value speculation on reconfigurable
                 computers. The article examines the lightweight
                 speculation and replay mechanisms, the architecture of
                 the actual data value prediction units as well as the
                 impact on the nonspeculative parts of the memory
                 system. In experiments, using PreCoRe has achieved
                 speedups of up to 2.48 times over nonspeculative
                 implementations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Gantel:2012:ERP,
  author =       "Laurent Gantel and Amel Khiar and Benoit Miramond and
                 Mohamed El Amine Benkhelifa and Lounis Kessal and
                 Fabrice Lemonnier and Jimmy {Le Rhun}",
  title =        "Enhancing Reconfigurable Platforms Programmability for
                 Synchronous Data-Flow Applications",
  journal =      j-TRETS,
  volume =       "5",
  number =       "3",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2362374.2362378",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:44 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Recent FPGAs allow the design of efficient and complex
                 Heterogeneous Systems-on-Chip (HSoC). Namely, these
                 systems are composed of several processors, hardware
                 accelerators as well as communication media between all
                 these components. Performances provided by HSoCs make
                 them really interesting for data-flow applications,
                 especially image processing applications. The use of
                 this kind of architecture provides good performances
                 but the drawback is an increase of the programming
                 complexity. This complexity is due to the heterogeneous
                 deployment of the application on the platform. Some
                 functions are implemented in software to run on a
                 processor, whereas other functions are implemented in
                 hardware to run in a reconfigurable partition of the
                 FPGA. This article aims to define a programming model
                 based on the Synchronous Data-Flow model, in order to
                 abstract the heterogeneity of the implementation and to
                 leverage the communication issue between software and
                 hardware actors.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Lusala:2012:STB,
  author =       "Angelo Kuti Lusala and Jean-Didier Legat",
  title =        "A {SDM--TDM}-Based Circuit-Switched Router for On-Chip
                 Networks",
  journal =      j-TRETS,
  volume =       "5",
  number =       "3",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2362374.2362379",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:44 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "This article proposes a circuit-switched router that
                 combines Spatial Division Multiplexing (SDM) and Time
                 Division Multiplexing (TDM) in order to increase path
                 diversity in the router while sharing channels among
                 multiple connections. In this way, the probability of
                 establishing paths through the network is increased,
                 thereby significantly reducing contention in the
                 network. Furthermore, Quality of Service (QoS) is
                 easily guaranteed. The proposed router was synthesized
                 on an Stratix III 3SL340F FPGA device. A 4 $ \times $ 4
                 2D Mesh SDM-TDM Network-on-Chip (NoC) was built with
                 the proposed router and synthesized on the 3SL340F FPGA
                 device. The 4 $ \times $ 4 2D Mesh SDM-TDM NoC was used
                 to build on an FPGA device, a Multiprocessor
                 System-on-Chip (MPSoC) platform consisted of 16 Nios
                 II/f processors, 16 20-KB On-chip Memories, and 16
                 Network Interfaces. Synthesis results of the MPSoC
                 platform show that the proposed router architecture can
                 be used to built large practicable MPSoC platforms with
                 the proposed NoC architecture with a reasonable
                 hardware overhead and appreciable clock frequency.
                 Simulation results show that combining SDM and TDM
                 techniques in a router allows the highest probability
                 of establishing paths through the network.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Gaspar:2012:SEF,
  author =       "Lubos Gaspar and Viktor Fischer and Lilian Bossuet and
                 Robert Fouquet",
  title =        "Secure Extension of {FPGA} General Purpose Processors
                 for Symmetric Key Cryptography with Partial
                 Reconfiguration Capabilities",
  journal =      j-TRETS,
  volume =       "5",
  number =       "3",
  pages =        "16:1--16:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2362374.2362380",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:44 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In data security systems, general purpose processors
                 (GPPs) are often extended by a cryptographic
                 accelerator. The article presents three ways of
                 extending GPPs for symmetric key cryptography
                 applications. Proposed extensions guarantee secure key
                 storage and management even if the system is facing
                 protocol, software and cache memory attacks. The system
                 is partitioned into processor, cipher, and key memory
                 zones. The three security zones are separated at
                 protocol, system, architecture and physical levels. The
                 proposed principle was validated on Altera NIOS II,
                 Xilinx MicroBlaze and Microsemi Cortex M1 soft-core
                 processor extensions. We show that stringent separation
                 of the cipher zone is helpful for partial
                 reconfiguration of the security module, if the
                 enciphering algorithm needs to be dynamically changed.
                 However, the key zone including reconfiguration
                 controller must remain static in order to maintain the
                 high level of security required. We demonstrate that
                 the principle is feasible in partially reconfigurable
                 field programmable gate arrays (FPGAs) such as Altera
                 Stratix V or Xilinx Virtex 6 and also to some extent in
                 FPGAs featuring hardwired general purpose processors
                 such as Cortex M3 in Microsemi SmartFusion FPGA.
                 Although the three GPPs feature different data
                 interfaces, we show that the processors with their
                 extensions reach the required high security level while
                 maintaining partial reconfiguration capability.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Ost:2012:EAT,
author = "Luciano Ost and Sameer Varyani and Leandro Soares
Indrusiak and Marcelo Mandelli and Gabriel Marchesan
Almeida and Eduardo Wachter and Fernando Moraes and
Gilles Sassatelli",
title = "Enabling Adaptive Techniques in Heterogeneous {MPSoCs}
Based on Virtualization",
journal = j-TRETS,
volume = "5",
number = "3",
pages = "17:1--17:??",
month = oct,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2362374.2362381",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Nov 6 18:07:44 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "This article explores the use of virtualization to
enable mechanisms like task migration and dynamic
mapping in heterogeneous MPSoCs, thereby targeting the
design of systems capable of adapting their behavior to
time-changing workloads. Because tasks may have to be
mapped to target processors with different instruction
set architectures, we propose the use of Low Level
Virtual Machine (LLVM) to postcompile the tasks at
runtime depending on their target processor. A novel
dynamic mapping heuristic is also proposed, aiming to
exploit the advantages of specialized processors while
taking into account the overheads imposed by
virtualization. Extensive experimental work at
different levels of abstraction---FPGA prototype, RTL
and system-level simulation---is presented to evaluate
the proposed techniques.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "17",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Morgan:2012:RFL,
author = "Fearghal Morgan and Seamus Cawley and David Newell",
title = "Remote {FPGA} Lab for Enhancing Learning of Digital
Systems",
journal = j-TRETS,
volume = "5",
number = "3",
pages = "18:1--18:??",
month = oct,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2362374.2362382",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Nov 6 18:07:44 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Learning in digital systems can be enhanced through
applying a learn-by-doing approach on practical
hardware systems and by using Web-based technology to
visualize and animate hardware behavior. The authors
have reported the Web-based Remote FPGA Lab (RFL) which
provides a novel, real-time control and visualization
interface to a remote, always-on FPGA hardware
implementation. The RFL helps students to understand
and reason about digital systems operation, using
interactive animation of signal behavior in an
executing digital logic system, at any level of the
design hierarchy. The RFL supports the creation of
real-time interactive digital systems teaching demos.
The article presents student RFL usage data and survey
data which highlight improved student engagement,
learning and achievement. The article describes the RFL
architecture, communication interface, Web page
functionality, user access administration and database
management. The article also describes the RFLGen
program, developed to automate user design integration
into the Xilinx ISE VHDL-based RFL project wrapper for
creation of FPGA configuration bitstreams and RFL
animations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "18",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Krieg:2012:PMP,
author = "Armin Krieg and Johannes Grinschgl and Christian
Steger and Reinhold Weiss and Holger Bock and Josef
Haid",
title = "{POWER-MODES: POWer-EmulatoR- and MOdel-Based
DEpendability and Security Evaluations}",
journal = j-TRETS,
volume = "5",
number = "4",
pages = "19:1--19:??",
month = dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2392616.2392617",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sun May 5 09:22:43 MDT 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Innovation cycles have been shortening significantly
during the last years. This process puts tremendous
pressure on designers of embedded systems for
security- or reliability-critical applications. Eventual
design problems not detected during design time can
lead to lost money, confidentiality, or even loss of
life in extreme cases. Therefore it is of vital
importance to evaluate a new system for its robustness
against intentionally and random induced operational
faults. Currently this is generally done using
extensive simulation runs using gate-level models or
direct measurements on the finished silicon product.
These approaches either need a significant amount of
time and computational power for these simulations or
rely on existing product samples. This article presents
a novel system evaluation platform using power
emulation and fault injection techniques to provide an
additional tool for developers of embedded systems in
security- and reliability-critical fields. Faults are
emulated using state-of-the-art fault injection methods
and a flexible pattern representation approach. The
resulting effects of these faults on the power
consumption profile are estimated using
state-of-the-art power emulation hardware. A modular
system augmentation approach provides emulation
flexibility similar to fault simulation
implementations. The platform enables the efficient
evaluation of new hardware or software implementations
of critical security or reliability solutions at an
early development phase.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "19",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Nabina:2012:AVS,
author = "Atukem Nabina and Jose Luis Nunez-Yanez",
title = "Adaptive Voltage Scaling in a Dynamically
Reconfigurable {FPGA}-Based Platform",
journal = j-TRETS,
volume = "5",
number = "4",
pages = "20:1--20:??",
month = dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2392616.2392618",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sun May 5 09:22:43 MDT 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Power is an important issue limiting the applicability
of Field Programmable Gate Arrays (FPGAs) since it is
considered to be up to one order of magnitude higher
than in ASICs. Recently, dynamic reconfiguration in
FPGAs has emerged as a viable technique able to achieve
power and cost reductions by time-multiplexing the
required functionality at runtime. In this article, the
applicability of Adaptive Voltage Scaling (AVS) to
FPGAs is considered together with dynamic
reconfiguration of logic and clock management resources
to further improve the power profile of these devices.
AVS is a popular power-saving technique in ASICs that
enables a device to regulate its own voltage and
frequency based on workload, fabrication, and operating
conditions. The resulting processing platform exploits
the available application-dependent timing margins to
achieve a power reduction up to 85\% operating at 0.58
volts compared with operating at a nominal voltage of 1
volt. The results also show that the energy
requirements at 0.58 volts are approximately five times
lower compared with nominal voltage and this can be
explained by the approximate cubic relation of static
energy with voltage and the fact that the static
component dominates power consumption in the considered
FPGA devices.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "20",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Jacobs:2012:RFT,
author = "Adam Jacobs and Grzegorz Cieslewski and Alan D. George
and Ann Gordon-Ross and Herman Lam",
title = "Reconfigurable Fault Tolerance: a Comprehensive
Framework for Reliable and Adaptive {FPGA}-Based Space
Computing",
journal = j-TRETS,
volume = "5",
number = "4",
pages = "21:1--21:??",
month = dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2392616.2392619",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sun May 5 09:22:43 MDT 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Commercial SRAM-based, field-programmable gate arrays
(FPGAs) have the potential to provide space
applications with the necessary performance to meet
next-generation mission requirements. However,
mitigating an FPGA's susceptibility to single-event
upset (SEU) radiation is challenging. Triple-modular
redundancy (TMR) techniques are traditionally used to
mitigate radiation effects, but TMR incurs substantial
overheads such as increased area and power
requirements. In order to reduce these overheads while
still providing sufficient radiation mitigation, we
propose a reconfigurable fault tolerance (RFT)
framework that enables system designers to dynamically
adjust a system's level of redundancy and fault
mitigation based on the varying radiation incurred at
different orbital positions. This framework includes an
adaptive hardware architecture that leverages FPGA
reconfigurable techniques to enable significant
processing to be performed efficiently and reliably
when environmental factors permit. To accurately
estimate upset rates, we propose an upset rate modeling
tool that captures time-varying radiation effects for
arbitrary satellite orbits using a collection of
existing, publicly available tools and models. We
perform fault-injection testing on a prototype RFT
platform to validate the RFT architecture and RFT
performability models. We combine our RFT hardware
architecture and the modeled upset rates using
phased-mission Markov modeling to estimate
performability gains achievable using our framework for
two case-study orbits.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "21",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Cancare:2012:EHC,
author = "Fabio Cancare and Davide B. Bartolini and Matteo
Carminati and Donatella Sciuto and Marco D.
Santambrogio",
title = "On the Evolution of Hardware Circuits via
Reconfigurable Architectures",
journal = j-TRETS,
volume = "5",
number = "4",
pages = "22:1--22:??",
month = dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2392616.2392620",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sun May 5 09:22:43 MDT 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Traditionally, hardware circuits are realized
according to techniques that follow the classical
phases of design and testing. A completely new approach
in the creation of hardware circuits has been
proposed---the Evolvable Hardware (EHW) paradigm, which
bases the circuit synthesis on a goal-oriented
evolutionary process inspired by biological evolution
in Nature. FPGA-based approaches have emerged as the
main architectural solution to implement EHW systems.
Various EHW systems have been proposed by researchers
but most of them, being based on outdated chips, do not
take advantage of the interesting features introduced
in newer FPGAs. This article describes a project named
Hardware Evolution over Reconfigurable Architectures
(HERA), which aims at creating a complete and
performance-oriented framework for the evolution of
digital circuits, leveraging the reconfiguration
technology available in FPGAs. The project is described
from its birth to its current state, presenting its
evolutionary technique tailored for FPGA-based circuits
and the most recent enhancements to improve the
scalability with respect to problem size. The developed
EHW system outperforms the state of the art, proving
its effectiveness in evolving both standard benchmarks
and more complex real-world applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "22",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Ould-Bachir:2013:SAS,
author = "Tarek Ould-Bachir and Jean Pierre David",
title = "Self-Alignment Schemes for the Implementation of
Addition-Related Floating-Point Operators",
journal = j-TRETS,
volume = "6",
number = "1",
pages = "1:1--1:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2457443.2457444",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:42 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Advances in semiconductor technology bring to the
market incredibly dense devices, capable of handling
tens to hundreds of floating-point operators on a single
chip; so do the latest field programmable gate arrays
(FPGAs). In order to alleviate the complexity of
resorting to these devices in computationally intensive
applications, this article proposes hardware schemes
for the realization of addition-related floating-point
operators based on the self-alignment technique (SAT).
The article demonstrates that the schemes guarantee an
accuracy as if summation was computed accurately in the
precision of operator's internal mantissa, then
faithfully rounded to working precision. To achieve
such performance, the article adopts the redundant high
radix carry-save (HRCS) format for the rapid addition
of wide mantissas. Implementation results show that
combining the SAT and the HRCS format allows the
implementation of complex operators with reduced area
and latency, more so when a fused-path approach is
adopted. The article also proposes a new hardware
operator for performing endomorphic HRCS additions and
presents a new technique for speeding up the conversion
from the redundant HRCS to a conventional binary
format.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "1",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Zhang:2013:FBA,
author = "Yan Zhang and Fan Zhang and Zheming Jin and Jason D.
Bakos",
title = "An {FPGA}-Based Accelerator for Frequent Itemset
Mining",
journal = j-TRETS,
volume = "6",
number = "1",
pages = "2:1--2:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2457443.2457445",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:42 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "In this article we describe a Field Programmable Gate
Array (FPGA)-based coprocessor architecture for
Frequent Itemset Mining (FIM). FIM is a common data
mining task used to find frequently occurring subsets
amongst a database of sets. FIM is a nonnumerical, data
intensive computation and is used in machine learning
and computational biology. FIM is particularly
expensive---in terms of execution time and
memory---when performed on large and/or sparse
databases or when applied using a low appearance
frequency threshold. Because of this, the development
of increasingly efficient FIM algorithms and their
mapping to parallel architectures is an active field.
Previous attempts to accelerate FIM using FPGAs have
relied on performance-limiting strategies such as
iterative database loading and runtime logic unit
reconfiguration. In this article, we present a novel
architecture to implement Eclat, a well-known FIM
algorithm. Unlike previous efforts, our technique does
not impose limits on the maximum set size as a function
of available FPGA logic resources and our design scales
well to multiple FPGAs. In addition to a novel hardware
design, we also present a corresponding compression
scheme for intermediate results that are stored in
on-chip memory. On a four-FPGA board, experimental
results show up to 68X speedup compared to a highly
optimized software implementation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "2",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Meeuws:2013:QSM,
author = "Roel Meeuws and S. Arash Ostadzadeh and Carlo Galuzzi
and Vlad Mihai Sima and Razvan Nane and Koen Bertels",
title = "{Quipu}: a Statistical Model for Predicting Hardware
Resources",
journal = j-TRETS,
volume = "6",
number = "1",
pages = "3:1--3:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2457443.2457446",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:42 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "There has been a steady increase in the utilization of
heterogeneous architectures to tackle the growing need
for computing performance and low-power systems. The
execution of computation-intensive functions on
specialized hardware enables to achieve substantial
speedups and power savings. However, with a large
legacy code base and software engineering experts, it
is not at all obvious how to easily utilize these new
architectures. As a result, there is a need for
comprehensive tool support to bridge the knowledge gap
of many engineers as well as to retarget legacy code.
In this article, we present the Quipu modeling
approach, which consists of a set of tools and a
modeling methodology that can generate hardware
estimation models, which provide valuable information
for developers. This information helps to focus their
efforts, to partition their application, and to select
the right heterogeneous components. We present Quipu's
capability to generate domain-specific models, that are
up to several times more accurate within their
particular domain (error: 4.6\%) as compared to
domain-agnostic models (error: 23\%). Finally, we show
how Quipu can generate models for a new toolchain and
platform within a few days.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "3",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{deDinechin:2013:FPE,
author = "Florent de Dinechin and Pedro Echeverr{\'\i}a and
Marisa L{\'o}pez-Vallejo and Bogdan Pasca",
title = "Floating-Point Exponentiation Units for Reconfigurable
Computing",
journal = j-TRETS,
volume = "6",
number = "1",
pages = "4:1--4:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2457443.2457447",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:42 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "The high performance and capacity of current FPGAs
makes them suitable as acceleration co-processors. This
article studies the implementation, for such
accelerators, of the floating-point power function $
x^y $ as defined by the C99 and IEEE 754-2008
standards, generalized here to arbitrary exponent and
mantissa sizes. Last-bit accuracy at the smallest
possible cost is obtained thanks to a careful study of
the various subcomponents: a floating-point logarithm,
a modified floating-point exponential, and a truncated
floating-point multiplier. A parameterized architecture
generator in the open-source FloPoCo project is
presented in details and evaluated.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "4",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Neely:2013:RTH,
author = "Christopher E. Neely and Gordon Brebner and Weijia
Shang",
title = "{ReShape}: Towards a High-Level Approach to Design and
Operation of Modular Reconfigurable Systems",
journal = j-TRETS,
volume = "6",
number = "1",
pages = "5:1--5:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2457443.2457448",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:42 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "The latest FPGA devices provide the headroom to
implement large-scale and complex systems. A key
requirement is the integration of modules from diverse
sources to promote modular design and reuse. A contrary
factor is that using dynamic partial reconfiguration
typically requires low-level planning of the system
implementation. In this article, we introduce ReShape:
a high-level approach for designing reconfigurable
systems by interconnecting modules, which gives a
``plug and play'' look and feel, is supported by tools
that carry out implementation functions, and is carried
through to support system reconfiguration during
operation. The emphasis is on the inter-module
connections and abstracting the communication patterns
that are typical between modules: for example, the
streaming of data, or the reading and writing of data
to and from memory modules. The details of wiring and
signaling are hidden from view, via metadata associated
with individual modules. This setting allows system
reconfiguration at the module level, both by supporting
type checking of replacement modules and by managing
the overall system implementation, via metadata
associated with its FPGA floorplan. The methodology and
tools have been implemented in a prototype targeted to
a domain-specific setting---high-speed networking---and
have been validated on real telecommunications design
projects.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "5",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Goehringer:2013:ISS,
author = "Diana Goehringer and Ren{\'e} Cumplido",
title = "Introduction to the special section on {19th
Reconfigurable Architectures Workshop (RAW 2012)}",
journal = j-TRETS,
volume = "6",
number = "2",
pages = "6:1--6:??",
month = jul,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2499625.2499626",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:43 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "6",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Sidiropoulos:2013:JFS,
author = "Harry Sidiropoulos and Kostas Siozios and Peter Figuli
and Dimitrios Soudris and Michael H{\"u}bner and
J{\"u}rgen Becker",
title = "{JITPR}: a framework for supporting fast application's
implementation onto {FPGAs}",
journal = j-TRETS,
volume = "6",
number = "2",
pages = "7:1--7:??",
month = jul,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2492185",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:43 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "The execution runtime usually is a headache for
designers performing application mapping onto
reconfigurable architectures. In this article we
propose a methodology, as well as the supporting
toolset, targeting to provide fast application
implementation onto reconfigurable architectures with
the usage of a Just-In-Time (JIT) compilation
framework. Experimental results prove the efficiency of
the introduced framework, as we reduce the execution
runtime compared to the state-of-the-art approach on
average by 53.5$ \times $. Additionally, the derived
solutions achieve higher operation frequencies by 1.17$
\times $, while they also exhibit significant lower
fragmentation ratios of hardware resources.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "7",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Heisswolf:2013:VND,
author = "Jan Heisswolf and Aurang Zaib and Andreas
Weichslgartner and Ralf K{\"o}nig and Thomas Wild and
J{\"u}rgen Teich and Andreas Herkersdorf and J{\"u}rgen
Becker",
title = "Virtual networks --- distributed communication
resource management",
journal = j-TRETS,
volume = "6",
number = "2",
pages = "8:1--8:??",
month = jul,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2492186",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:43 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Networks-on-Chip (NoC) enable scalability for future
manycore architectures, facilitating parallel
communication between multiple cores. Applications
running in parallel on a NoC-based architecture can
affect each other due to overlapping communication.
Quality-of-Service (QoS) must be supported by the
communication infrastructure to execute communication-,
real-time- and safety-critical applications on such an
architecture. Different strategies have been proposed
to provide QoS for point-to-point connections. These
strategies allow each node to set up a limited number
of connections to other nodes. In this work Virtual
Networks (VN) are proposed to enable QoS for regions of
a NoC-based architecture. Virtual Networks overcome the
limitation of point-to-point connections. A VN behaves
like an exclusive physical network. Virtual Networks
can be defined and configured during runtime. The size
of the VN region and the assigned bandwidth can be
adjusted depending on the application requirements.
Virtual Networks enable the decoupling of local from
global communication. Therefore, the communication of
the application mapped into the region is assigned to a
Virtual Network established in that specific region.
This concept targets packet-switched networks with
virtual channels and is realized by an intelligent
hardware unit that manages the virtual channel
reservation process at system runtime. Virtual Networks
can be established and administrated independent of
each other, enabling distributed communication resource
management. The proposed concept is implemented as a
cycle-accurate SystemC simulation model. The simulation
results of executing communicating graphs obtained from
real application highlight the usefulness of Virtual
Networks by showing improved throughput and reduced
delay in the respective scenarios. A hardware
implementation demonstrates a low impact on area
utilization and power consumption.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "8",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Ganegedara:2013:CPA,
author = "Thilan Ganegedara and Viktor Prasanna",
title = "A comprehensive performance analysis of virtual
routers on {FPGA}",
journal = j-TRETS,
volume = "6",
number = "2",
pages = "9:1--9:??",
month = jul,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2492187",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:43 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "Network virtualization has gained much popularity with
the advent of datacenter networking. The hardware
aspect of network virtualization, router
virtualization, allows network service providers to
consolidate network hardware, reducing equipment cost
and management overhead. Several approaches have been
proposed to achieve router virtualization to support
several virtual networks on a single hardware platform.
However, their performance has not been analyzed
quantitatively to understand the benefits of each
approach. In this work, we perform a comprehensive
analysis of performance of these approaches on Field
Programmable Gate Array (FPGA) with respect to memory
consumption, throughput, and power consumption.
Generalized versions of virtualization approaches are
evaluated based on post place-and-route results on a
state-of-the-art FPGA. Grouping of routing tables is
proposed as a novel approach to improve scalability
(i.e., the number of virtual networks hosted on a
single chip) of virtual routers on FPGA with respect to
memory requirement. Further, we employ floor-planning
techniques to efficiently utilize chip resources and
achieve high performance for virtualized, pipelined
architectures, resulting in 1.6$ \times $ speedup on
the average compared with the non-floor-planned
approach. The results indicate that the proposed
solution is able to support 100+ and 50 virtual routers
per chip in the near-best and near-worst case
scenarios, while operating at 20+ Gbps rates.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "9",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Das:2013:TDA,
author = "Joydip Das and Steven J. E. Wilton",
title = "Towards development of an analytical model relating
{FPGA} architecture parameters to routability",
journal = j-TRETS,
volume = "6",
number = "2",
pages = "10:1--10:??",
month = jul,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2499625.2499627",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:43 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "We present an analytical model relating FPGA
architectural parameters to the routability of the
FPGA. The inputs to the model include the channel width
and the connection and the switch block flexibilities.
The output is an estimate of the proportion of nets in
a large circuit that can be expected to be successfully
routed on the FPGA. We assume that the circuit is
routed to the FPGA using a single-step combined
global/detailed router. We show that the model
correctly predicts routability trends. We also present
an example application to demonstrate that this model
may be a valuable tool for FPGA architects. When
combined with the earlier works on analytical modeling,
our model can be used to quickly predict the
routability without going through any stage of an
expensive CAD flow. We envisage that this model will
benefit FPGA architecture designers and vendors to
quickly evaluate FPGA routing fabrics.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "10",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Huang:2013:VHS,
author = "Chun-Hsian Huang and Pao-Ann Hsiung",
title = "Virtualizable Hardware\slash Software Design
Infrastructure for Dynamically Partially Reconfigurable
Systems",
journal = j-TRETS,
volume = "6",
number = "2",
pages = "11:1--11:??",
month = jul,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2499625.2499628",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:43 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "In most existing works, reconfigurable hardware
modules are still managed as conventional hardware
devices. Further, the software reconfiguration overhead
incurred by loading corresponding device drivers into
the kernel of an operating system has been overlooked
until now. As a result, the enhancement of system
performance and the utilization of reconfigurable
hardware modules are still quite limited. This work
proposes a virtualizable hardware/software design
infrastructure (VDI) for dynamically partially
reconfigurable systems. Besides the gate-level hardware
virtualization provided by the partial reconfiguration
technology, VDI supports the device-level hardware
virtualization. In VDI, a reconfigurable hardware
module can be virtualized such that it can be accessed
efficiently by multiple applications in an interleaving
way. A Hot-Plugin Connector (HPC) replaces the
conventional device driver, such that it not only
assists the device-level hardware virtualization but
can also be reused across different hardware modules.
To facilitate hardware/software communication and to
enhance system scalability, the proposed VDI is
realized as a hierarchical design framework.
User-designed reconfigurable hardware modules can be
easily integrated into VDI, and are then executed as
hardware tasks in an operating system for
reconfigurable systems (OS4RS). A dynamically partially
reconfigurable network security system was designed
using VDI, which demonstrated a higher utilization of
reconfigurable hardware modules and a reduction by up
to 12.83\% of the processing time required by using the
conventional method in a dynamically partially
reconfigurable system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "11",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Liu:2013:INL,
author = "Hanyu Liu and Senthilkumar T. Rajavel and Ali Akoglu",
title = "Integration of Net-Length Factor with Timing- and
Routability-Driven Clustering Algorithms",
journal = j-TRETS,
volume = "6",
number = "3",
pages = "12:1--12:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2517324",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:45 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "In FPGA CAD flow, the clustering stage builds the
foundation for placement and routing stages and affects
performance parameters, such as routability, delay, and
channel width significantly. Net sharing and
criticality are the two most commonly used factors in
clustering cost functions. With this study, we first
derive a third term, net-length factor, and then design
a generic method for integrating net length into the
clustering algorithms. Net-length factor enables
characterizing the nets based on the routing stress
they might cause during later stages of the CAD flow
and is essential for enhancing the routability of the
design. We evaluate the effectiveness of integrating
net length as a factor into the well-known timing
(T-VPack)-, depopulation (T-NDPack)-, and routability
(iRAC and T-RPack)-driven clustering algorithms.
Through exhaustive experimental studies, we show that
net-length factor consistently helps improve the
channel-width performance of routability-,
depopulation-, and timing-driven clustering algorithms
that do not explicitly target low fan-out nets in their
cost functions. Particularly, net-length factor leads
to average reduction in channel width for T-VPack,
T-RPack, and T-NDPack by 11.6\%, 10.8\%, and 14.2\%,
respectively, and in a majority of the cases, improves
the critical-path delay without increasing the array
size.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "12",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Mehta:2013:UGE,
author = "Gayatri Mehta and Carson Crawford and Xiaozhong Luo
and Natalie Parde and Krunalkumar Patel and Brandon
Rodgers and Anil Kumar Sistla and Anil Yadav and Marc
Reisner",
title = "{UNTANGLED}: a Game Environment for Discovery of
Creative Mapping Strategies",
journal = j-TRETS,
volume = "6",
number = "3",
pages = "13:1--13:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2517325",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:45 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "The problem of creating efficient mappings of dataflow
graphs onto specific architectures (i.e., solving the
place and route problem) is incredibly challenging. The
difficulty is especially acute in the area of
Coarse-Grained Reconfigurable Architectures (CGRAs) to
the extent that solving the mapping problem may remove
a significant bottleneck to adoption. We believe that
the next generation of mapping algorithms will exhibit
pattern recognition, the ability to learn from
experience, and identification of creative solutions,
all of which are human characteristics. This manuscript
describes our game UNTANGLED, developed and fine-tuned
over the course of a year to allow us to capture and
analyze human mapping strategies. It also describes our
results to date. We find that the mapping problem can
be crowdsourced very effectively, that players can
outperform existing algorithms, and that successful
player strategies share many elements in common. Based
on our observations and analysis, we make concrete
recommendations for future research directions for
mapping onto CGRAs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "13",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Hormigo:2013:SRC,
author = "Javier Hormigo and Gabriel Caffarena and Juan P.
Oliver and Eduardo Boemo",
title = "Self-Reconfigurable Constant Multiplier for {FPGA}",
journal = j-TRETS,
volume = "6",
number = "3",
pages = "14:1--14:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2490830",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:45 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Constant multipliers are widely used in signal
processing applications to implement the multiplication
of signals by a constant coefficient. However, in some
applications, this coefficient remains invariable only
during an interval of time, and then, its value changes
to adapt to new circumstances. In this article, we
present a self-reconfigurable constant multiplier
suitable for LUT-based FPGAs able to reload the
constant in runtime. The pipelined architecture
presented is easily scalable to any multiplicand and
constant sizes, for unsigned and signed
representations. It can be reprogrammed in 16 clock
cycles, equivalent to less than 100 ns in current
FPGAs. This value is significantly smaller than FPGA
partial configuration times. The presented approach is
more efficient in terms of area and speed when compared
to generic multipliers, achieving up to 91\% area
reduction and up to 102\% speed improvement for the
case-study circuits tested. The power consumption of
the proposed multipliers is in the range of that of
slice-based multipliers provided by the vendor.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "14",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Gharibian:2013:ASL,
author = "Farnaz Gharibian and Lesley Shannon and Peter Jamieson
and Kevin Chung",
title = "Analyzing System-Level Information's Correlation to
{FPGA} Placement",
journal = j-TRETS,
volume = "6",
number = "3",
pages = "15:1--15:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2501985",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:45 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "One popular placement algorithm for
Field-Programmable Gate Arrays (FPGAs) is called
Simulated Annealing (SA). This algorithm tries to
create a good quality placement from a flattened design
that no longer contains any high-level information
related to the original design hierarchy. Placement is
an NP-hard problem, and as the size and complexity of
designs implemented on FPGAs increases, SA does not
scale well to find good solutions in a timely fashion.
In this article, we investigate if system-level
information can be reconstructed from a flattened
netlist and evaluate how that information is realized
in terms of its locality in the final placement. If
there is a strong relationship between good quality
placements and system-level information, then it may be
possible to divide a large design into smaller
components and improve the time needed to create a good
quality placement. Our preliminary results suggest that
the locality property of the information embedded in
the system-level HDL structure (i.e. ``module'',
``always'', and ``if'' statements) is greatly affected
by designer HDL coding style. Therefore, a
reconstructive algorithm, called Affinity Propagation,
is also considered as a possible method of generating a
meaningful coarse-grain picture of the design.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "15",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Plavec:2013:ETD,
author = "Franjo Plavec and Zvonko Vranesic and Stephen Brown",
title = "Exploiting Task- and Data-Level Parallelism in
Streaming Applications Implemented in {FPGAs}",
journal = j-TRETS,
volume = "6",
number = "4",
pages = "16:1--16:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2535932",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:46 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "This article describes the design and implementation
of a novel compilation flow that implements circuits in
FPGAs from a streaming programming language. The
streaming language supported is called FPGA Brook and
is based on the existing Brook language. It allows
system designers to express applications in a way that
exposes parallelism, which can be exploited through
hardware implementation. FPGA Brook supports
replication, allowing parts of an application to be
implemented as multiple hardware units operating in
parallel. Hardware units are interconnected through
FIFO buffers which use the small memory modules
available in FPGAs. The FPGA Brook automated design
flow uses a source-to-source compiler, developed as a
part of this work, and combines it with a commercial
behavioral synthesis tool to generate the hardware
implementation. A suite of benchmark applications was
developed in FPGA Brook and implemented using our
design flow. Experimental results indicate that
performance of many applications scales well with
replication. Our benchmark applications also achieve
significantly better results than corresponding
implementations using a commercial behavioral synthesis
tool. We conclude that using an automated design flow
for implementation of streaming applications in FPGAs
is a promising methodology.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "16",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Ananthan:2013:RPH,
author = "T. Ananthan and M. V. Vaidyan",
title = "A Reconfigurable Parallel Hardware Implementation of
the Self-Tuning Regulator",
journal = j-TRETS,
volume = "6",
number = "4",
pages = "17:1--17:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2535934",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:46 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "The self-tuning regulator (STR) is a popular adaptive
control algorithm. A high-performance computer is
required for its implementation due to the heavy online
computational burden. To extend STR for more real-time
applications, a parallel hardware implementation on a
low-cost reconfigurable computer is presented. The
hardware was incorporated with multistage matrix
multiplication (MMM) and trace technique to enhance the
processing speed. This design was deeply pipelined to
achieve high throughput. The algorithm was prototyped
on a Xilinx field-programmable gate array (FPGA) device
with a maximum operating frequency of 210.436 MHz.
Application-specific integrated circuit (ASIC)
implementation of STR was reported.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "17",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Leow:2013:AME,
author = "Yoon Kah Leow and Ali Akoglu and Susan Lysecky",
title = "An Analytical Model for Evaluating Static Power of
Homogeneous {FPGA} Architectures",
journal = j-TRETS,
volume = "6",
number = "4",
pages = "18:1--18:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2535935",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:46 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "As capacity of the field-programmable gate arrays
(FPGAs) continues to increase, power dissipated in the
logic and routing resources has become a critical
concern for FPGA architects. Recent studies have shown
that static power is fast approaching the dynamic power
in submicron devices. In this article, we propose an
analytical model for relating homogeneous
island-style-based FPGA architecture to static power.
Current FPGA power models are tightly coupled with CAD
tools. Our CAD-independent model captures the static
power for a given FPGA architecture based on estimates
of routing and logic resource utilizations from a
pre-technology mapped netlist. We observe an average
correlation ratio (C-Ratio) of 95\% and a minimum
absolute percentage error (MAPE) rate of 15\% with
respect to the experimental results generated by the
Versatile Placement Routing (VPR) tool over the MCNC
benchmarks. Our model offers application engineers and
FPGA architects the capability to evaluate the impact
of their design choices on static power without having
to go through CAD-intensive investigations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "18",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Ben-Asher:2013:OWS,
author = "Yosi Ben-Asher and Ron Meldiner and Nadav Rotem",
title = "Optimizing Wait States in the Synthesis of Memory
References with Unpredictable Latencies",
journal = j-TRETS,
volume = "6",
number = "4",
pages = "19:1--19:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2535936",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:46 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "We consider the problem of synthesizing circuits (from
C to Verilog) that are optimized to handle
unpredictable latencies of memory operations.
Unpredictable memory latencies can occur due to the use
of on chip caches, DRAM memory modules, buffers/queues,
or multiport memories. Typically, high-level synthesis
compilers assume fixed and known memory latencies, and
thus are able to schedule the code's operations
efficiently. The operations in the source code are
scheduled into states of a state machine whose states
will be synthesized to Verilog. The goal is to minimize
scheduling length by maximizing the number of
operations (and in particular memory operations) that
are executed in parallel at the same state. However,
with unpredictable latencies, there can be an
exponential number of possible orders in which these
parallel memory operations can terminate. Thus, in
order to minimize the scheduling, we need a different
schedule for any such order. This is not practical, and
we show a technique of synthesizing a compact state
machine that schedules only a small subset of these
possible termination orders. Our results show that this
compact state machine can improve the execution time
compared to a regular scheduling that waits for the
termination of all the active memory references in
every state.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "19",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Kornaros:2014:DPT,
author = "George Kornaros and Dionisios Pnevmatikatos",
title = "Dynamic Power and Thermal Management of {NoC-Based}
Heterogeneous {MPSoCs}",
journal = j-TRETS,
volume = "7",
number = "1",
pages = "1:1--1:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2567658",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:47 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Advances in silicon process technology have made it
possible to include multiple processor cores on a
single die. Billion transistor architectures usually in
the form of networks-on-chip present a wide range of
challenges in design, microarchitecture, and
algorithmic levels with significant impact to system
performance and power consumption. In this article, we
propose efficient methods and mechanisms that exploit a
heterogeneous network-on-chip (NoC) to achieve a power-
and thermal-aware coherent system. To this end, we
utilize different management techniques which employ
dynamic frequency scaling circuitry and power and
temperature sensors per node to achieve real-time
workload prediction and allocation at node and system
level by low-cost threads. The developed heterogeneous
multicoprocessing infrastructure is utilized to
evaluate diverse policies for power-aware computing in
terms of effectiveness and in relation to distributed
sensor-conscious management. The proposed
reconfigurable architecture supports coprocessor
accelerators per node, monitors the program's power
profile on-the-fly, and balances power and thermal
behavior at the NoC level. Overall, these techniques
form a system exploration methodology using a
multi-FPGA emulation platform showing a minimum
complexity overhead.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "1",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Iskander:2014:HLA,
author = "Yousef Iskander and Cameron Patterson and Stephen
Craven",
title = "High-Level Abstractions and Modular Debugging for
{FPGA} Design Validation",
journal = j-TRETS,
volume = "7",
number = "1",
pages = "2:1--2:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2567662",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:47 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Design validation is the most time-consuming task in
the FPGA design cycle. Although manufacturers and
third-party vendors offer a range of tools that provide
visibility and control of the different stages of a
design, many require that the design be fully
re-implemented for even simple parameter modifications
or do not allow the design to be run at full speed.
Designs are typically first modeled using a high-level
language then later rewritten in a hardware description
language, first for simulation and then later modified
for synthesis. IP and third-party cores may differ
during these final two stages complicating development
and validation. The developed approach provides two
means of directly validating synthesized hardware
designs. The first allows the original high-level model
written in C or C++ to be directly coupled to the
synthesized hardware, abstracting away the traditional
gate-level view of designs. A high-level programmatic
interface allows the synthesized design to be validated
directly by the software reference model. The second
approach provides an alternative view to FPGAs within
the scope of a traditional software debugger. This
debug framework leverages partially reconfigurable
regions to accelerate the modification of dynamic,
software-like breakpoints for low-level analysis and
provides an automatable, scriptable, command-line
interface directly to a running design on an FPGA.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "2",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Jin:2014:FAS,
author = "Minxi Jin and Tsutomu Maruyama",
title = "Fast and Accurate Stereo Vision System on {FPGA}",
journal = j-TRETS,
volume = "7",
number = "1",
pages = "3:1--3:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2567659",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:47 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "In this article, we present a fast and high quality
stereo matching algorithm on FPGA using cost
aggregation (CA) and fast locally consistent (FLC)
dense stereo. In many software programs, global
matching algorithms are used in order to obtain
accurate disparity maps. Although their error rates are
considerably low, their processing speeds are far from
that required for real-time processing because of their
complex processing sequences. In order to realize
real-time processing, many hardware systems have been
proposed to date. They have achieved considerably high
processing speeds; however, their error rates are not
as good as those of software programs, because simple
local matching algorithms have been widely used in
those systems. In our system, sophisticated local
matching algorithms (CA and FLC) that are suitable for
FPGA implementation are used to achieve low error rate
while maintaining the high processing speed. We
evaluate the performance of our circuit on Xilinx
Virtex-6 FPGAs. Its error rate is comparable to that of
top-level software algorithms, and its processing speed
is nearly 2 clock cycles per pixel, which reaches 507.9
fps for 640 $ \times $ 480 pixel images.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "3",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Ulusel:2014:FDE,
author = "Onur Ulusel and Kumud Nepal and R. Iris Bahar and
Sherief Reda",
title = "Fast Design Exploration for Performance, Power and
Accuracy Tradeoffs in {FPGA-Based} Accelerators",
journal = j-TRETS,
volume = "7",
number = "1",
pages = "4:1--4:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2567661",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:47 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "The ease-of-use and reconfigurability of FPGAs makes
them an attractive platform for accelerating
algorithms. However, accelerating becomes a challenging
task as the large number of possible design parameters
lead to different accelerator variants. In this
article, we propose techniques for fast design
exploration and multi-objective optimization to quickly
identify both algorithmic and hardware parameters that
optimize these accelerators. This information is used
to run regression analysis and train mathematical
models within a nonlinear optimization framework to
identify the optimal algorithm and design parameters
under various objectives and constraints. To automate
and improve the model generation process, we propose
the use of L$_1$-regularized least squares regression
techniques. We implement two real-time image processing
accelerators as test cases: one for image deblurring
and one for block matching. For these designs, we
demonstrate that by sampling only a small fraction of
the design space (0.42\% and 1.1\%), our modeling
techniques are accurate within 2\%--4\% for area and
throughput, 8\%--9\% for power, and 5\%--6\% for
arithmetic accuracy. We show speedups of 340$ \times $
and 90$ \times $ in time for the test cases compared to
brute-force enumeration. We also identify the optimal
set of parameters for a number of scenarios (e.g.,
minimizing power under arithmetic inaccuracy bounds).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "4",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Kim:2014:FPF,
author = "Lok-Won Kim and Sameh Asaad and Ralph Linsker",
title = "A Fully Pipelined {FPGA} Architecture of a Factored
Restricted {Boltzmann} Machine Artificial Neural
Network",
journal = j-TRETS,
volume = "7",
number = "1",
pages = "5:1--5:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2539125",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Mar 13 08:09:47 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Artificial neural networks (ANNs) are a natural target
for hardware acceleration by FPGAs and GPGPUs because
commercial-scale applications can require days to weeks
to train using CPUs, and the algorithms are highly
parallelizable. Previous work on FPGAs has shown how
hardware parallelism can be used to accelerate a
``Restricted Boltzmann Machine'' (RBM) ANN algorithm,
and how to distribute computation across multiple
FPGAs. Here we describe a fully pipelined parallel
architecture that exploits ``mini-batch'' training
(combining many input cases to compute each set of
weight updates) to further accelerate ANN training. We
implement on an FPGA, for the first time to our
knowledge, a more powerful variant of the basic RBM,
the ``Factored RBM'' (fRBM). The fRBM has proved
valuable in learning transformations and in discovering
features that are present across multiple types of
input. We obtain (in simulation) a 100-fold
acceleration (vs. CPU software) for an fRBM having N =
256 units in each of its four groups (two input, one
output, one intermediate group of units) running on a
Virtex-6 LX760 FPGA. Many of the architectural features
we implement are applicable not only to fRBMs, but to
basic RBMs and other ANN algorithms more broadly.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "5",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Luu:2014:VNG,
author = "Jason Luu and Jeffrey Goeders and Michael Wainberg and
Andrew Somerville and Thien Yu and Konstantin
Nasartschuk and Miad Nasr and Sen Wang and Tim Liu and
Nooruddin Ahmed and Kenneth B. Kent and Jason Anderson
and Jonathan Rose and Vaughn Betz",
title = "{VTR 7.0}: Next Generation Architecture and {CAD}
System for {FPGAs}",
journal = j-TRETS,
volume = "7",
number = "2",
pages = "6:1--6:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2617593",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 30 18:26:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Exploring architectures for large, modern FPGAs
requires sophisticated software that can model and
target hypothetical devices. Furthermore, research into
new CAD algorithms often requires a complete and open
source baseline CAD flow. This article describes recent
advances in the open source Verilog-to-Routing (VTR)
CAD flow that enable further research in these areas.
VTR now supports designs with multiple clocks in both
timing analysis and optimization. Hard adder/carry
logic can be included in an architecture in various
ways and significantly improves the performance of
arithmetic circuits. The flow now models energy
consumption, an increasingly important concern. The
speed and quality of the packing algorithms have been
significantly improved. VTR can now generate a netlist
of the final post-routed circuit which enables detailed
simulation of a design for a variety of purposes. We
also release new FPGA architecture files and models
that are much closer to modern commercial
architectures, enabling more realistic experiments.
Finally, we show that while this version of VTR
supports new and complex features, it has a 1.5$ \times
$ compile time speed-up for simple architectures and a
6$ \times $ speed-up for complex architectures compared
to the previous release, with no degradation to timing
or wire-length quality.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "6",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{J:2014:MAN,
author = "Soumya J. and Ashish Sharma and Santanu
Chattopadhyay",
title = "Multi-Application Network-on-Chip Design using Global
Mapping and Local Reconfiguration",
journal = j-TRETS,
volume = "7",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2556944",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 30 18:26:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "This article proposes a reconfigurable Network-on-Chip
(NoC) architecture based on mesh topology. It provides
a local reconfiguration of cores to connect to any of
the neighboring routers, depending upon the currently
executing application. The area overhead for this local
reconfiguration has been shown to be very small. We
have also presented the strategy to map the cores of an
application set onto this architecture. This has been
achieved via a two-phase procedure. In the first phase,
the cores of the combined application set are mapped
tentatively to individual routers, minimizing the
communication cost. In the second phase, for each
application, positions of individual cores are
finalized. A core gets attached to any neighbor of its
tentative allocation. We have proposed Integer Linear
Programming (ILP) formulation of both the phases. Since
ILP takes large amount of CPU time, we have also
formulated a Particle Swarm Optimization (PSO)-based
solution for the two phases. A heuristic approach has
also been developed for the reconfiguration. Comparison
of communication cost, latency and network energy has
been carried out for the applications, before and after
reconfiguration. It shows significant improvement in
performance via reconfiguration.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "7",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Lei:2014:FIS,
author = "Yuanwu Lei and Lei Guo and Yong Dou and Sheng Ma and
Jinbo Xu",
title = "{FPGA} Implementation of a Special-Purpose {VLIW}
Structure for Double-Precision Elementary Function",
journal = j-TRETS,
volume = "7",
number = "2",
pages = "8:1--8:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2617594",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 30 18:26:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "In the current article, the capability and flexibility
of field programmable gate-arrays (FPGAs) to implement
IEEE-754 double-precision floating-point elementary
functions are explored. To perform various elementary
functions on the unified hardware efficiently, we
propose a special-purpose very long instruction word
(VLIW) processor, called DP\_VELP. This processor is
equipped with multiple basic units, and its performance
is improved through an explicitly parallel technique.
Pipelined evaluation of polynomial approximation with
Estrin's scheme is proposed, by scheduling basic
components in an optimal order to avoid data hazard
stalls and achieve minimal latency. The custom VLIW
processor can achieve high scalability. Under the
control of specific VLIW instructions, the basic units
are combined into special-purpose hardware for
elementary functions. Common elementary functions are
presented as examples to illustrate the design of
elementary function in DP\_VELP in detail. Minimax
approximation scheme is used to reduce degree of
polynomial. Compromise between the size of lookup table
and the latency is discussed, and the internal
precision is carefully planned to guarantee accuracy of
the result. Finally, we create a prototype of the
DP\_VELP unit and an FPGA accelerator based on the
DP\_VELP unit on a Xilinx XC6VLX760 FPGA chip to
implement the SGP4/SDP4 application. Compared with
previous researches, the proposed design can achieve
low latency with a reasonable amount of resources and
evaluate a variety of elementary functions with the
unified hardware to satisfy the demands in scientific
applications. Experimental results show that the
proposed design guarantees more than 99\% of correct
rounding. Moreover, the SGP4/SDP4 accelerator, which is
equipped with 39 DP\_VELP units and runs at 200 MHz,
outperforms the parallel software approach with
hyper-thread technology on an Intel Xeon Quad E5620 CPU
at 2.40 GHz by a factor of 7X.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "8",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Clemente:2014:MSA,
author = "Juan Antonio Clemente and Ivan Beretta and Vincenzo
Rana and David Atienza and Donatella Sciuto",
title = "A Mapping-Scheduling Algorithm for Hardware
Acceleration on Reconfigurable Platforms",
journal = j-TRETS,
volume = "7",
number = "2",
pages = "9:1--9:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2611562",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 30 18:26:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Reconfigurable platforms are a promising technology
that offers an interesting trade-off between
flexibility and performance, which many recent embedded
system applications demand, especially in fields such
as multimedia processing. These applications typically
involve multiple ad-hoc tasks for hardware
acceleration, which are usually represented using
formalisms such as Data Flow Diagrams (DFDs), Data Flow
Graphs (DFGs), Control and Data Flow Graphs (CDFGs) or
Petri Nets. However, none of these models is able to
capture at the same time the pipeline behavior between
tasks (that therefore can coexist in order to minimize
the application execution time), their communication
patterns, and their data dependencies. This article
proves that the knowledge of all this information can
be effectively exploited to reduce the resource
requirements and the timing performance of modern
reconfigurable systems, where a set of hardware
accelerators is used to support the computation. For
this purpose, this article proposes a novel task
representation model, named Temporal Constrained Data
Flow Diagram (TCDFD), which includes all this
information. This article also presents a
mapping-scheduling algorithm that is able to take
advantage of the new TCDFD model. It aims at minimizing
the dynamic reconfiguration overhead while meeting the
communication requirements among the tasks.
Experimental results show that the presented approach
achieves up to 75\% of resources saving and up to 89\%
of reconfiguration overhead reduction with respect to
other state-of-the-art techniques for reconfigurable
platforms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "9",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Hoang:2014:IMD,
author = "Anh-Tuan Hoang and Takeshi Fujino",
title = "Intra-Masking Dual-Rail Memory on {LUT} Implementation
for {SCA}-Resistant {AES} on {FPGA}",
journal = j-TRETS,
volume = "7",
number = "2",
pages = "10:1--10:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2617595",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 30 18:26:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "In current countermeasure design trends against
differential power analysis (DPA), security at gate
level is required in addition to the security
algorithm. Several dual-rail pre-charge logics (DPL)
have been proposed to achieve this goal. Designs using
ASIC can attain this goal owing to its backend design
restrictions on placement and routing. However,
implementing these designs on field programmable gate
arrays (FPGA) without information leakage is still a
problem because of the difficulty involved in the
restrictions on placement and routing on FPGA. This
article describes our novel masked dual-rail
pre-charged memory approach, called `intra-masking
dual-rail memory (IMDRM) on LUT', and its
implementation on FPGA for Side-Channel
Attack-resistant (SCA-resistant) AES. In the proposed
design, all unsafe nodes, such as unmasking and
masking, and parts of dual-rail memory with unsafe
buses (buses that are not masked) are packed into a
single LUT. This makes them balanced and independent of
the placement and routing tools. Inputs and outputs of
all LUTs are masked, and so can be considered safe
signals. Several LUTs can be combined to create a safe
SBox. The design is independent of the cryptographic
algorithm, and hence, it can be applied to available
cryptographic standards such as DES or AES as well as
future standards. It requires no special placement or
route constraints in its implementation. A correlation
power analysis (CPA) attack on 1,000,000 traces of AES
implementation on FPGA showed that the secret
information is well protected against first-order
side-channel attacks. Even though the number of LUTs
used for memory in this implementation is seven times
greater than that of the conventional unprotected
single-rail memory table-lookup AES and three times
greater than the implementation based on a composite
field, it requires a smaller number of LUTs than all
other advanced SCA-resistant implementations such as
the wave dynamic differential logic, masked dual-rail
pre-charge logic, and threshold.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "10",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Becker:2014:ITS,
author = "Tobias Becker",
title = "Introduction to the {TRETS} Special Section on the
{Workshop on Self-Awareness in Reconfigurable Computing
Systems (SRCS'12)}",
journal = j-TRETS,
volume = "7",
number = "2",
pages = "11:1--11:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2611564",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 30 18:26:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "11",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Panerati:2014:CIL,
author = "Jacopo Panerati and Martina Maggio and Matteo
Carminati and Filippo Sironi and Marco Triverio and
Marco D. Santambrogio",
title = "Coordination of Independent Loops in Self-Adaptive
Systems",
journal = j-TRETS,
volume = "7",
number = "2",
pages = "12:1--12:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2611563",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 30 18:26:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Nowadays, the same piece of code should run on
different architectures, providing performance
guarantees in a variety of environments and situations.
To this end, designers often integrate existing systems
with ad-hoc adaptive strategies able to tune specific
parameters that impact performance or energy---for
example, frequency scaling. However, these strategies
interfere with one another and unpredictable
performance degradation may occur due to the
interaction between different entities. In this
article, we propose a software approach to
reconfiguration when different strategies, called
loops, are encapsulated in the system and are available
to be activated. Our solution to loop coordination is
based on machine learning and it selects a policy for
the activation of loops inside of a system without
prior knowledge. We implemented our solution on top of
GNU/Linux and evaluated it with a significant subset of
the PARSEC benchmark suite.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "12",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Agne:2014:SAM,
author = "Andreas Agne and Markus Happe and Achim L{\"o}sch and
Christian Plessl and Marco Platzner",
title = "Self-Awareness as a Model for Designing and Operating
Heterogeneous Multicores",
journal = j-TRETS,
volume = "7",
number = "2",
pages = "13:1--13:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2617596",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 30 18:26:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Self-aware computing is a paradigm for structuring and
simplifying the design and operation of computing
systems that face unprecedented levels of system
dynamics and thus require novel forms of adaptivity.
The generality of the paradigm makes it applicable to
many types of computing systems and, previously,
researchers started to introduce concepts of
self-awareness to multicore architectures. In our work
we build on a recent reference architectural framework
as a model for self-aware computing and instantiate it
for an FPGA-based heterogeneous multicore running the
ReconOS reconfigurable architecture and operating
system. After presenting the model for self-aware
computing and ReconOS, we demonstrate with a case study
how a multicore application built on the principle of
self-awareness, autonomously adapts to changes in the
workload and system state. Our work shows that the
reference architectural framework as a model for
self-aware computing can be practically applied and
allows us to structure and simplify the design process,
which is essential for designing complex future
computing systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "13",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Beckhoff:2014:DTI,
author = "Christian Beckhoff and Dirk Koch and Jim Torresen",
title = "Design Tools for Implementing Self-Aware and
Fault-Tolerant Systems on {FPGAs}",
journal = j-TRETS,
volume = "7",
number = "2",
pages = "14:1--14:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2617597",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 30 18:26:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "To fully exploit the capabilities of runtime
reconfigurable FPGAs in self-aware systems, design
tools are required that exceed the capabilities of
present vendor design tools. Such tools must allow the
implementation of scalable reconfigurable systems with
various different partial modules that might be loaded
to different positions of the device at runtime. This
comprises several complex tasks, including
floorplanning, communication architecture synthesis,
physical constraints generation, physical
implementation, and timing verification all the way
down to the final bitstream generation. In this
article, we present how our GoAhead framework helps in
implementing self-aware systems on FPGAs with a minimum
of user interaction.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "14",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Niu:2014:SAT,
author = "Xinyu Niu and Qiwei Jin and Wayne Luk and Stephen
Weston",
title = "A Self-Aware Tuning and Self-Aware Evaluation Method
for Finite-Difference Applications in Reconfigurable
Systems",
journal = j-TRETS,
volume = "7",
number = "2",
pages = "15:1--15:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2617598",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jun 30 18:26:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Finite-difference methods are computationally
intensive and required by many applications. Parameters
of a finite-difference algorithm, such as grid size,
can be varied to generate design space which contains
algorithm instances with different constant
coefficients. An algorithm instance with specific
coefficients can either be mapped into general
operators to construct static designs, or be
implemented as constant-specific operators to form
dynamic designs, which require runtime reconfiguration
to update algorithm coefficients. This article proposes
a tuning method to explore the design space to optimise
both the static and the dynamic designs, and an
evaluation method to select the design with maximum
overall throughput, based on algorithm characteristics,
design properties, available resources and runtime data
size. For benchmark applications option pricing and
Reverse-Time Migration (RTM), over 50\% reduction in
resource consumption has been achieved for both static
designs and dynamic designs, while meeting precision
requirements. For a single hardware implementation, the
RTM design optimised with the proposed approach is
expected to run 1.8 times faster than the best
published design. The tuned static designs run
thousands of times faster than the dynamic designs for
algorithms with small data size, while the tuned
dynamic designs achieve up to 5.9 times speedup over
the corresponding static designs for large-scale
finite-difference algorithms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "15",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Laforest:2014:CMP,
author = "Charles Eric LaForest and Zimo Li and Tristan O'Rourke
and Ming G. Liu and J. Gregory Steffan",
title = "Composing Multi-Ported Memories on {FPGAs}",
journal = j-TRETS,
volume = "7",
number = "3",
pages = "16:1--16:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629629",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Sep 1 10:42:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Multi-ported memories are challenging to implement on
FPGAs since the block RAMs included in the fabric
typically have only two ports. Hence we must construct
memories requiring more than two ports, either out of
logic elements or by combining multiple block RAMs. We
present a thorough exploration and evaluation of the
design space of FPGA-based soft multi-ported memories
for conventional solutions, and also for the recently
proposed Live Value Table (LVT) [LaForest and Steffan
2010] and XOR [LaForest et al. 2012] approaches to
unidirectional port memories, reporting results for
both Altera and Xilinx FPGAs. Additionally, we
thoroughly evaluate and compare with a recent LVT-based
approach to bidirectional port memories [Choi et al.
2012].",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "16",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Peng:2014:BAH,
author = "Yuanxi Peng and Manuel Salda{\~n}a and Christopher A.
Madill and Xiaofeng Zou and Paul Chow",
title = "Benefits of Adding Hardware Support for Broadcast and
Reduce Operations in {MPSoC} Applications",
journal = j-TRETS,
volume = "7",
number = "3",
pages = "17:1--17:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629470",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Sep 1 10:42:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "MPI has been used as a parallel programming model for
supercomputers and clusters and recently in
MultiProcessor Systems-on-Chip (MPSoC). One component
of MPI is collective communication and its performance
is key for certain parallel applications to achieve
good speedups. Previous work showed that, with
synthetic communication-only benchmarks, communication
improvements of up to 11.4-fold and 22-fold for
broadcast and reduce operations, respectively, can be
achieved by providing hardware support at the network
level in a Network-on-Chip (NoC). However, these
numbers do not provide a good estimation of the
advantage for actual applications, as there are other
factors that affect performance besides communications,
such as computation. To this end, we extend our
previous work by evaluating the impact of hardware
support over a set of five parallel application kernels
of varying computation-to-communication ratios. By
introducing some useful computation to the performance
evaluation, we obtain more representative results of
the benefits of adding hardware support for broadcast
and reduce operations. The experiments show that
applications with lower computation-to-communication
ratios benefit the most from hardware support as they
highly depend on efficient collective communications to
achieve better scalability. We also extend our work by
doing more analysis on clock frequency, resource usage,
power, and energy. The results show reasonable
scalability for resource utilization and power in the
network interfaces as the number of channels increases
and that, even though more power is dissipated in the
network interfaces due to the added hardware, the total
energy used can still be less if the actual speedup is
sufficient. The application kernels are executed in a
24-embedded-processor system distributed across four
FPGAs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "17",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Anderson:2014:ISI,
author = "Jason Anderson and Kiyoung Choi",
title = "Introduction to the {Special Issue on the 11th
International Conference on Field-Programmable
Technology (FPT'12)}",
journal = j-TRETS,
volume = "7",
number = "3",
pages = "18:1--18:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2655712",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Sep 1 10:42:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "18",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Cheah:2014:IDB,
author = "Hui Yan Cheah and Fredrik Brosser and Suhaib A. Fahmy
and Douglas L. Maskell",
title = "The {iDEA DSP} Block-Based Soft Processor for
{FPGAs}",
journal = j-TRETS,
volume = "7",
number = "3",
pages = "19:1--19:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629443",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Sep 1 10:42:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "DSP blocks in modern FPGAs can be used for a wide
range of arithmetic functions, offering increased
performance while saving logic resources for other
uses. They have evolved to better support a plethora of
signal processing tasks, meaning that in other
application domains they may be underutilised. The
DSP48E1 primitives in new Xilinx devices support
dynamic programmability that can help extend their
usefulness; the specific function of a DSP block can be
modified on a cycle-by-cycle basis. However, the
standard synthesis flow does not leverage this
flexibility in the vast majority of cases. The lean DSP
Extension Architecture (iDEA) presented in this article
builds around the dynamic programmability of a single
DSP48E1 primitive, with minimal additional logic to
create a general-purpose processor supporting a full
instruction-set architecture. The result is a very
compact, fast processor that can execute a full gamut
of general machine instructions. We show a number of
simple applications compiled using an MIPS compiler and
translated to the iDEA instruction set, comparing with
a Xilinx MicroBlaze to show estimated performance
figures. Being based on the DSP48E1, this processor can
be deployed across next-generation Xilinx Artix-7,
Kintex-7, Virtex-7, and Zynq families.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "19",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Abdelfattah:2014:NCF,
author = "Mohamed S. Abdelfattah and Vaughn Betz",
title = "Networks-on-Chip for {FPGAs}: Hard, Soft or Mixed?",
journal = j-TRETS,
volume = "7",
number = "3",
pages = "20:1--20:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629442",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Sep 1 10:42:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "As FPGA capacity increases, a growing challenge is
connecting ever-more components with the current
low-level FPGA interconnect while keeping designers
productive and on-chip communication efficient. We
propose augmenting FPGAs with networks-on-chip (NoCs)
to simplify design, and we show that this can be done
while maintaining or even improving silicon efficiency.
We compare the area and speed efficiency of each NoC
component when implemented hard versus soft to explore
the space and inform our design choices. We then build
on this component-level analysis to architect hard NoCs
and integrate them into the FPGA fabric; these NoCs are
on average 20--23$ \times $ smaller and 5--6$ \times $
faster than soft NoCs. A 64-node hard NoC uses only
$ \sim $2\% of an FPGA's silicon area and metallization. We
introduce a new communication efficiency metric:
silicon area required per realized communication
bandwidth. Soft NoCs consume 4960 mm$^2$/TBps, but
hard NoCs are 84$ \times $ more efficient at 59
mm$^2$/TBps. Informed design can further reduce the area
overhead of NoCs to 23 mm$^2$/TBps, which is only 2.6$
\times $ less efficient than the simplest
point-to-point soft links (9 mm$^2$/TBps). Despite
this almost comparable efficiency, NoCs can switch data
across the entire FPGA while point-to-point links are
very limited in capability; therefore, hard NoCs are
expected to improve FPGA efficiency for more complex
styles of communication.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "20",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Chen:2014:GMA,
author = "Liang Chen and Tulika Mitra",
title = "Graph Minor Approach for Application Mapping on
{CGRAs}",
journal = j-TRETS,
volume = "7",
number = "3",
pages = "21:1--21:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2655242",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Sep 1 10:42:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Coarse-Grained Reconfigurable Arrays (CGRAs) exhibit
high performance, improved flexibility, low cost, and
power efficiency for various application domains.
Compute-intensive loop kernels, which are perfect
candidates to be executed on CGRAs, are usually mapped
through modified modulo scheduling algorithms. These
algorithms should be capable of performing both
placement and routing. We formalize the CGRA mapping
problem as a graph minor containment problem. We
essentially test whether the dataflow graph
representing the loop kernel is a minor of the modulo
routing resource graph representing the CGRA resources
and their interconnects. We design an exact graph minor
testing approach that exploits the unique properties of
both the dataflow graph and the routing resource graph
to significantly prune the search space. We introduce
additional heuristic strategies that drastically
improve the compilation time while still generating
optimal or near-optimal mapping solutions. Experimental
evaluation confirms the efficiency of our approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "21",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Kim:2014:USU,
author = "Changmoo Kim and Mookyoung Chung and Yeongon Cho and
Mario Konijnenburg and Soojung Ryu and Jeongwook Kim",
title = "{ULP-SRP}: Ultra Low-Power {Samsung} Reconfigurable
Processor for Biomedical Applications",
journal = j-TRETS,
volume = "7",
number = "3",
pages = "22:1--22:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629610",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Sep 1 10:42:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "The latest biomedical applications require low energy
consumption, high performance, and wide
energy-performance scalability to adapt to various
working environments. In this study, we present
ULP-SRP, an energy-efficient reconfigurable processor
for biomedical applications. ULP-SRP uses a
Coarse-Grained Reconfigurable Array (CGRA) for
high-performance data processing with low energy
consumption. We adopted a compact-size CGRA and
modified it to support dynamically switchable three
performance modes with fine-grained power gating in
order to further optimize the energy consumption. The
energy-performance scalability is also accomplished
with multiple performance modes and a Unified Memory
Architecture (UMA). Experimental results show that
ULP-SRP achieved 59\% energy reduction compared to
previous works. A technique of dynamic CGRA mode
changing gives 18.9\% energy reduction. ULP-SRP is a
good candidate for future mobile healthcare devices.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "22",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Voros:2014:ISI,
author = "Nikolaos Voros and Guy Gogniat",
title = "Introduction to the Special Issue on the {7th
International Workshop on Reconfigurable
Communication-centric Systems-on-Chip (ReCoSoC'12)}",
journal = j-TRETS,
volume = "7",
number = "3",
pages = "23:1--23:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2655710",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Sep 1 10:42:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "23",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Brugger:2014:RRF,
author = "Christian Brugger and Dominic Hillenbrand and Matthias
Balzer",
title = "{RIVER}: Reconfigurable Flow and Fabric for Real-Time
Signal Processing on {FPGAs}",
journal = j-TRETS,
volume = "7",
number = "3",
pages = "24:1--24:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2655238",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Sep 1 10:42:23 MDT 2014",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "For high-performance embedded hard-real-time systems,
ASICs and FPGAs hold advantages over general-purpose
processors and graphics accelerators (GPUs). However,
developing signal processing architectures from scratch
requires significant resources. Our design methodology
is based on sets of configurable building blocks that
provide storage, dataflow, computation, and control.
Based on our building blocks, we generate hundreds of
thousands of our dynamic streaming engine processors
that we call DSEs. We store our DSEs in a repository
that can be queried for (online) design space
exploration. From this repository, DSEs can be
downloaded and instantiated within milliseconds on
FPGAs. If a loss of flexibility can be tolerated then
ASIC implementations are feasible as well. In this
article we focus on FPGA implementations. Our DSEs vary
in cores, computational lanes, bitwidths, power
consumption, and frequency. To the best of our
knowledge we are the first to propose online design
space exploration based on repositories of precompiled
cores that are assembled of common building blocks. For
demonstration purposes we map algorithms for image
processing and financial mathematics to DSEs and
compare the performance to existing highly optimized
signal and graphics accelerators.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "24",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Itturiet:2014:APE,
  author          = "F{\'a}bio Itturiet and Gabriel Nazar and Ronaldo
                     Ferreira and {\'A}lvaro Moreira and Luigi Carro",
  title           = "Adaptive Parallelism Exploitation under Physical and
                     Real-Time Constraints for Resilient Systems",
  journal         = j-TRETS,
  volume          = "7",
  number          = "3",
  pages           = "25:1--25:??",
  month           = aug,
  year            = "2014",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2556943",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Mon Sep 1 10:42:23 MDT 2014",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "This article introduces the resilient adaptive
                     algebraic architecture that aims at adapting
                     parallelism exploitation of a matrix multiplication
                     algorithm in a time-deterministic fashion to reduce
                     power consumption while meeting real-time deadlines
                     present in most DSP-like applications. The proposed
                     architecture provides low-overhead error correction
                     capabilities relying on the hardware implementation of
                     the algorithm-based fault-tolerance method that is
                     executed concurrently with matrix multiplication,
                     providing efficient occupation of memory and power
                     resources. The Resilient Adaptive Algebraic
                     Architecture (RA$^3$) is evaluated using three
                     real-time industrial case studies from the telecom and
                     multimedia application domains to present the design
                     space exploration and the adaptation possibilities the
                     architecture offers to hardware designers. RA$^3$ is
                     compared in its performance and energy efficiency with
                     standard high-performance architectures, namely a GPU
                     and an out-of-order general-purpose processor. Finally,
                     we present the results of fault injection campaigns in
                     order to measure the architecture resilience to soft
                     errors.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "25",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Lam:2014:EFA,
  author          = "Siew-Kei Lam and Christopher T. Clarke and
                     Thambipillai Srikanthan",
  title           = "Exploiting {FPGA}-Aware Merging of Custom Instructions
                     for Runtime Reconfiguration",
  journal         = j-TRETS,
  volume          = "7",
  number          = "3",
  pages           = "26:1--26:??",
  month           = aug,
  year            = "2014",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2655240",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Mon Sep 1 10:42:23 MDT 2014",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "Runtime reconfiguration is a promising solution for
                     reducing hardware cost in embedded systems, without
                     compromising on performance. We present a framework
                     that aims to increase the performance benefits of
                     reconfigurable processors that support full or partial
                     runtime reconfiguration. The proposed framework
                     achieves this by: (1) providing a means for choosing
                     suitable custom instruction selection heuristics, (2)
                     leveraging FPGA-aware merging of custom instructions to
                     maximize the reconfigurable logic block utilization in
                     each configuration, and (3) incorporating a
                     hierarchical loop partitioning strategy to reduce
                     runtime reconfiguration overhead. We show that the
                     performance gain can be improved by employing suitable
                     custom instruction selection heuristics that, in turn,
                     depend on the reconfigurable resource constraints and
                     the merging factor (extent to which the selected custom
                     instructions can be merged). The hierarchical loop
                     partitioning strategy leads to an average performance
                     gain of over 31\% and 46\% for full and partial runtime
                     reconfiguration, respectively. Performance gain can be
                     further increased to over 52\% and 70\% for full and
                     partial runtime reconfiguration, respectively, by
                     exploiting FPGA-aware merging of custom instructions.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "26",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Guillet:2014:EUM,
  author          = "S{\'e}bastien Guillet and Florent de Lamotte and
                     Nicolas le Griguer and {\'E}ric Rutten and Guy Gogniat
                     and Jean-Philippe Diguet",
  title           = "Extending {UML\slash MARTE} to Support Discrete
                     Controller Synthesis, Application to Reconfigurable
                     Systems-on-Chip Modeling",
  journal         = j-TRETS,
  volume          = "7",
  number          = "3",
  pages           = "27:1--27:??",
  month           = aug,
  year            = "2014",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2629628",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Mon Sep 1 10:42:23 MDT 2014",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "This article presents the first framework to design
                     and synthesize a formal controller managing dynamic
                     reconfiguration, using a model-driven engineering
                     methodology based on an extension of UML/MARTE. The
                     implementation technique highlights the combination of
                     hard configuration constraints using weights (control
                     part)---ensured statically and fulfilled by the system
                     at runtime---and soft constraints (decision part) that,
                     given a set of correct and accessible configurations,
                     choose one of them. An application model of an image
                     processing application is presented, then transformed
                     and synthesized to be executed on a Xilinx platform to
                     show how the controller, executed on a Microblaze,
                     manages the hardware reconfigurations.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "27",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Butler:2015:HSH,
  author          = "Jon T. Butler and Tsutomu Sasao",
  title           = "High-Speed Hardware Partition Generation",
  journal         = j-TRETS,
  volume          = "7",
  number          = "4",
  pages           = "1:1--1:??",
  month           = jan,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2629472",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Fri Feb 13 07:24:19 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "We demonstrate circuits that generate set and integer
                     partitions on a set S of n objects at a rate of one per
                     clock. Partitions are ways to group elements of a set
                     together and have been extensively studied by
                     researchers in algorithm design and theory. We offer
                     two versions of a hardware set partition generator. In
                     the first, partitions are produced in lexicographical
                     order in response to successive clock pulses. In the
                     second, an index input determines the set partition
                     produced. Such circuits are useful in the hardware
                     implementation of the optimum distribution of tasks to
                     processors. We show circuits for integer partitions as
                     well. Our circuits are combinational. For large n, they
                     can have a large delay. However, one can easily
                     pipeline them to produce one partition per clock
                     period. We show (1) analytical and (2) experimental
                     time/complexity results that quantify the efficiency of
                     our designs. For example, our results show that a
                     hardware set partition generator running on a 100MHz
                     FPGA produces partitions at a rate that is
                     approximately 10 times the rate of a software
                     implementation on a processor running at 2.26GHz.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "1",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Paulino:2015:RAB,
  author          = "Nuno Paulino and Jo{\~a}o Canas Ferreira and Jo{\~a}o
                     M. P. Cardoso",
  title           = "A Reconfigurable Architecture for Binary Acceleration
                     of Loops with Memory Accesses",
  journal         = j-TRETS,
  volume          = "7",
  number          = "4",
  pages           = "2:1--2:??",
  month           = jan,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2629468",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Fri Feb 13 07:24:19 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "This article presents a reconfigurable
                     hardware/software architecture for binary acceleration
                     of embedded applications. A Reconfigurable Processing
                     Unit (RPU) is used as a coprocessor of the General
                     Purpose Processor (GPP) to accelerate the execution of
                     repetitive instruction sequences called Megablocks. A
                     toolchain detects Megablocks from instruction traces
                     and generates customized RPU implementations. The
                     implementation of Megablocks with memory accesses uses
                     a memory-sharing mechanism to support concurrent
                     accesses to the entire address space of the GPP's data
                     memory. The scheduling of load/store operations and
                     memory access handling have been optimized to minimize
                     the latency introduced by memory accesses. The system
                     is able to dynamically switch the execution between the
                     GPP and the RPU when executing the original binaries of
                     the input application. Our proof-of-concept prototype
                     achieved geometric mean speedups of 1.60$ \times $ and
                     1.18$ \times $ for, respectively, a set of 37
                     benchmarks and a subset considering the 9 most complex
                     benchmarks. With respect to a previous version of our
                     approach, we achieved geometric mean speedup
                     improvements from 1.22 to 1.53 for the 10 benchmarks
                     previously used.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "2",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Dhawan:2015:AEN,
  author          = "Udit Dhawan and Andr{\'e} DeHon",
  title           = "Area-Efficient Near-Associative Memories on {FPGAs}",
  journal         = j-TRETS,
  volume          = "7",
  number          = "4",
  pages           = "3:1--3:??",
  month           = jan,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2629471",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Fri Feb 13 07:24:19 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "Associative memories can map sparsely used keys to
                     values with low latency but can incur heavy area
                     overheads. The lack of customized hardware for
                     associative memories in today's mainstream FPGAs
                     exacerbates the overhead cost of building these
                     memories using the fixed address match BRAMs. In this
                     article, we develop a new, FPGA-friendly, memory system
                     architecture based on a multiple hash scheme that is
                     able to achieve near-associative performance without
                     the area-delay overheads of a fully associative memory
                     on FPGAs. At the same time, we develop a novel memory
                     management algorithm that allows us to statistically
                     mimic an associative memory. Using the proposed
                     architecture as a 64KB L1 data cache, we show that it
                     is able to achieve near-associative miss rates while
                     consuming 3--13 $ \times $ fewer FPGA memory resources
                     for a set of benchmark programs from the SPEC CPU2006
                     suite than fully associative memories generated by the
                     Xilinx Coregen tool. Benefits for our architecture
                     increase with key width, allowing area reduction up to
                     100 $ \times $. Mapping delay is also reduced to 3.7ns
                     for a 1,024-entry flat version or 6.1ns for an
                     area-efficient version compared to 17.6ns for a fully
                     associative memory for a 64-bit key on a Xilinx Virtex
                     6 device.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "3",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Llamocca:2015:DEP,
  author          = "Daniel Llamocca and Marios Pattichis",
  title           = "Dynamic Energy, Performance, and Accuracy Optimization
                     and Management Using Automatically Generated
                     Constraints for Separable {$2$D} {FIR} Filtering for
                     Digital Video Processing",
  journal         = j-TRETS,
  volume          = "7",
  number          = "4",
  pages           = "4:1--4:??",
  month           = jan,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2629623",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Fri Feb 13 07:24:19 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "There is strong interest in the development of
                     dynamically reconfigurable systems that can meet
                     real-time constraints on energy, performance, and
                     accuracy. The generation of real-time constraints will
                     significantly expand the applicability of dynamically
                     reconfigurable systems to new domains, such as digital
                     video processing. We develop a dynamically
                     reconfigurable 2D FIR filtering system that can meet
                     real-time constraints in energy, performance, and
                     accuracy (EPA). The real-time constraints are
                     automatically generated based on user input, image
                     types associated with video communications, and video
                     content. We first generate a set of Pareto-optimal
                     realizations, described by their EPA values and
                     associated 2D FIR hardware description bitstreams.
                     Dynamic management is then achieved by selecting
                     Pareto-optimal realizations that meet the automatically
                     generated time-varying EPA constraints. We validate our
                     approach using three different 2D Gaussian filters.
                     Filter realizations are evaluated in terms of the
                     required energy per frame, accuracy of the resulting
                     image, and performance in frames per second. We
                     demonstrate dynamic EPA management by applying a
                     Difference of Gaussians (DOG) filter to standard video
                     sequences. For video frame sizes that are equal to or
                     larger than the VGA resolution, compared to a static
                     implementation, our dynamic system provides significant
                     reduction in the total energy consumption
                     ({$>$30}\%).",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "4",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Gojman:2015:GLG,
  author          = "Benjamin Gojman and Sirisha Nalmela and Nikil Mehta
                     and Nicholas Howarth and Andr{\'e} DeHon",
  title           = "{GROK-LAB}: Generating Real On-chip Knowledge for
                     Intra-cluster Delays Using Timing Extraction",
  journal         = j-TRETS,
  volume          = "7",
  number          = "4",
  pages           = "5:1--5:??",
  month           = jan,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2597889",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Fri Feb 13 07:24:19 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "Timing Extraction identifies the delay of fine-grained
                     components within an FPGA. From these computed delays,
                     the delay of any path can be calculated. Moreover, a
                     comparison of the fine-grained delays allows a detailed
                     understanding of the amount and type of process
                     variation that exists in the FPGA. To obtain these
                     delays, Timing Extraction measures, using only
                     resources already available in the FPGA, the delay of a
                     small subset of the total paths in the FPGA. We apply
                     Timing Extraction to the Logic Array Block (LAB) on an
                     Altera Cyclone III FPGA to obtain a view of the delay
                     down to near-individual LUT SRAM cell granularity,
                     characterizing components with delays on the order of
                     tens to a few hundred picoseconds with a resolution of
                     $ \pm {}3.2 $ ps, matching the expected error bounds.
                     This information reveals that the 65nm process used
                     has, on average, random variation of $ \sigma / \mu =
                     4.0 \% $ with components having an average maximum
                     spread of 83ps. Timing Extraction also shows that as $
                     V_{DD} $ decreases from 1.2V to 0.9V in a Cyclone IV
                     60nm FPGA, paths slow down, and variation increases
                     from $ \sigma / \mu = 4.3 \% $ to $ \sigma / \mu = 5.8
                     \% $, a clear indication that lowering $ V_{DD} $
                     magnifies the impact of random variation.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "5",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Mahram:2015:NBH,
  author          = "Atabak Mahram and Martin C. Herbordt",
  title           = "{NCBI BLASTP} on High-Performance Reconfigurable
                     Computing Systems",
  journal         = j-TRETS,
  volume          = "7",
  number          = "4",
  pages           = "6:1--6:??",
  month           = jan,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2629691",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Fri Feb 13 07:24:19 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "The BLAST sequence alignment program is a central
                     application in bioinformatics. The de facto standard
                     version, NCBI BLAST, uses complex heuristics that make
                     it challenging to simultaneously achieve both high
                     performance and exact agreement. We propose a system
                     that uses novel FPGA-based filters that reduce the
                     input database by over 99.97\% without loss of
                     sensitivity. There are several contributions. First is
                     design of the filters themselves, which perform two-hit
                     seeding, exhaustive ungapped alignment, and exhaustive
                     gapped alignments, respectively. Second is the coupling
                     of the filters, especially the two-hit seeding and the
                     ungapped alignment. Third is pipelining the filters in
                     a single design, including maintaining load balancing
                     as data are reduced by orders of magnitude at each
                     stage. Fourth is the optimization required to maintain
                     operating frequency for the resulting complex design.
                     And finally, there is system integration both in
                     hardware (the Convey HC1-EX) and software (NCBI
                     BLASTP). We present results for various usage scenarios
                     and find complete agreement and a factor of nearly 5x
                     speedup over a fully parallel implementation of the
                     reference code on a contemporaneous CPU. We believe
                     that the resulting system is the leading
                     per-socket-accelerated NCBI BLAST.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "6",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Swierczynski:2015:PSE,
  author          = "Pawel Swierczynski and Amir Moradi and David Oswald
                     and Christof Paar",
  title           = "Physical Security Evaluation of the Bitstream
                     Encryption Mechanism of {Altera Stratix II} and
                     {Stratix III} {FPGAs}",
  journal         = j-TRETS,
  volume          = "7",
  number          = "4",
  pages           = "7:1--7:??",
  month           = jan,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2629462",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Fri Feb 13 07:24:19 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "To protect Field-Programmable Gate Array (FPGA)
                     designs against Intellectual Property (IP) theft and
                     related issues such as product cloning, all major FPGA
                     manufacturers offer a mechanism to encrypt the
                     bitstream that is used to configure the FPGA. From a
                     mathematical point of view, the employed encryption
                     algorithms (e.g., Advanced Encryption Standard (AES) or
                     3DES) are highly secure. However, it has been shown
                     that the bitstream encryption feature of several FPGA
                     families is susceptible to side-channel attacks based
                     on measuring the power consumption of the cryptographic
                     module. In this article, we present the first
                     successful attack on the bitstream encryption of the
                     Altera Stratix II and Stratix III FPGA families. To
                     this end, we analyzed the Quartus II software and
                     reverse engineered the details of the proprietary and
                     unpublished schemes used for bitstream encryption on
                     Stratix II and Stratix III. Using this knowledge, we
                     demonstrate that the full 128-bit AES key of a Stratix
                     II as well as the full 256-bit AES key of a Stratix III
                     can be recovered by means of side-channel attacks. In
                     both cases, the attack can be conducted in a few hours.
                     The complete bitstream of these FPGAs that are
                     (seemingly) protected by the bitstream encryption
                     feature can hence fall into the hands of a competitor
                     or criminal---possibly implying system-wide damage if
                     confidential information such as proprietary encryption
                     schemes or secret keys programmed into the FPGA are
                     extracted. In addition to lost IP, reprogramming the
                     attacked FPGA with modified code, for instance, to
                     secretly plant a hardware Trojan, is a particularly
                     dangerous scenario for many security-critical
                     applications.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "7",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Vliegen:2015:SRD,
  author          = "Jo Vliegen and Nele Mentens and Ingrid Verbauwhede",
  title           = "Secure, Remote, Dynamic Reconfiguration of {FPGAs}",
  journal         = j-TRETS,
  volume          = "7",
  number          = "4",
  pages           = "8:1--8:??",
  month           = jan,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2629423",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Fri Feb 13 07:24:19 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "With the widespread availability of broadband
                     Internet, Field-Programmable Gate Arrays (FPGAs) can
                     get remote updates in the field. This provides hardware
                     and software updates, and enables issue solving and
                     upgrade ability without device modification. In order
                     to prevent an attacker from eavesdropping or
                     manipulating the configuration data, security is a
                     necessity. This work describes an architecture that
                     allows the secure, remote reconfiguration of an FPGA.
                     The architecture is partially dynamically
                     reconfigurable and it consists of a static partition
                     that handles the secure communication protocol and a
                     single reconfigurable partition that holds the main
                     application. Our solution distinguishes itself from
                     existing work in two ways: it provides entity
                     authentication and it avoids the use of a trusted third
                     party. The former provides protection against active
                     attackers on the communication channel, while the
                     latter reduces the number of reliable entities.
                     Additionally, this work provides basic countermeasures
                     against simple power-oriented side-channel analysis
                     attacks. The result is an implementation that is
                     optimized toward minimal resource occupation. Because
                     configuration updates occur infrequently, configuration
                     speed is of minor importance with respect to area. A
                     prototype of the proposed design is implemented, using
                     5,702 slices and having minimal downtime.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "8",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Chau:2015:MAP,
  author          = "Thomas C. P. Chau and Xinyu Niu and Alison Eele and
                     Jan Maciejowski and Peter Y. K. Cheung and Wayne Luk",
  title           = "Mapping Adaptive Particle Filters to Heterogeneous
                     Reconfigurable Systems",
  journal         = j-TRETS,
  volume          = "7",
  number          = "4",
  pages           = "9:1--9:??",
  month           = jan,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2629469",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Fri Feb 13 07:24:19 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "This article presents an approach for mapping
                     real-time applications based on particle filters (PFs)
                     to heterogeneous reconfigurable systems, which
                     typically consist of multiple FPGAs and CPUs. A method
                     is proposed to adapt the number of particles
                     dynamically and to utilise runtime reconfigurability of
                     FPGAs for reduced power and energy consumption. A data
                     compression scheme is employed to reduce communication
                     overhead between FPGAs and CPUs. A mobile robot
                     localisation and tracking application is developed to
                     illustrate our approach. Experimental results show that
                     the proposed adaptive PF can reduce up to 99\% of
                     computation time. Using runtime reconfiguration, we
                     achieve a 25\% to 34\% reduction in idle power. A 1U
                     system with four FPGAs is up to 169 times faster than a
                     single-core CPU and 41 times faster than a 1U CPU
                     server with 12 cores. It is also estimated to be 3
                     times faster than a system with four GPUs.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "9",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Miller:2015:GBA,
  author          = "Bailey Miller and Frank Vahid and Tony Givargis and
                     Philip Brisk",
  title           = "Graph-Based Approaches to Placement of Processing
                     Element Networks on {FPGAs} for Physical Model
                     Simulation",
  journal         = j-TRETS,
  volume          = "7",
  number          = "4",
  pages           = "10:1--10:??",
  month           = jan,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2629521",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Fri Feb 13 07:24:19 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "Physical models utilize mathematical equations to
                     characterize physical systems like airway mechanics,
                     neuron networks, or chemical reactions. Previous work
                     has shown that field programmable gate arrays (FPGAs)
                     execute physical models efficiently. To improve the
                     implementation of physical models on FPGAs, this
                     article leverages graph theoretic techniques to
                     synthesize physical models onto FPGAs. The first phase
                     maps physical model equations onto a structured virtual
                     processing element (PE) graph using graph theoretic
                     folding techniques. The second phase maps the
                     structured virtual PE graph onto physical PE regions on
                     an FPGA using graph embedding theory. A simulated
                     annealing algorithm is introduced that can map any
                     physical model onto an FPGA regardless of the model's
                     underlying topology. We further extend the simulated
                     annealing approach by leveraging existing graph drawing
                     algorithms to generate the initial placement. Compared
                     to previous work on physical model implementation on
                     FPGAs, embedding increases clock frequency by 25\% on
                     average (for applicable topologies), whereas simulated
                     annealing increases frequency by 13\% on average. The
                     embedding approach typically produces a circuit whose
                     frequency is limited by the FPGA clock instead of
                     routing. Additionally, complex models that could not
                     previously be routed due to complexity were made
                     routable when using placement constraints.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "10",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{DiCarlo:2015:SSA,
  author          = "Stefano {Di Carlo} and Giulio Gambardella and Paolo
                     Prinetto and Daniele Rolfo and Pascal Trotta",
  title           = "{SATTA}: a {Self-Adaptive Temperature-Based TDF
                     Awareness} Methodology for Dynamically Reconfigurable
                     {FPGAs}",
  journal         = j-TRETS,
  volume          = "8",
  number          = "1",
  pages           = "1:1--1:??",
  month           = feb,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2659001",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Sat Mar 7 16:45:25 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "Dependability issues due to nonfunctional properties
                     are emerging as a major cause of faults in modern
                     digital systems. Effective countermeasures have to be
                     developed to properly manage their critical timing
                     effects. This article presents a methodology to avoid
                     transition delay faults in field-programmable gate
                     array (FPGA)-based systems, with low area overhead. The
                     approach is able to exploit temperature information and
                     aging characteristics to minimize the cost in terms of
                     performances degradation and power consumption. The
                     architecture of a hardware manager able to avoid delay
                     faults is presented and analyzed extensively, as well
                     as its integration in the standard implementation
                     design flow.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "1",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Cooke:2015:TAF,
  author          = "Patrick Cooke and Jeremy Fowers and Greg Brown and
                     Greg Stitt",
  title           = "A Tradeoff Analysis of {FPGAs}, {GPUs}, and Multicores
                     for Sliding-Window Applications",
  journal         = j-TRETS,
  volume          = "8",
  number          = "1",
  pages           = "2:1--2:??",
  month           = feb,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2659000",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Sat Mar 7 16:45:25 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "The increasing usage of hardware accelerators such as
                     Field-Programmable Gate Arrays (FPGAs) and Graphics
                     Processing Units (GPUs) has significantly increased
                     application design complexity. Such complexity results
                     from a larger design space created by numerous
                     combinations of accelerators, algorithms, and hw/sw
                     partitions. Exploration of this increased design space
                     is critical due to widely varying performance and
                     energy consumption for each accelerator when used for
                     different application domains and different use cases.
                     To address this problem, numerous studies have
                     evaluated specific applications across different
                     architectures. In this article, we analyze an important
                     domain of applications, referred to as sliding-window
                     applications, implemented on FPGAs, GPUs, and multicore
                     CPUs. For each device, we present optimization
                     strategies and analyze use cases where each device is
                     most effective. The results show that, for large input
                     sizes, FPGAs can achieve speedups of up to $ 5.6 \times
                     $ and $ 58 \times $ compared to GPUs and multicore
                     CPUs, respectively, while also using up to an order of
                     magnitude less energy. For small input sizes and
                     applications with frequency-domain algorithms, GPUs
                     generally provide the best performance and energy.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "2",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Quinn:2015:CFE,
  author          = "Heather Quinn and Diane Roussel-Dupre and Mike Caffrey
                     and Paul Graham and Michael Wirthlin and Keith Morgan
                     and Anthony Salazar and Tony Nelson and Will Howes and
                     Eric Johnson and Jon Johnson and Brian Pratt and Nathan
                     Rollins and Jim Krone",
  title           = "The {Cibola Flight Experiment}",
  journal         = j-TRETS,
  volume          = "8",
  number          = "1",
  pages           = "3:1--3:??",
  month           = feb,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2629556",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Sat Mar 7 16:45:25 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "Over the past 15 years many organizations have
                     researched the use of Static-Random Access Memory
                     (SRAM)-based Field-Programmable Gate Arrays (FPGAs) in
                     space. Although the components can provide a
                     performance improvement over radiation-hardened
                     processing components, random soft errors can occur
                     from the naturally occurring space radiation
                     environment. Many organizations have been developing
                     methods for characterizing, emulating, and simulating
                     radiation-induced events; mitigating and removing
                     radiation-induced computational errors; and designing
                     fault-tolerant reconfigurable spacecraft. Los Alamos
                     National Laboratory has fielded one of the longest
                     space-based FPGAs experiments, called the Cibola Flight
                     Experiment (CFE), using Xilinx Virtex FPGAs. CFE has
                     successfully deployed commercial SRAM FPGAs into a
                     low-Earth orbit with Single-Event Upset (SEU)
                     mitigation and was able to exploit effectively the
                     reconfigurability and customization of FPGAs in a harsh
                     radiation environment. Although older than current
                     state-of-the-art FPGAs, these same concepts are used to
                     deploy newer FPGA-based space systems since the launch
                     of the CFE satellite and will continue to be useful for
                     newer systems. In this article, we present how the
                     system was designed to be fault tolerant, prelaunch
                     predictions of expected on-orbit behaviors, and
                     on-orbit results.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "3",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Davidson:2015:IDC,
author = "Tom Davidson and Elias Vansteenkiste and Karel Heyse
and Karel Bruneel and Dirk Stroobandt",
title = "Identification of Dynamic Circuit Specialization
Opportunities in {RTL} Code",
journal = j-TRETS,
volume = "8",
number = "1",
pages = "4:1--4:??",
month = feb,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629640",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Mar 7 16:45:25 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Dynamic Circuit Specialization (DCS) optimizes a
Field-Programmable Gate Array (FPGA) design by assuming
a set of its input signals are constant for a
reasonable amount of time, leading to a smaller and
faster FPGA circuit. When the signals actually change,
a new circuit is loaded into the FPGA through runtime
reconfiguration. The signals the design is specialized
for are called parameters. For certain designs,
parameters can be selected so the DCS implementation is
both smaller and faster than the original
implementation. However, DCS also introduces an
overhead that is difficult for the designer to take
into account, making it hard to determine whether a
design is improved by DCS or not. This article presents
extensive results on a profiling methodology that
analyses Register-Transfer Level (RTL) implementations
of applications to check if DCS would be beneficial. It
proposes to use the functional density as a measure for
the area efficiency of an implementation, as this
measure contains both the overhead and the gains of a
DCS implementation. The first step of the methodology
is to analyse the dynamic behaviour of signals in the
design, to find good parameter candidates. The overhead
of DCS is highly dependent on this dynamic behaviour. A
second stage calculates the functional density for each
candidate and compares it to the functional density of
the original design. The profiling methodology resulted
in three implementations of a profiling tool, the
DCS-RTL profiler. The execution time, accuracy, and the
quality of each implementation is assessed based on
data from 10 RTL designs. All designs, except for the
two 16-bit adaptable Finite Impulse Response (FIR)
filters, are analysed in 1 hour or less.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "4",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Iturbe:2015:MAH,
author = "Xabier Iturbe and Khaled Benkrid and Chuan Hong and
Ali Ebrahim and Raul Torrego and Tughrul Arslan",
title = "Microkernel Architecture and Hardware Abstraction
Layer of a Reliable Reconfigurable Real-Time Operating
System {(R3TOS)}",
journal = j-TRETS,
volume = "8",
number = "1",
pages = "5:1--5:??",
month = feb,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629639",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Mar 7 16:45:25 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "This article presents a new solution for easing the
development of reconfigurable applications using
Field-Programmable Gate Arrays (FPGAs). Namely, our
Reliable Reconfigurable Real-Time Operating System
(R3TOS) provides OS-like support for partially
reconfigurable FPGAs. Unlike related works, R3TOS is
founded on the basis of resource reusability and
computation ephemerality. It makes intensive use of
reconfiguration at very fine FPGA granularity, keeping
the logic resources used only while performing
computation and releasing them as soon as it is
completed. To achieve this goal, R3TOS goes beyond the
traditional approach of using reconfigurable slots with
fixed boundaries interconnected by means of a static
communication infrastructure. Instead, R3TOS approaches
a static route-free system where nearly everything is
reconfigurable. The tasks are concatenated to form a
computation chain through which partial results
naturally flow, and data are exchanged among remotely
located tasks using FPGA's reconfiguration mechanism or
by means of ``removable'' routing circuits. In this
article, we describe the R3TOS microkernel architecture
as well as its hardware abstraction services and
programming interface. Notably, the article presents a
set of novel circuits and mechanisms to overcome the
limitations and exploit the opportunities of Xilinx
reconfigurable technology in the scope of hardware
multitasking and dependability.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "5",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Shi:2015:IDD,
author = "Kan Shi and David Boland and George A.
Constantinides",
title = "Imprecise Datapath Design: an Overclocking Approach",
journal = j-TRETS,
volume = "8",
number = "2",
pages = "6:1--6:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629527",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue May 19 17:05:20 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "In this article, we describe an alternative circuit
design methodology when considering trade-offs between
accuracy, performance, and silicon area. We compare two
different approaches that could trade accuracy for
performance. One is the traditional approach where the
precision used in the datapath is limited to meet a
target latency. The other is a proposed new approach
which simply allows the datapath to operate without
timing closure. We demonstrate analytically and
experimentally that on average our approach obtains
either smaller errors or equivalent faster operating
frequencies in comparison to the traditional approach.
This is because the worst case caused by timing
violations only happens rarely, while precision loss
results in errors to most data. We also show that for
basic arithmetic operations such as addition, applying
our approach to the simple building block of ripple
carry adders can achieve better accuracy or performance
than using faster adder designs to achieve similar
latency.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "6",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Woods:2015:PDP,
author = "Louis Woods and Gustavo Alonso and Jens Teubner",
title = "Parallelizing Data Processing on {FPGAs} with Shifter
Lists",
journal = j-TRETS,
volume = "8",
number = "2",
pages = "7:1--7:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629551",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue May 19 17:05:20 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Parallelism is currently seen as a mechanism to
minimize the impact of the power and heat dissipation
problems encountered in modern hardware. Data
parallelism---based on partitioning the data---and pipeline
parallelism---based on partitioning the computation---are
the two main approaches to leverage parallelism on a
wide range of hardware platforms. Unfortunately, not
all data processing problems are susceptible to either
of those strategies. An example is the skyline operator
[B{\"o}rzs{\"o}nyi et al. 2001], which computes the set
of Pareto-optimal points within a multidimensional
dataset. Existing approaches to parallelize the skyline
operator are based on data parallelism. As a result,
they suffer from a high overhead when merging
intermediate results because of the lack of a global
view of the problem inherent to partitioning the input
data. In this article, we show how to combine pipeline
with data parallelism on a Field-Programmable Gate
Array (FPGA) for a more efficient utilization of the
available hardware parallelism. As we show in our
experiments, skyline computation using our proposed
technique scales linearly with the number of processing
elements, and the performance we achieve on a rather
small FPGA is comparable to that of a 64-core high-end
server running a state-of-the-art data parallel
implementation of skyline [Park et al. 2009]. The
proposed approach to parallelize the skyline operator
can be generalized to a wider range of data processing
problems. We demonstrate this through a novel, highly
parallel data structure, a shifter list, that can be
efficiently implemented on an FPGA. The resulting
template is easy to parametrize to implement a variety
of computationally intensive operators such as frequent
items, $ n $-closest pairs, or K-means.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "7",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Cardoso:2015:GEF,
author = "Jo{\~a}o M. P. Cardoso and Pedro C. Diniz and
Katherine (Compton) Morrow",
title = "Guest Editorial: {FPL 2013}",
journal = j-TRETS,
volume = "8",
number = "2",
pages = "8:1--8:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2737805",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue May 19 17:05:20 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "8",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Ferreira:2015:RFP,
author = "Ricardo Ferreira and Luciana Rocha and Andr{\'e} G.
Santos and Jos{\'e} A. M. Nacif and Stephan Wong and
Luigi Carro",
title = "A Runtime {FPGA} Placement and Routing Using
Low-Complexity Graph Traversal",
journal = j-TRETS,
volume = "8",
number = "2",
pages = "9:1--9:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2660775",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue May 19 17:05:20 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Dynamic Partial Reconfiguration (DPaR) enables
efficient allocation of logic resources by adding new
functionalities or by sharing and/or multiplexing
resources over time. Placement and routing (P\&R) is
one of the most time-consuming steps in the DPaR flow.
P\&R are two independent NP-complete problems, and,
even for medium size circuits, traditional P\&R
algorithms are not capable of placing and routing
hardware modules at runtime. We propose a novel runtime
P\&R algorithm for Field-Programmable Gate Array
(FPGA)-based designs. Our algorithm models the FPGA as
an implicit graph with a direct correspondence to the
target FPGA. The P\&R is performed as a graph mapping
problem by exploring the node locality during a
depth-first traversal. We perform the P\&R using a
greedy heuristic that executes in polynomial time.
Unlike state-of-the-art algorithms, our approach does
not try similar solutions, thus allowing the P\&R to
execute in milliseconds. Our algorithm is also suitable
for P\&R in fragmented regions. We generate results for
a manufacturer-independent virtual FPGA. Compared with
the most popular P\&R tool running the same benchmark
suite, our algorithm is up to three orders of magnitude
faster.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "9",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Murray:2015:TDT,
author = "Kevin E. Murray and Scott Whitty and Suya Liu and
Jason Luu and Vaughn Betz",
title = "Timing-Driven {Titan}: Enabling Large Benchmarks and
Exploring the Gap between Academic and Commercial
{CAD}",
journal = j-TRETS,
volume = "8",
number = "2",
pages = "10:1--10:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629579",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue May 19 17:05:20 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Benchmarks play a key role in Field-Programmable Gate
Array (FPGA) architecture and CAD research, enabling
the quantitative comparison of tools and architectures.
It is important that these benchmarks reflect modern
large-scale systems that make use of heterogeneous
resources; however, most current FPGA benchmarks are
both small and simple. In this article, we present
Titan, a hybrid CAD flow that addresses these issues.
The flow uses Altera's Quartus II FPGA CAD software to
perform HDL synthesis and a conversion tool to
translate the result into the academic Berkeley Logic
Interchange Format (BLIF). Using this flow, we created
the Titan23 benchmark set, which consists of 23 large
(90K--1.8M block) benchmark circuits covering a wide
range of application domains. Using the Titan23
benchmarks and an enhanced model of Altera's Stratix IV
architecture, including a detailed timing model, we
compare the performance and quality of VPR and Quartus
II targeting the same architecture. We found that VPR
is at least $ 2.8 \times $ slower, uses $ 6.2 \times $
more memory, $ 2.2 \times $ more wire, and produces
critical paths $ 1.5 \times $ slower compared to
Quartus II. Finally, we identified that VPR's focus on
achieving a dense packing and an inability to take
apart clusters is responsible for a large portion of
the wire length and critical path delay gap.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "10",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Gan:2015:SGA,
author = "Lin Gan and Haohuan Fu and Wayne Luk and Chao Yang and
Wei Xue and Xiaomeng Huang and Youhui Zhang and
Guangwen Yang",
title = "Solving the Global Atmospheric Equations through
Heterogeneous Reconfigurable Platforms",
journal = j-TRETS,
volume = "8",
number = "2",
pages = "11:1--11:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629581",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue May 19 17:05:20 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "One of the most essential and challenging components
in climate modeling is the atmospheric model. To solve
multiphysical atmospheric equations, developers have to
face extremely complex stencil kernels that are costly
in terms of both computing and memory resources. This
article aims to accelerate the solution of global
shallow water equations (SWEs), which is one of the
most essential equation sets describing atmospheric
dynamics. We first design a hybrid methodology that
employs both the host CPU cores and the
field-programmable gate array (FPGA) accelerators to
work in parallel. Through a careful adjustment of the
computational domains, we achieve a balanced resource
utilization and a further improvement of the overall
performance. By decomposing the resource-demanding SWE
kernel, we manage to map the double-precision algorithm
into three FPGAs. Moreover, by using fixed-point and
reduced-precision floating point arithmetic, we manage
to build a fully pipelined mixed-precision design on a
single FPGA, which can perform 428 floating-point and
235 fixed-point operations per cycle. The
mixed-precision design with four FPGAs running together
can achieve a speedup of 20 over a fully optimized
design on a CPU rack with two eight-core processors and
is 8 times faster than the fully optimized Kepler GPU
design. As for power efficiency, the mixed-precision
design with four FPGAs is 10 times more power efficient
than a Tianhe-1A supercomputer node.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "11",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Das:2015:ASE,
author = "Anup Das and Shyamsundar Venkataraman and Akash
Kumar",
title = "Autonomous Soft-Error Tolerance of {FPGA}
Configuration Bits",
journal = j-TRETS,
volume = "8",
number = "2",
pages = "12:1--12:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629580",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue May 19 17:05:20 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Field-programmable gate arrays (FPGAs) are
increasingly susceptible to radiation-induced single
event upsets (SEUs). These upsets are predominant in a
space environment; however, with increasing use of
static RAM (SRAM) in modern FPGAs, these SEUs are
gaining prominence even in a terrestrial environment.
SEUs can flip SRAM bits of FPGA, potentially altering
the functionality of the implemented design. This has
motivated FPGA designers to investigate techniques to
protect the FPGA configuration bits against such
inadvertent bit flips (soft error). Traditionally,
triple modular redundancy (TMR) is used to protect the
FPGA bit flips. Increasing design complexity and
limited battery life motivate for alternative
approaches for soft-error tolerance. In this article,
we propose a technique to improve autonomous
fault-masking capabilities of a design by maximizing
the number of zeros or ones in lookup tables (LUTs).
The technique analyzes critical configuration bits and
utilizes spare resources (XOR gates and carry chains)
of FPGAs to selectively manipulate the logic
implemented in LUTs using two operations: LUT
restructuring and LUT decomposition. We implemented the
proposed approach for Xilinx Virtex-6 FPGAs and
validated the same with a wide set of designs from the
MCNC, IWLS 2005, and ITC99 benchmark suites. Results
demonstrate that the proposed logic restructuring
maximizes logic 0 (or 1) of LUTs by an average of 20\%,
achieving 80\% fault masking with no area overhead. The
fault rate of the entire design is reduced by 60\% on
average as compared to the existing techniques.
Furthermore, the logic decomposition algorithm provides
incremental fault-tolerance capabilities and achieves
an additional 5\% fault masking with an average 7\%
increase in slice usage. The complete methodology is
implemented into a tool for Xilinx FPGA and is made
available online for the benefit of the research
community. The algorithms are lightweight, and the
whole design flow (including Xilinx Place and Route)
was completed in 75 minutes for the largest benchmark
in the set.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "12",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Istvan:2015:HTL,
author = "Zsolt Istv{\'a}n and Gustavo Alonso and Michaela Blott
and Kees Vissers",
title = "A Hash Table for Line-Rate Data Processing",
journal = j-TRETS,
volume = "8",
number = "2",
pages = "13:1--13:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629582",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue May 19 17:05:20 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "FPGA-based data processing is becoming increasingly
relevant in data centers, as the transformation of
existing applications into dataflow architectures can
bring significant throughput and power benefits.
Furthermore, a tighter integration of computing and
network is appealing, as it overcomes traditional
bottlenecks between CPUs and network interfaces, and
dramatically reduces latency. In this article, we
present the design of a novel hash table, a fundamental
building block used in many applications, to enable
data processing on FPGAs close to the network. We
present a fully pipelined design capable of sustaining
consistent 10Gbps line-rate processing by deploying a
concurrent mechanism to handle hash collisions. We
address additional design challenges such as support
for a broad range of key sizes without stalling the
pipeline through careful matching of lookup time with
packet reception time. Finally, the design is based on
a scalable architecture that can be easily
parameterized to work with different memory types
operating at different access speeds and latencies. We
have tested the proposed hash table in an FPGA-based
memcached appliance implementing a main-memory
key-value store in hardware. The hash table is used to
index 2 million entries in 24GB of external DDR3 DRAM
while sustaining 13 million requests per second, the
maximum packet rate that can be achieved with UDP
packets on a 10Gbps link for this application.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "13",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Huang:2015:ECO,
author = "Qijing Huang and Ruolong Lian and Andrew Canis and
Jongsok Choi and Ryan Xi and Nazanin Calagar and
Stephen Brown and Jason Anderson",
title = "The Effect of Compiler Optimizations on High-Level
Synthesis-Generated Hardware",
journal = j-TRETS,
volume = "8",
number = "3",
pages = "14:1--14:??",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629547",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue May 19 17:05:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "We consider the impact of compiler optimizations on
the quality of high-level synthesis (HLS)-generated
field-programmable gate array (FPGA) hardware. Using an
HLS tool implemented within the state-of-the-art LLVM
compiler, we study the effect of compiler optimizations
on the hardware metrics of circuit area, execution
cycles, FMax, and wall-clock time. We evaluate 56
different compiler optimizations implemented within
LLVM and show that some optimizations significantly
affect hardware quality. Moreover, we show that
hardware quality is also affected by some optimization
parameter values, as well as the order in which
optimizations are applied. We then present a new
HLS-directed approach to compiler optimizations,
wherein we execute partial HLS and profiling at
intermittent points in the optimization process and use
the results to judiciously undo the impact of
optimization passes predicted to be damaging to the
generated hardware quality. Results show that our
approach produces circuits with 16\% better speed
performance, on average, versus using the standard {\tt
-O3} optimization level.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "14",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Niu:2015:AEI,
author = "Xinyu Niu and Thomas C. P. Chau and Qiwei Jin and
Wayne Luk and Qiang Liu and Oliver Pell",
title = "Automating Elimination of Idle Functions by Runtime
Reconfiguration",
journal = j-TRETS,
volume = "8",
number = "3",
pages = "15:1--15:??",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700415",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue May 19 17:05:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "A design approach is proposed to automatically
identify and exploit runtime reconfiguration
opportunities with optimised resource utilisation by
eliminating idle functions. We introduce
Reconfiguration Data Flow Graph, a hierarchical graph
structure enabling reconfigurable designs to be
synthesised in three steps: function analysis,
configuration organisation, and runtime solution
generation. The synthesised reconfigurable designs are
dynamically evaluated and selected under various
runtime conditions. Three applications---barrier option
pricing, particle filter, and reverse time
migration---are used in evaluating the proposed approach.
The runtime solutions approximate their theoretical
performance by eliminating idle functions and are 1.31
to 2.19 times faster than optimised static designs.
FPGA designs developed with the proposed approach are
up to 43.8 times faster than optimised CPU reference
designs and 1.55 times faster than optimised GPU
designs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "15",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Bhasin:2015:EFB,
author = "Shivam Bhasin and Jean-Luc Danger and Sylvain Guilley
and Wei He",
title = "Exploiting {FPGA} Block Memories for Protected
Cryptographic Implementations",
journal = j-TRETS,
volume = "8",
number = "3",
pages = "16:1--16:??",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629552",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue May 19 17:05:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Modern field programmable gate arrays (FPGAs) are
power packed with features to facilitate designers.
Availability of features like large block memory
(BRAM), digital signal processing cores, and embedded
CPU makes the design strategy of FPGAs quite different
from ASICs. FPGAs are also widely used in
security-critical applications where protection against
known attacks is of prime importance. We focus on
physical attacks that target physical implementations.
To design countermeasures against such attacks, the
strategy for FPGA designers should be different from
that in ASIC. The available features should be
exploited to design compact and strong countermeasures.
In this article, we propose methods to exploit the
BRAMs in FPGAs for designing compact countermeasures.
Internal BRAM can be used to optimize intrinsic
countermeasures such as masking and dual-rail logics,
which otherwise have significant overhead (at least $ 2
\times $) compared to unprotected ones. The
optimizations are applied on a real AES-128
co-processor and tested for area overhead and
resistance on Xilinx Virtex-5 chips. The presented
masking countermeasure has an overhead of only 16\%
when applied on AES. Moreover, the dual-rail precharge
logic (DPL) countermeasure has been optimized to pack
the whole sequential part in the BRAM, hence enhancing
the security. Proper robustness evaluations are
conducted to analyze the optimization in terms of area
and security.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "16",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Eusse:2015:CNP,
author = "Juan Fernando Eusse and Christopher Williams and
Rainer Leupers",
title = "{CoEx}: a Novel Profiling-Based Algorithm\slash
Architecture Co-Exploration for {ASIP} Design",
journal = j-TRETS,
volume = "8",
number = "3",
pages = "17:1--17:??",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629563",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue May 19 17:05:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Application-Specific Instruction Set Processors
(ASIPs) provide the adequate performance/efficiency
tradeoff for their particular application domain.
Nevertheless, their design methodologies have stagnated
during the past decade and are still based on a series
of manual and time-consuming iterative steps.
Furthermore, there exists a productivity gap between
the point where an application is given as the target
for processor customization and the time a customized
architecture is available. Therefore, new tools are
required that reduce the number of design iterations
and bridge the aforementioned productivity gap. This
can be achieved by (1) profiling technologies that, by
adapting to the designer's needs, help to gain insight
into application specifications, and (2)
prearchitectural design technologies that give early
yet accurate feedback on the impact of
algorithmic/architectural design decisions. The first
requirement is addressed in this article by proposing
the multigrained profiling approach, which identifies
the profiling needs at each step of ASIP design and
lets the designer tailor the level of detail for
application inspection. CoEx, a practical
implementation of the approach, is also introduced. The
second requirement is addressed by creating a
prearchitectural estimation engine. This engine couples
CoEx reports for an application with an abstract
processor model and generates an estimate of the
achievable performance. Both CoEx and the performance
estimation engine are respectively evaluated for
instrumentation-induced execution overhead and
accuracy. Finally, the development of an ASIP
architecture for an augmented reality computer vision
application is presented. The ASIP achieves a gain of
six times compared to the original application
performance, after being developed in only 2 days.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "17",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Das:2015:ETD,
  author          = "Anup Das and Amit Kumar Singh and Akash Kumar",
  title           = "Execution Trace-Driven Energy-Reliability Optimization
                     for Multimedia {MPSoCs}",
  journal         = j-TRETS,
  volume          = "8",
  number          = "3",
  pages           = "18:1--18:??",
  month           = may,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2665071",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Tue May 19 17:05:24 MDT 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "Multiprocessor systems-on-chip (MPSoCs) are becoming a
                     popular design choice in current and future technology
                     nodes to accommodate the heterogeneous computing demand
                     of a multitude of applications enabled on these
                     platforms. Streaming multimedia and other
                     communication-centric applications constitute a
                     significant fraction of the application space of these
                     devices. The mapping of an application on an MPSoC is
                     an NP-hard problem. This has attracted researchers to
                     solve this problem both as stand-alone (best-effort)
                     and in conjunction with other optimization objectives,
                     such as energy and reliability. Most existing studies
                     on energy-reliability joint optimization are
                     static---that is, design time based. These techniques
                     fail to capture runtime variability such as resource
                     unavailability and dynamism associated with application
                     behaviors, which are typical of multimedia
                     applications. The few studies that consider dynamic
                     mapping of applications do not consider throughput
                     degradation, which directly impacts user satisfaction.
                     This article proposes a runtime technique to analyze
                     the execution trace of an application modeled as
                     Synchronous Data Flow Graphs (SDFGs) to determine its
                     mapping on a multiprocessor system with heterogeneous
                     processing units for different fault scenarios.
                     Further, communication energy is minimized for each of
                     these mappings while satisfying the throughput
                     constraint. Experiments conducted with synthetic and
                     real SDFGs demonstrate that the proposed technique
                     achieves significant improvement with respect to the
                     state-of-the-art approaches in terms of throughput and
                     storage overhead with less than 20\% energy overhead.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "18",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Ren:2015:EFT,
  author          = "Yu Ren and Leibo Liu and Shouyi Yin and Jie Han and
                     Shaojun Wei",
  title           = "Efficient Fault-Tolerant Topology Reconfiguration
                     Using a Maximum Flow Algorithm",
  journal         = j-TRETS,
  volume          = "8",
  number          = "3",
  pages           = "19:1--19:??",
  month           = may,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2700417",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Tue May 19 17:05:24 MDT 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "With an increasing number of processing elements (PEs)
                     integrated on a single chip, fault-tolerant techniques
                     are critical to ensure the reliability of such complex
                     systems. In current reconfigurable architectures,
                     redundant PEs are utilized for fault tolerance. In the
                     presence of faulty PEs, the physical topologies of
                     various chips may be different, so the concept of
                     virtual topology from network embedding problem has
                     been used to alleviate the burden for the operating
                     systems. With limited hardware resources, how to
                     reconfigure a system into the most effective virtual
                     topology such that the maximum repair rate can be
                     reached presents a significant challenge. In this
                     article, a new approach using a maximum flow (MF)
                     algorithm is proposed for an efficient topology
                     reconfiguration in reconfigurable architectures. In
                     this approach, topology reconfiguration is converted
                     into a network flow problem by constructing a directed
                     graph; the solution is then found by using the MF
                     algorithm. This approach optimizes the use of spare PEs
                     with minimal impacts on area, throughput, and delay,
                     and thus it significantly improves the repair rate of
                     faulty PEs. In addition, it achieves a polynomial
                     reconfiguration time. Experimental results show that
                     compared to previous methods, the MF approach increases
                     the probability to repair faulty PEs by up to 50\%
                     using the same redundant resources. Compared to a
                     fault-free system, the throughput only decreases by
                     less than 2.5\% and latency increases by less than 4\%.
                     To consider various types of PEs in a practical
                     application, a cost factor is introduced into the MF
                     algorithm. An enhanced approach using a minimum-cost MF
                     algorithm is further shown to be efficient in the
                     fault-tolerant reconfiguration of heterogeneous
                     reconfigurable architectures.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "19",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Dobai:2015:LLF,
  author          = "Roland Dobai and Lukas Sekanina",
  title           = "Low-Level Flexible Architecture with Hybrid
                     Reconfiguration for Evolvable Hardware",
  journal         = j-TRETS,
  volume          = "8",
  number          = "3",
  pages           = "20:1--20:??",
  month           = may,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2700414",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Tue May 19 17:05:24 MDT 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "Field-programmable gate arrays (FPGAs) can be
                     considered to be the most popular and successful
                     platform for evolvable hardware. They allow one to
                     establish and later reconfigure candidate solutions.
                     Recent work in the field of evolvable hardware includes
                     the use of virtual and native reconfigurations. Virtual
                     reconfiguration is based on the change of functionality
                     by hardware components implemented on top of FPGA
                     resources. Native reconfiguration changes the FPGA
                     resources directly by means provided by the FPGA
                     manufacturer. Both of these approaches have their
                     disadvantages. The virtual reconfiguration is
                     characterized by lower maximal operational frequency of
                     the resulting solutions, and the native reconfiguration
                     is slower. In this work, a hybrid approach is used
                     merging the advantages while limiting the disadvantages
                     of the virtual and native reconfigurations. The main
                     contribution is the new low-level architecture for
                     evolvable hardware in the new Zynq-7000
                     all-programmable system-on-chip. The proposed
                     architecture offers high flexibility in comparison with
                     other evolvable hardware systems by considering direct
                     modification of the reconfigurable resources. The
                     impact of the higher reconfiguration time of the native
                     approach is limited by the dense placement of the
                     proposed reconfigurable processing elements. These
                     processing elements also ensure fast evaluation of
                     candidate solutions. The proposed architecture is
                     evaluated by evolutionary design of switching image
                     filters and edge detectors. The experimental results
                     demonstrate advantages over the previous approaches
                     considering the time required for evolution, area
                     overhead, and flexibility.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "20",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Kirchgessner:2015:LOF,
  author          = "Robert Kirchgessner and Alan D. George and Greg
                     Stitt",
  title           = "Low-Overhead {FPGA} Middleware for Application
                     Portability and Productivity",
  journal         = j-TRETS,
  volume          = "8",
  number          = "4",
  pages           = "21:1--21:??",
  month           = oct,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2746404",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Mon Oct 5 08:47:01 MDT 2015",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "Reconfigurable computing devices such as
                     field-programmable gate arrays (FPGAs) offer advantages
                     over fixed-logic CPU and GPU architectures, including
                     improved performance, superior power efficiency, and
                     reconfigurability. The challenge of FPGA application
                     development, however, has limited their acceptance in
                     high-performance computing and high-performance
                     embedded computing applications. FPGA development
                     carries similar difficulties to hardware design,
                     requiring that developers iterate through
                     register-transfer level designs with cycle-level
                     accuracy. Furthermore, the lack of hardware and
                     software standards between FPGA platforms limits
                     productivity and application portability, and makes
                     porting applications between heterogeneous platforms a
                     time-consuming and often challenging process. Recent
                     efforts to improve FPGA productivity using high-level
                     synthesis tools and languages show promise, but
                     platform support remains limited and typically is left
                     as a challenge for developers. To address these issues,
                     we present RC Middleware (RCMW), a novel middleware
                     that improves productivity and enables application and
                     tool portability by abstracting away platform-specific
                     details. RCMW provides an application-centric
                     development environment, exposing only the resources
                     and standardized interfaces required by an application,
                     independent of the underlying platform. We demonstrate
                     the portability and productivity benefits of RCMW using
                     four heterogeneous platforms from three vendors. Our
                     results indicate that RCMW enables application
                     productivity and improves developer productivity, and
                     that these benefits are achieved with less than 7\%
                     performance and 3\% area overhead on average.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "21",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Jacobsen:2015:RRI,
  author          = "Matthew Jacobsen and Dustin Richmond and Matthew
                     Hogains and Ryan Kastner",
  title           = "{RIFFA 2.1}: a Reusable Integration Framework for
                     {FPGA} Accelerators",
  journal         = j-TRETS,
  volume          = "8",
  number          = "4",
  pages           = "22:1--22:??",
  month           = oct,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2815631",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Mon Oct 5 08:47:01 MDT 2015",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "We present RIFFA 2.1, a reusable integration framework
                     for Field-Programmable Gate Array (FPGA) accelerators.
                     RIFFA provides communication and synchronization for
                     FPGA accelerated applications using simple interfaces
                     for hardware and software. Our goal is to expand the
                     use of FPGAs as an acceleration platform by releasing,
                     as open source, a framework that easily integrates
                     software running on commodity CPUs with FPGA cores.
                     RIFFA uses PCI Express (PCIe) links to connect FPGAs to
                     a CPU's system bus. RIFFA 2.1 supports FPGAs from
                     Xilinx and Altera, Linux and Windows operating systems,
                     and allows multiple FPGAs to connect to a single host
                     PC system. It has software bindings for C/C++, Java,
                     Python, and Matlab. Tests show that data transfers
                     between hardware and software can reach 97\% of the
                     achievable PCIe link bandwidth.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "22",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Thomas:2015:THG,
  author          = "David B. Thomas",
  title           = "The Table-{Hadamard} {GRNG}: an Area-Efficient {FPGA}
                     {Gaussian} Random Number Generator",
  journal         = j-TRETS,
  volume          = "8",
  number          = "4",
  pages           = "23:1--23:??",
  month           = oct,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2629607",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Mon Oct 5 08:47:01 MDT 2015",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/prng.bib;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "Gaussian random number generators (GRNGs) are an
                     important component in parallel Monte Carlo simulations
                     using FPGAs, where tens or hundreds of high-quality
                     Gaussian samples must be generated per cycle using very
                     few logic resources. This article describes the
                     Table-Hadamard generator, which is a GRNG designed to
                     generate multiple streams of random numbers in
                     parallel. It uses discrete table distributions to
                     generate pseudo-Gaussian base samples, then a parallel
                     Hadamard transform to efficiently apply the central
                     limit theorem. When generating 64 output samples, the
                     Table-Hadamard requires just 130 slices per generated
                     sample, which is a third of the resources needed by the
                     next best technique, while still providing higher
                     statistical quality.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "23",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Jin:2015:MID,
  author          = "Zheming Jin and Jason D. Bakos",
  title           = "Memory Interface Design for {$3$D} Stencil Kernels on
                     a Massively Parallel Memory System",
  journal         = j-TRETS,
  volume          = "8",
  number          = "4",
  pages           = "24:1--24:??",
  month           = oct,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2800788",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Mon Oct 5 08:47:01 MDT 2015",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "Massively parallel memory systems are designed to
                     deliver high bandwidth at relatively low clock speed
                     for memory-intensive applications implemented on
                     programmable logic. For example, the Convey HC-1
                     provides 1,024 DRAM banks to each of four FPGAs through
                     a full crossbar, presenting a peak bandwidth of
                     76.8GB/s to the user logic. Such highly parallel memory
                     systems suffer from high latency, and their effective
                     bandwidth is highly sensitive to access ordering. To
                     achieve high performance, the user must use a
                     customized memory interface that combines scheduling,
                     latency hiding, and data reuse. In this article, we
                     describe the design of a custom memory interface for 3D
                     stencil kernels on the Convey HC-1 that incorporates
                     these features. Experimental results show that the
                     proposed memory interface achieves a speedup in runtime
                     of 2.2 for 6-point stencil and 9.5 for 27-point stencil
                     when compared to a naive memory interface.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "24",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Tan:2015:SHP,
  author          = "Guangming Tan and Chunming Zhang and Wendi Wang and
                     Peiheng Zhang",
  title           = "{SuperDragon}: a Heterogeneous Parallel System for
                     Accelerating {$3$D} Reconstruction of Cryo-Electron
                     Microscopy Images",
  journal         = j-TRETS,
  volume          = "8",
  number          = "4",
  pages           = "25:1--25:??",
  month           = oct,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2740966",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Mon Oct 5 08:47:01 MDT 2015",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "The data deluge in medical imaging processing requires
                     faster and more efficient systems. Due to the advance
                     in recent heterogeneous architecture, there has been a
                     resurgence in research aimed at domain-specific
                     accelerators. In this article, we develop an
                     experimental system SuperDragon for evaluating
                     acceleration of a single-particle Cryo-electron
                     microscopy (Cryo-EM) 3D reconstruction package EMAN
                     through a hybrid of CPU, GPU, and FPGA parallel
                     architecture. Based on a comprehensive workload
                     characterization, we exploit multigrained parallelism
                     in the Cryo-EM 3D reconstruction algorithm and
                     investigate a proper computational mapping to the
                     underlying heterogeneous architecture. The package is
                     restructured with task-level (MPI), thread-level
                     (OpenMP), and data-level (GPU and FPGA) parallelism.
                     Especially, the proposed FPGA accelerator is a stream
                     architecture that emphasizes the importance of
                     optimizing computing dominated data access patterns.
                     Besides, the configurable computing streams are
                     constructed by arranging the hardware modules and
                     bypassing channels to form a linear deep pipeline.
                     Compared to the multicore (six-core) program, the GPU
                     and FPGA implementations achieve speedups of 8.4 and
                     2.25 times in execution time while improving power
                     efficiency by factors of 7.2 and 14.2, respectively.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "25",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Biedermann:2015:SDR,
  author          = "Alexander Biedermann and Sorin A. Huss and Adeel
                     Israr",
  title           = "Safe Dynamic Reshaping of Reconfigurable {MPSoC}
                     Embedded Systems for Self-Healing and Self-Adaption
                     Purposes",
  journal         = j-TRETS,
  volume          = "8",
  number          = "4",
  pages           = "26:1--26:??",
  month           = oct,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2700416",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Mon Oct 5 08:47:01 MDT 2015",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "Multiprocessor system-on-chip (MPSoC) architectures
                     are a huge challenge in embedded system design. This
                     situation arises from the fact that available MPSoCs
                     and related designs flows are not tailored to the
                     specific needs of embedded systems. This work
                     demonstrates how to provide self-healing properties in
                     embedded MPSoC design. This is achieved by combining
                     the features of a generic approach to create
                     virtualizable MPSoCs out of off-the-shelf embedded
                     processors with a methodology to derive system
                     configurations, such as task-processor bindings, which
                     are optimal in terms of safety and execution time. The
                     virtualization properties enable a reshaping of the
                     MPSoC at runtime. Thus, system configurations may be
                     exchanged rapidly in a dynamic fashion. As a main
                     result of this work, embedded multiprocessor systems
                     are introduced, which dynamically adapt to changing
                     operating conditions, possible module defects, and
                     internal state changes. We demonstrate the figures of
                     merit of such reconfigurable MPSoC embedded systems by
                     means of a complex automotive application scenario
                     mapped to an FPGA featuring a virtualizable array of
                     eight soft-core processors.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "26",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Park:2015:PIC,
  author          = "Joonseok Park and Pedro C. Diniz",
  title           = "Program-Invariant Checking for Soft-Error Detection
                     using Reconfigurable Hardware",
  journal         = j-TRETS,
  volume          = "9",
  number          = "1",
  pages           = "1:1--1:??",
  month           = nov,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2751563",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Tue Dec 22 16:19:56 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "There is an increasing concern about transient errors
                     in deep submicron processor architectures.
                     Software-only error detection approaches that exploit
                     program invariants for silent error detection incur
                     large execution overheads and are unreliable as state
                     can be corrupted after invariant checkpoints. In this
                     article, we explore the use of configurable hardware
                     structures for the continuous evaluation of high-level
                     program invariants at the assembly level. We evaluate
                     the resource requirements and performance of the
                     proposed predicate-evaluation hardware structures when
                     integrated with a 32-bit MIPS soft core on a
                     contemporary reconfigurable hardware device. The
                     results, for a small set of kernel codes, reveal that
                     these hardware structures require a very small number
                     of hardware resources with negligible impact on the
                     processor core that they are integrated in. Moreover,
                     the amount of resources is fairly insensitive to the
                     complexity of the invariants, thus making the proposed
                     structures an attractive alternative to software-only
                     predicate checking.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "1",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Scicluna:2015:AMF,
  author          = "Neil Scicluna and Christos-Savvas Bouganis",
  title           = "{ARC 2014}: a Multidimensional {FPGA}-Based Parallel
                     {DBSCAN} Architecture",
  journal         = j-TRETS,
  volume          = "9",
  number          = "1",
  pages           = "2:1--2:??",
  month           = nov,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2724722",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Tue Dec 22 16:19:56 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "Clustering large numbers of data points is a very
                     computationally demanding task that often needs to be
                     accelerated in order to be useful in practical
                     applications. This work focuses on the Density-Based
                     Spatial Clustering of Applications with Noise (DBSCAN)
                     algorithm, which is one of the state-of-the-art
                     clustering algorithms, and targets its acceleration
                     using an FPGA device. The article presents an
                     optimized, scalable, and parameterizable architecture
                     that takes advantage of the internal memory structure
                     of modern FPGAs in order to deliver a high-performance
                     clustering system. Post-synthesis simulation results
                     show that the developed system can obtain mean speedups
                     of 31$ \times $ in real-world tests and 202$ \times $
                     in synthetic tests when compared to state-of-the-art
                     software counterparts running on a quad-core 3.4GHz
                     Intel i7-2600k. Additionally, this implementation is
                     also capable of clustering data with any number of
                     dimensions without impacting the performance.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "2",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Sasdrich:2015:ICS,
  author          = "Pascal Sasdrich and Tim G{\"u}neysu",
  title           = "Implementing {Curve25519} for Side-Channel--Protected
                     Elliptic Curve Cryptography",
  journal         = j-TRETS,
  volume          = "9",
  number          = "1",
  pages           = "3:1--3:??",
  month           = nov,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2700834",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Tue Dec 22 16:19:56 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "For security-critical embedded applications Elliptic
                     Curve Cryptography (ECC) has become the predominant
                     cryptographic system for efficient key agreement and
                     digital signatures. However, ECC still involves complex
                     modular arithmetic that is a particular burden for
                     small processors. In this context, Bernstein proposed
                     the highly efficient ECC instance Curve25519 that
                     particularly enables efficient software implementations
                     at a security level comparable to AES-128 with inherent
                     resistance to simple power analysis (SPA) and timing
                     attacks. In this work, we show that Curve25519 is
                     likewise competitive on FPGAs even when countermeasures
                     to thwart side-channel power analysis are included. Our
                     basic multicore DSP-based architectures achieves a
                     maximal performance of more than 32,000 point
                     multiplications per second on a Xilinx Zynq 7020 FPGA.
                     Including a mix of side-channel countermeasures to
                     impede simple and differential power analysis, we still
                     achieve more than 27,500 point multiplications per
                     second with a moderate increase in logic resources.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "3",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Zhang:2015:EAR,
  author          = "Jianfeng Zhang and Paul Chow and Hengzhu Liu",
  title           = "An Enhanced Adaptive Recoding Rotation {CORDIC}",
  journal         = j-TRETS,
  volume          = "9",
  number          = "1",
  pages           = "4:1--4:??",
  month           = nov,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2812813",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Tue Dec 22 16:19:56 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/elefunt.bib;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "The Conventional Coordinate Rotation Digital Computer
                     (CORDIC) algorithm has been widely used in many
                     applications, particularly in Direct Digital Frequency
                     Synthesizers (DDS) and Fast Fourier Transforms (FFT).
                     However, CORDIC is constrained by the excessive number
                     of iterations, angle data path, and scaling factor
                     compensation. In this article, an enhanced adaptive
                     recoding CORDIC (EARC) is proposed. It uses the
                     enhanced adaptive recoding method to reduce the
                     required iterations and adopts the trigonometric
                     transformation scheme to scale up the rotation angles.
                     Computing sine and cosine is used first to compare the
                     core functionality of EARC with basic CORDIC; then a
                     16-bit DDS and a 1,024-point FFT based on EARC are
                     evaluated to demonstrate the benefits of EARC in larger
                     applications. All the proposed architectures are
                     validated on a Virtex 5 FPGA development platform.
                     Compared with a commercial implementation of CORDIC,
                     EARC requires 33.3\% less hardware resources, provides
                     a twofold speedup, dissipates 70.4\% less power, and
                     improves accuracy in terms of the Bit Error Position
                     (BEP). Compared to the state-of-the-art Hybrid CORDIC,
                     EARC reduces latency by 11.1\% and consumes 17\% less
                     power. Compared with a commercial implementation of
                     DDS, the dissipated power of the proposed DDS is
                     reduced by 27.2\%. The proposed DDS improves
                     Spurious-Free Dynamic Range (SFDR) by nearly 7 dBc and
                     dissipates 21.8\% less power when compared with a
                     recently published DDS circuit. The FFT based on EARC
                     dissipates a factor of 2.05 less power than the
                     commercial FFT even when choosing the 100\% toggle rate
                     for the FFT based on EARC and the 12.5\% toggle rate
                     for the commercial FFT. Compared with a recently
                     published FFT, the FFT based on EARC improves
                     Signal-to-Noise Ratio (SNR) by 8.9 dB and consumes
                     7.78\% less power.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "4",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Goehringer:2015:GEA,
  author          = "Diana Goehringer and Marco D. Santambrogio and
                     Jo{\~a}o M. P. Cardoso and Koen Bertels",
  title           = "Guest Editorial: {ARC 2014}",
  journal         = j-TRETS,
  volume          = "9",
  number          = "1",
  pages           = "5:1--5:??",
  month           = nov,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2831431",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Tue Dec 22 16:19:56 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "5",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Heyse:2015:IRL,
  author          = "Karel Heyse and Jente Basteleus and Brahim {Al Farisi}
                     and Dirk Stroobandt and Oliver Kadlcek and Oliver
                     Pell",
  title           = "On the Impact of Replacing Low-Speed Configuration
                     Buses on {FPGAs} with the Chip's Internal Configuration
                     Infrastructure",
  journal         = j-TRETS,
  volume          = "9",
  number          = "1",
  pages           = "6:1--6:??",
  month           = nov,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2700835",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Tue Dec 22 16:19:56 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "It is common for large hardware designs to have a
                     number of registers or memories whose contents have to
                     be changed very seldom (e.g., only at startup). The
                     conventional way of accessing these memories is through
                     a low-speed memory bus. This bus uses valuable hardware
                     resources, introduces long global connections, and
                     contributes to routing congestion. Hence, it has an
                     impact on the overall design even though it is only
                     rarely used. A Field-Programmable Gate Array (FPGA)
                     already contains a global communication mechanism in
                     the form of its configuration infrastructure. In this
                     article, we evaluate the use of the configuration
                     infrastructure as a replacement for a low-speed memory
                     bus on the Maxeler HPC platform. We find that by
                     removing the conventional low-speed memory bus, the
                     maximum clock frequency of some applications can be
                     improved by 8\%. Improvements by 25\% and more are also
                     attainable, but constraints of the Xilinx
                     reconfiguration infrastructure prevent fully exploiting
                     these benefits at the moment. We present a number of
                     possible changes to the Xilinx reconfiguration
                     infrastructure and tools that would solve this and make
                     these results more widely applicable.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "6",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@Article{Duarte:2015:ACK,
  author          = "Rui Policarpo Duarte and Christos-Savvas Bouganis",
  title           = "{ARC 2014} Over-Clocking {KLT} Designs on {FPGAs}
                     under Process, Voltage, and Temperature Variation",
  journal         = j-TRETS,
  volume          = "9",
  number          = "1",
  pages           = "7:1--7:??",
  month           = nov,
  year            = "2015",
  CODEN           = "????",
  DOI             = "https://doi.org/10.1145/2818380",
  ISSN            = "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L          = "1936-7406",
  bibdate         = "Tue Dec 22 16:19:56 MST 2015",
  bibsource       = "http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract        = "Karhunen--Lo{\`e}ve Transformation is a widely used
                     algorithm in signal processing that is often implemented
                     with high-throughput requisites. This work presents a
                     novel methodology to optimise KLT designs on FPGAs that
                     outperform typical design methodologies, through a
                     prior characterisation of the arithmetic units in the
                     datapath of the circuit under various operating
                     conditions. Limited by the ever-increasing process
                     variation, the delay models available in synthesis
                     tools are no longer suitable for extreme performance
                     optimisation of designs, and as they are generic, they
                     need to consider the worst-case performance for a given
                     fabrication process. Hence, they heavily penalise the
                     maximum possible achieved performance of a design by
                     leaving safety margin. This work presents a novel
                     unified optimisation framework which contemplates a
                     prior characterisation of the embedded multipliers on
                     the target FPGA device under process, voltage, and
                     temperature variation. The proposed framework allows a
                     design space exploration leading to designs without any
                     latency overheads that achieve high throughput while
                     producing less errors than typical methodologies,
                     operating with the same throughput. Experimental
                     results demonstrate that the proposed methodology
                     outperforms the typical implementation in three
                     real-life design strategies: high performance, low
                     power, and temperature variation; and it produced
                     circuit designs that performed up to 18dB better when
                     over-clocked.",
  acknowledgement = ack-nhfb,
  ajournal        = "ACM Trans. Reconfigurable Technol. Syst.",
  articleno       = "7",
  fjournal        = "ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)",
  journal-URL     = "https://dl.acm.org/loi/trets",
}
@article{Bai:2015:ATF,
  author = {Yuhui Bai and Syed Zahid Ahmed and Bertrand Granado},
  title = {{ARC 2014}: Towards a Fast {FPGA} Implementation of a
    Heap-Based Priority Queue for Image Coding Using a
    Parallel Index-Aware Tree},
  journal = j-TRETS,
  volume = {9},
  number = {1},
  pages = {8:1--8:??},
  month = nov,
  year = {2015},
  coden = {????},
  doi = {https://doi.org/10.1145/2766454},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Tue Dec 22 16:19:56 MST 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {The embedded image processing systems like smartphones
    and digital cameras have tight limits on storage,
    computation power, network connectivity, and battery
    usage. These limitations make it important to ensure
    efficient image coding. In the article, we present a
    novel heap-based priority queue structure employed by
    an Adaptive Scanning of Wavelet Data scheme (ASWD)
    targeting an embedded platform. ASWD is a context
    modeling block implemented via priority queues in a
    wavelet-based image coder to reorganize the wavelet
    coefficients into locally stationary sequences. The
    architecture we propose exploits efficient use of
    FPGA's on-chip dual-port memories in an adaptive
    manner. Innovations of index-aware system linked to
    each element in the queue makes the location of queue
    element traceable in the heap as per the requirements
    of the ASWD algorithm. Moreover, use of 4-port memories
    along with intelligent data concatenation of queue
    elements yielded in a cost effective enhanced memory
    access. The memory ports are adaptively assigned to
    different units during different processing phases in a
    manner to optimally take advantage of memory access
    required by that phase. The architectural innovations
    can also be exploited in other applications that
    require efficient hardware implementations of generic
    priority queue or classical sorting applications which
    sort into the index. We designed and validated the
    hardware on an Altera's Stratix IV FPGA as an IP
    accelerator in a Nios II processor based System on
    Chip. We show that our architecture at 150MHz can
    provide 45X speedup compared to an embedded ARM
    Cortex-A9 processor at 666MHz targeting the throughput
    of 10MB/s.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {8},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}
@article{Zhang:2016:CBE,
  author = {Jianfeng Zhang and Paul Chow and Hengzhu Liu},
  title = {{CORDIC}-Based Enhanced Systolic Array Architecture
    for {$ Q R $} Decomposition},
  journal = j-TRETS,
  volume = {9},
  number = {2},
  pages = {9:1--9:??},
  month = feb,
  year = {2016},
  coden = {????},
  doi = {https://doi.org/10.1145/2827700},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Tue Dec 22 16:19:57 MST 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {Multiple input multiple output (MIMO) with orthogonal
    frequency division multiplexing (OFDM) systems
    typically use orthogonal-triangular (QR) decomposition.
    In this article, we present an enhanced systolic array
    architecture to realize QR decomposition based on the
    Givens rotation (GR) method for a 4 $ \times $ 4 real
    matrix. The coordinate rotation digital computer
    (CORDIC) algorithm is adopted and modified to speed up
    and simplify the process of GR. To verify the function
    and evaluate the performance, the proposed
    architectures are validated on a Virtex 5 FPGA
    development platform. Compared to a commercial
    implementation of vectoring CORDIC, the enhanced
    vectoring CORDIC is presented that uses 37.7\% less
    hardware resources, dissipates 71.6\% less power, and
    provides a 1.8 times speedup while maintaining the same
    computation accuracy. The enhanced QR systolic array
    architecture based on the enhanced vectoring CORDIC
    saves 24.5\% in power dissipation, provides a factor of
    1.5-fold improvement in throughput, and the hardware
    efficiency is improved 1.45-fold with no accuracy
    penalty when compared to our previously proposed QR
    systolic array architecture.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {9},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}
@article{Winterstein:2016:SLH,
  author = {Felix J. Winterstein and Samuel R. Bayliss and George
    A. Constantinides},
  title = {Separation Logic for High-Level Synthesis},
  journal = j-TRETS,
  volume = {9},
  number = {2},
  pages = {10:1--10:??},
  month = feb,
  year = {2016},
  coden = {????},
  doi = {https://doi.org/10.1145/2836169},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Tue Dec 22 16:19:57 MST 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {High-Level Synthesis (HLS) promises a significant
    shortening of the FPGA design cycle by raising the
    abstraction level of the design entry to high-level
    languages such as C/C++. However, applications using
    dynamic, pointer-based data structures and dynamic
    memory allocation remain difficult to implement well,
    yet such constructs are widely used in software.
    Automated optimizations that leverage the memory
    bandwidth of FPGAs by distributing the application data
    over separate banks of on-chip memory are often
    ineffective in the presence of dynamic data structures
    due to the lack of an automated analysis of
    pointer-based memory accesses. In this work, we take a
    step toward closing this gap. We present a static
    analysis for pointer-manipulating programs that
    automatically splits heap-allocated data structures
    into disjoint, independent regions. The analysis
    leverages recent advances in separation logic, a
    theoretical framework for reasoning about
    heap-allocated data that has been successfully applied
    in recent software verification tools. Our algorithm
    focuses on dynamic data structures accessed in loops
    and is accompanied by automated source-to-source
    transformations that enable automatic loop
    parallelization and memory partitioning by
    off-the-shelf HLS tools. We demonstrate the successful
    loop parallelization and memory partitioning by our
    tool flow using three real-life applications that
    build, traverse, update, and dispose of dynamically
    allocated data structures. Our case studies, comparing
    the automatically parallelized to the direct HLS
    implementations, show an average latency reduction by a
    factor of 2 $ \times $ across our benchmarks.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {10},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}
@article{Xu:2016:CGA,
  author = {Jinwei Xu and Jingfei Jiang and Yong Dou and Xiaolong
    Shen and Zhiqiang Liu},
  title = {Coarse-Grained Architecture for Fingerprint Matching},
  journal = j-TRETS,
  volume = {9},
  number = {2},
  pages = {12:1--12:??},
  month = feb,
  year = {2016},
  coden = {????},
  doi = {https://doi.org/10.1145/2791296},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Tue Dec 22 16:19:57 MST 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {Fingerprint matching is a key procedure in fingerprint
    identification applications. The minutiae-based
    fingerprint matching algorithm is one of the most
    typical algorithms achieving a reasonably correct
    recognition rate. This study proposes a coarse-grained
    parallel architecture called fingerprint matching core
    (FMC) to accelerate fingerprint matching. The proposed
    architecture has a two-level parallel structure (i.e.,
    parallel among groups (PAG) and parallel in group
    (PIG)). A multirequest controller is added to the PAG
    structure to obtain a concurrent operation of the
    multiple processing element group (PEG). The DDR3
    controller is used in the PIG structure to read eight
    minutiae from eight different fingerprints and realize
    the simultaneous computation of the eight PEs. The
    whole system is implemented on a Xilinx FPGA board with
    a Virtex VII XC7VX485T chip. The 16-PEG FMC achieves a
    throughput of about 9.63 million fingerprint pairs per
    second, which is larger than that achieved on a Tesla
    K20c platform. The software execution times are also
    measured on the 2.93GHz Intel Xeon 5670, 2.3GHz AMD
    Opteron(tm) Processor 6376, and Tesla K20c platforms.
    The Intel Xeon 5670 has two processors with 12 cores,
    and the AMD Opteron(tm) Processor 6376 has two
    processors with 16 cores. Moreover, the throughput is
    about 31 times that achieved on a 2.93GHz Intel Xeon
    5670 single core.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {12},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}
%%% The parenthetical aside in the abstract is set off with TeX
%%% em-dashes (---) rather than single hyphens.
@Article{Zaidi:2016:VSF,
author = "Ali Mustafa Zaidi and David Greaves",
title = "Value State Flow Graph: a Dataflow Compiler {IR} for
Accelerating Control-Intensive Code in Spatial
Hardware",
journal = j-TRETS,
volume = "9",
number = "2",
pages = "14:1--14:??",
month = feb,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2807702",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Dec 22 16:19:57 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Although custom (and reconfigurable) computing can
provide orders-of-magnitude improvements in energy
efficiency and performance for many numeric,
data-parallel applications, performance on nonnumeric,
sequential code is often worse than conventional
superscalar processors. This work attempts to improve
sequential performance in custom hardware by (a)
switching from a statically scheduled to a dynamically
scheduled (dataflow) execution model and (b) developing
a new compiler IR for high-level synthesis---the value
state flow graph (VSFG)---that enables aggressive
exposition of ILP even in the presence of complex
control flow. Compared to existing control-data flow
graph (CDFG)-based IRs, the VSFG exposes more
instruction-level parallelism from control-intensive
sequential code by exploiting aggressive speculation,
enabling control dependence analysis, as well as
execution along multiple flows of control. This new IR
is directly implemented as a static-dataflow graph in
hardware by our prototype high-level synthesis tool
chain and shows an average speedup of 1.13$ \times $
over equivalent hardware generated using LegUp, an
existing CDFG-based HLS tool. Furthermore, the VSFG
allows us to further trade area and energy for
performance through loop unrolling, increasing the
average speedup to 1.55$ \times $, with a peak speedup
of 4.05$ \times $. Our VSFG-based hardware approaches
the sequential cycle counts of an Intel Nehalem Core i7
processor while consuming only 0.25$ \times $ the
energy of an in-order Altera Nios II f processor.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "14",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@article{Raitza:2016:RRN,
  author = {Michael Raitza and Markus Vogt and Christian
    Hochberger and Thilo Pionteck},
  title = {{RAW 2014}: Random Number Generators on {FPGAs}},
  journal = j-TRETS,
  volume = {9},
  number = {2},
  pages = {15:1--15:??},
  month = feb,
  year = {2016},
  coden = {????},
  doi = {https://doi.org/10.1145/2807699},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Tue Dec 22 16:19:57 MST 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/prng.bib;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {Random numbers are important ingredients in a number
    of applications. Especially in a security context, they
    must be well distributed and unpredictable. We
    investigate the practical use of random number
    generators (RNGs) that are built from digital elements
    found in FPGAs. For this, we implement different types
    of ring oscillators (ROs) and memory collision-based
    circuits on FPGAs from major vendors. Implementing RNGs
    on the same device as the rest of the system benefits
    an overall reduction of vulnerability to attacks and
    wire tapping. Nevertheless, we investigate different
    attacks by tampering with power supply, chip
    temperature, and by exposition to strong magnetic
    fields and X-radiation. We also consider their
    usability as massively deployed components, whose
    functionality cannot be tested individually anymore, by
    conducting a technology invariance experiment. Our
    experiments show that BlockRAM-based RNGs cannot be
    considered as a suitable entropy source. We further
    show that RO-based RNGs work reliably under a wide
    range of operating conditions. While magnetic fields
    and X-rays did not induce any notable change, voltage
    and temperature variations caused an increase in
    propagation delays within the circuits. We show how
    reliable RNGs can be constructed and deployed on
    FPGAs.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {15},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}
@article{Attia:2016:RAD,
  author = {Osama G. Attia and Kevin R. Townsend and Phillip H.
    Jones and Joseph Zambreno},
  title = {A Reconfigurable Architecture for the Detection of
    Strongly Connected Components},
  journal = j-TRETS,
  volume = {9},
  number = {2},
  pages = {16:1--16:??},
  month = feb,
  year = {2016},
  coden = {????},
  doi = {https://doi.org/10.1145/2807700},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Tue Dec 22 16:19:57 MST 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {The Strongly Connected Components (SCCs) detection
    algorithm serves as a keystone for many graph analysis
    applications. The SCC execution time for large-scale
    graphs, as with many other graph algorithms, is
    dominated by memory latency. In this article, we
    investigate the design of a parallel hardware
    architecture for the detection of SCCs in directed
    graphs. We propose a design methodology that alleviates
    memory latency and problems with irregular memory
    access. The design is composed of 16 processing
    elements dedicated to parallel Breadth-First Search
    (BFS) and eight processing elements dedicated to
    finding intersection in parallel. Processing elements
    are organized to reuse resources and utilize memory
    bandwidth efficiently. We demonstrate a prototype of
    our design using the Convey HC-2 system, a commercial
    high-performance reconfigurable computing coprocessor.
    Our experimental results show a speedup of as much as
    17$ \times $ for detecting SCCs in large-scale graphs
    when compared to a conventional sequential software
    implementation.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {16},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}
@article{Kapre:2016:OSV,
  author = {Nachiket Kapre},
  title = {Optimizing Soft Vector Processing in {FPGA}-Based
    Embedded Systems},
  journal = j-TRETS,
  volume = {9},
  number = {3},
  pages = {17:1--17:??},
  month = jul,
  year = {2016},
  coden = {????},
  doi = {https://doi.org/10.1145/2912884},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Thu Jul 14 16:35:43 MDT 2016},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {Soft vector processors can augment and extend the
    capability of FPGA-based embedded systems-on-chip such
    as the Xilinx Zynq. However, configuring and optimizing
    the soft processor for best performance is hard. We
    must consider architectural parameters such as
    precision, vector lane count, vector length, chunk
    size, and DMA scheduling to ensure efficient execution
    of code on the soft vector processing platform. To
    simplify the design process, we develop a compiler
    framework and an autotuning runtime that splits the
    optimization into a combination of static and dynamic
    passes that map data-parallel computations to the soft
    processor. We compare and contrast implementations
    running on the scalar ARM processor, the embedded NEON
    hard vector engine, and low-level streaming Verilog
    designs with the VectorBlox MXP soft vector processor.
    Across a range of data-parallel benchmarks, we show
    that the MXP soft vector processor can outperform other
    organizations by up to $ 4 \times $ while saving $
    \approx 10 \% $ dynamic power. Our compilation and
    runtime framework is also able to outperform the gcc
    NEON vectorizer under certain conditions by explicit
    generation of NEON intrinsics and performance tuning of
    the autogenerated data-parallel code. When constrained
    by IO bandwidth, soft vector processors are even
    competitive with spatial Verilog implementations of
    computation.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {17},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}
%%% The first author's surname is spelled "DeHon", matching entry
%%% DeHon:2008:GET earlier in this file.  The citation key below keeps
%%% its original "Dehon" spelling so that existing citations of this
%%% entry still resolve.
@Article{Dehon:2016:ISI,
author = "Andr{\'e} DeHon and Derek Chiou",
title = "Introduction to Special Issue on Reconfigurable
Components with Source Code",
journal = j-TRETS,
volume = "9",
number = "3",
pages = "19:1--19:??",
month = jul,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2907949",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Jul 14 16:35:43 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "19",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
%%% Pages follow this file's articleno:page convention, and the
%%% article number is recorded in the articleno field, as in the
%%% neighbouring entries of volume 9, number 3.
@Article{Fang:2016:OSV,
author = "Xin Fang and Miriam Leeser",
title = "Open-Source Variable-Precision Floating-Point Library
for Major Commercial {FPGAs}",
journal = j-TRETS,
volume = "9",
number = "3",
pages = "20:1--20:17",
month = jul,
year = "2016",
DOI = "https://doi.org/10.1145/2851507",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Feb 8 10:53:20 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/2851507",
abstract = "There is increased interest in implementing
floating-point designs for different precisions that
take advantage of the flexibility offered by
Field-Programmable Gate Arrays (FPGAs). In this
article, we present updates to the Variable-precision
FLOATing Point Library (VFLOAT) developed at
Northeastern University and highlight recent
improvements in implementations for implementing
reciprocal, division, and square root components that
scale to double precision for FPGAs from the two major
vendors: Altera and Xilinx. Our library is open source
and flexible and provides the user with many options. A
designer has many tradeoffs to consider including clock
frequency, total latency, and resource usage as well as
target architecture. We compare the generated cores to
those produced by each vendor and to another popular
open-source tool: FloPoCo. VFLOAT has the advantage of
not tying the user's design to a specific target
architecture and of providing the maximum flexibility
for all options including clock frequency and latency
compared to other alternatives. Our results show that
variable-precision as well as double-precision designs
can easily be accommodated and the resulting components
are competitive and in many cases superior to the
alternatives.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "20",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@article{Wilson:2016:UAA,
  author = {David Wilson and Greg Stitt},
  title = {The Unified Accumulator Architecture: a Configurable,
    Portable, and Extensible Floating-Point Accumulator},
  journal = j-TRETS,
  volume = {9},
  number = {3},
  pages = {21:1--21:??},
  month = jul,
  year = {2016},
  coden = {????},
  doi = {https://doi.org/10.1145/2809432},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Thu Jul 14 16:35:43 MDT 2016},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/fparith.bib;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {Applications accelerated by field-programmable gate
    arrays (FPGAs) often require pipelined floating-point
    accumulators with a variety of different trade-offs.
    Although previous work has introduced numerous
    floating-point accumulation architectures, few cores
    are available for public use, which forces designers to
    use fixed-point implementations or vendor-provided
    cores that are not portable and are often not optimized
    for the desired set of trade-offs. In this article, we
    combine and extend previous floating-point accumulator
    architectures into a configurable, open-source core,
    referred to as the unified accumulator architecture
    (UAA), which enables designers to choose between
    different trade-offs for different applications. UAA is
    portable across FPGAs and allows designers to
    specialize the underlying adder core to take advantage
    of device-specific optimizations. By providing an
    extensible, open-source implementation, we hope for the
    research community to extend the provided core with new
    architectures and optimizations.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {21},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}
@article{Abdelhadi:2016:MSM,
  author = {Ameer M. S. Abdelhadi and Guy G. F. Lemieux},
  title = {Modular Switched Multiported {SRAM}-Based Memories},
  journal = j-TRETS,
  volume = {9},
  number = {3},
  pages = {22:1--22:??},
  month = jul,
  year = {2016},
  coden = {????},
  doi = {https://doi.org/10.1145/2851506},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Thu Jul 14 16:35:43 MDT 2016},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {Multiported RAMs are essential for high-performance
    parallel computation systems. VLIW and vector
    processors, CGRAs, DSPs, CMPs, and other processing
    systems often rely upon multiported memories for
    parallel access. Although memories with a large number
    of read and write ports are important, their high
    implementation cost means that they are used sparingly.
    As a result, FPGA vendors only provide dual-ported
    block RAMs (BRAMs) to handle the majority of usage
    patterns. Furthermore, recent attempts to create
    FPGA-based multiported memories suffer from low storage
    utilization. Whereas most approaches provide simple
    unidirectional ports with a fixed read or write, others
    propose true bidirectional ports where each port
    dynamically switches read and write. True RAM ports are
    useful for systems with transceivers and provide high
    RAM flexibility; however, this flexibility incurs high
    BRAM consumption. In this article, a novel, modular,
    and BRAM-based switched multiported RAM architecture is
    proposed. In addition to unidirectional ports with
    fixed read/write, this switched architecture allows a
    group of write ports to switch with another group of
    read ports dynamically, hence altering the number of
    active ports. The proposed switched-ports architecture
    is less flexible than a true-multiported RAM where each
    port is switched individually. Nevertheless, switched
    memories can dramatically reduce BRAM consumption
    compared to true ports for systems with alternating
    port requirements. Previous live-value-table (LVT) and
    XOR approaches are merged and optimized into a
    generalized and modular structure that we call an
    invalidation-based live-value-table (I-LVT). Like a
    regular LVT, the I-LVT determines the correct bank to
    read from, but it differs in how updates to the table
    are made; the LVT approach requires multiple write
    ports, often leading to an area-intensive
    register-based implementation, whereas the XOR approach
    suffers from excessive storage overhead since wider
    memories are required to accommodate the XOR-ed data.
    Two specific I-LVT implementations are proposed and
    evaluated: binary and thermometer coding. The I-LVT
    approach is especially suitable for deep memories
    because the table is implemented only in SRAM cells.
    The I-LVT method gives higher performance while
    occupying fewer BRAMs than earlier approaches: for
    several configurations, BRAM usage is reduced by
    greater than 44\% and clock speed is improved by
    greater than 76\%. The I-LVT can be used with fixed
    ports, true ports, or the proposed switched ports
    architectures. Formal proofs for the suggested methods,
    resources consumption analysis, usage guidelines, and
    analytic comparison to other methods are provided. A
    fully parameterized Verilog implementation is released
    as an open source library. The library has been
    extensively tested using Altera's EDA tools.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {22},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}
@article{Stitt:2016:PSW,
  author = {Greg Stitt and Eric Schwartz and Patrick Cooke},
  title = {A Parallel Sliding-Window Generator for
    High-Performance Digital-Signal Processing on {FPGAs}},
  journal = j-TRETS,
  volume = {9},
  number = {3},
  pages = {23:1--23:??},
  month = jul,
  year = {2016},
  coden = {????},
  doi = {https://doi.org/10.1145/2800789},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Thu Jul 14 16:35:43 MDT 2016},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {Sliding-window applications, an important class of the
    digital-signal processing domain, are highly amenable
    to pipeline parallelism on field-programmable gate
    arrays (FPGAs). Although memory bandwidth often
    restricts parallelism for many applications,
    sliding-window applications can leverage custom
    buffers, referred to as sliding-window generators, that
    provide massive input bandwidth that far exceeds the
    capabilities of external memory. Previous work has
    introduced a variety of sliding-window generators, but
    those approaches typically generate at most one window
    per cycle, which significantly restricts parallelism.
    In this article, we address this limitation with a
    parallel sliding-window generator that can generate a
    configurable number of windows every cycle. Although in
    practice the number of parallel windows is limited by
    memory bandwidth, we show that even with common
    bandwidth limitations, the presented generator enables
    near-linear speedups up to 16x faster than previous
    FPGA studies that generate a single window per cycle,
    which were already in some cases faster than
    graphics-processing units and microprocessors.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {23},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}
@article{Ul-Abdin:2016:RCF,
  author = {Zain Ul-Abdin and Bertil Svensson},
  title = {A Retargetable Compilation Framework for Heterogeneous
    Reconfigurable Computing},
  journal = j-TRETS,
  volume = {9},
  number = {4},
  pages = {24:1--24:??},
  month = sep,
  year = {2016},
  coden = {????},
  doi = {https://doi.org/10.1145/2843946},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Mon Apr 3 11:34:08 MDT 2017},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {The future trend in microprocessors for the more
    advanced embedded systems is focusing on massively
    parallel reconfigurable architectures, consisting of
    heterogeneous ensembles of hundreds of processing
    elements communicating over a reconfigurable
    interconnection network. However, the mastering of
    low-level microarchitectural details involved in the
    programming of such massively parallel platforms
    becomes too cumbersome, which limits their adoption in
    many applications. Thus, there is a dire need for an
    approach to produce high-performance scalable
    implementations that harness the computational
    resources of the emerging reconfigurable platforms.
    This article addresses the grand challenge of
    accessibility of these diverse reconfigurable platforms
    by suggesting the use of a high-level language,
    occam-pi, and developing a complete design flow for
    building, compiling, and generating machine code for
    heterogeneous coarse-grained hardware. We have
    evaluated the approach by implementing complex
    industrial case studies and three common signal
    processing algorithms. The results of the implemented
    case studies suggest that the occam-pi language-based
    approach, because of its well-defined semantics for
    expressing concurrency and reconfigurability,
    simplifies the development of applications employing
    runtime reconfigurable devices. The associated compiler
    framework ensures portability as well as the
    performance benefits across heterogeneous platforms.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {24},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}
@article{Ziener:2016:FBD,
  author = {Daniel Ziener and Florian Bauer and Andreas Becher and
    Christopher Dennl and Klaus Meyer-Wegener and Ute
    Sch{\"u}rfeld and J{\"u}rgen Teich and J{\"o}rg-Stephan
    Vogt and Helmut Weber},
  title = {{FPGA}-Based Dynamically Reconfigurable {SQL} Query
    Processing},
  journal = j-TRETS,
  volume = {9},
  number = {4},
  pages = {25:1--25:??},
  month = sep,
  year = {2016},
  coden = {????},
  doi = {https://doi.org/10.1145/2845087},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Mon Apr 3 11:34:08 MDT 2017},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {In this article, we propose an FPGA-based SQL query
    processing approach exploiting the capabilities of
    partial dynamic reconfiguration of modern FPGAs. After
    the analysis of an incoming query, a query-specific
    hardware processing unit is generated on the fly and
    loaded on the FPGA for immediate query execution. For
    each query, a specialized hardware accelerator pipeline
    is composed and configured on the FPGA from a set of
    presynthesized hardware modules. These partially
    reconfigurable hardware modules are gathered in a
    library covering all major SQL operations like
    restrictions and aggregations, as well as more complex
    operations such as joins and sorts. Moreover, this
    holistic query processing approach in hardware supports
    different data processing strategies including row- as
    column-wise data processing in order to optimize data
    communication and processing. This article gives an
    overview of the proposed query processing methodology
    and the corresponding library of modules. Additionally,
    a performance analysis is introduced that is able to
    estimate the processing time of a query for different
    processing strategies and different communication and
    processing architecture configurations. With the help
    of this performance analysis, architectural bottlenecks
    may be exposed and future optimized architectures,
    besides the two prototypes presented here, may be
    determined.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {25},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}
@Article{Matthews:2016:SMM,
  author =       "Eric Matthews and Lesley Shannon and Alexandra
                 Fedorova",
  title =        "Shared Memory Multicore {MicroBlaze} System with {SMP}
                 {Linux} Support",
  journal =      j-TRETS,
  volume =       "9",
  number =       "4",
  pages =        "26:1--26:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2870638",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:08 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In this work, we present PolyBlaze, a scalable and
                 configurable multicore platform for FPGA-based embedded
                 systems and systems research. PolyBlaze is an extension
                 of the MicroBlaze soft processor, leveraging the
                 configurability of the MicroBlaze and bringing it into
                 the multicore era with Linux Symmetric Multi-Processor
                 (SMP) support. This work details the hardware
                 modifications required for the MicroBlaze processor and
                 its software stack to enable fully validated SMP
                 operations, including atomic operation support, shared
                 interrupts and timers, and exception handling. New in
                 this work, we present a scalable and flexible memory
                 hierarchy optimized for Field Programmable Gate Arrays
                 (FPGAs), which manages atomic operations and provides
                 support for future flexible memory hierarchies and
                 heterogeneous systems. Also new is an in-depth analysis
                 of key performance characteristics, including memory
                 bandwidth, latency, and resource usage. For all system
                 configurations, bandwidth is found to scale linearly
                 with the addition of processor cores until the memory
                 interface is saturated. Additionally, average memory
                 latency remains constant until the memory interface is
                 saturated; after which, it scales linearly with each
                 additional processor core.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Yu:2016:OAH,
  author =       "Ting Yu and Chris Bradley and Oliver Sinnen",
  title =        "{ODoST}: Automatic Hardware Acceleration for
                 Biomedical Model Integration",
  journal =      j-TRETS,
  volume =       "9",
  number =       "4",
  pages =        "27:1--27:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2870639",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:08 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Dynamic biomedical systems are mathematically
                 described by Ordinary Differential Equations (ODEs) and
                 their solution is often one of the most computationally
                 intensive parts in biomedical simulations. With high
                 inherent parallelism, hardware acceleration based on
                 Field-Programmable Gate Arrays (FPGAs) has great
                 potential to increase the computational performance of
                 the model simulations, while being very
                 power-efficient. However, the manual hardware
                 implementation is complex and time consuming. The
                 advantages of FPGA designs can only be realised if
                 there is a general solution to automate the process. In
                 this article, we propose a domain-specific high-level
                 synthesis tool called ODoST that automatically
                 generates an FPGA-based Hardware Accelerator Module
                 (HAM) from a high-level description. In this direct
                 approach, ODE equations are directly mapped to
                 processing pipelines without any intermediate
                 architecture layer of processing elements. We evaluate
                 the generated HAMs on real hardware based on their
                 resource usage, processing speed, and power
                 consumption, and compare them with CPUs and a GPU. The
                 results show that FPGA implementations can achieve 15.3
                 times more speedup compared to a single core CPU
                 solution and perform similarly to an auto-generated GPU
                 solution, while the FPGA implementations can achieve
                 14.5 times more power efficiency than the CPU and 3.1
                 times compared to the optimised GPU solution. Improved
                 speedups are foreseeable based on further
                 optimisations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Chen:2016:I,
  author =       "Deming Chen",
  title =        "Introduction",
  journal =      j-TRETS,
  volume =       "9",
  number =       "4",
  pages =        "28:1--28:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2955103",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:08 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Wegley:2016:ASD,
  author =       "Evan Wegley and Yanhua Yi and Qinhai Zhang",
  title =        "Application of Specific Delay Window Routing for
                 Timing Optimization in {FPGA} Designs",
  journal =      j-TRETS,
  volume =       "9",
  number =       "4",
  pages =        "29:1--29:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2892640",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:08 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In addition to optimizing for long-path timing and
                 routability, commercial FPGA routing engines must also
                 optimize for various timing constraints, enabling users
                 to fine tune their designs. These timing constraints
                 involve both long- and short-path timing requirements.
                 The intricacies of commercial FPGA architectures add
                 difficulty to the problem of supporting such
                 constraints. In this work, we introduce specific delay
                 window routing as a general method for optimization
                 during the routing stage of the FPGA design flow, which
                 can be applied to various timing constraints
                 constituting both long- and short-path requirements.
                 Furthermore, we propose a key adjustment to standard
                 FPGA routing technology for the purposes of specific
                 delay window routing. By using dual-wave expansion
                 instead of traditional single-wave expansion, we solve
                 the critical issue of inaccurate delay estimation in
                 our wave search, which would otherwise make routing
                 according to a specific delay window difficult. Our
                 results show that this dual-wave method can support
                 stricter timing constraints than the standard
                 single-wave method. For a suite of designs with
                 constraints requiring connections to meet a target
                 delay within 250ps, our dual-wave method could satisfy
                 the requirement for all designs, whereas the
                 single-wave method failed for more than two thirds of
                 the designs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Kadric:2016:IPM,
  author =       "Edin Kadric and David Lakata and Andr{\'e} DeHon",
  title =        "Impact of Parallelism and Memory Architecture on
                 {FPGA} Communication Energy",
  journal =      j-TRETS,
  volume =       "9",
  number =       "4",
  pages =        "30:1--30:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2857057",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:08 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The energy in FPGA computations is dominated by data
                 communication energy, either in the form of memory
                 references or data movement on interconnect. In this
                 article, we explore how to use data placement and
                 parallelism to reduce communication energy. We show
                 that parallelism can reduce energy and that the optimal
                 level of parallelism increases with the problem size.
                 We further explore how FPGA memory architecture (memory
                 block size(s), memory banking, and spacing between
                 memory banks) can impact communication energy, and
                 determine how to organize the memory architecture to
                 guarantee that the energy overhead compared to the
                 optimally matched architecture for the design is never
                 more than 60\%. We specifically show that an
                 architecture with 32 bit wide, 16Kb internally banked
                 memories placed every 8 columns of 10 4-LUT logic
                 blocks is within 61\% of the optimally matched
                 architecture across the VTR 7 benchmark set and a set
                 of parallelism-tunable benchmarks. Without internal
                 banking, the worst-case overhead is 98\%, achieved with
                 an architecture with 32 bit wide, 8Kb memories placed
                 every 9 columns, roughly comparable to the memory
                 organization on the Cyclone V (where memories are
                 placed about every 10 columns). Monolithic 32 bit wide,
                 16Kb memories placed every 10 columns (comparable to
                 18Kb and 20Kb memories used in Virtex 4 and Stratix V
                 FPGAs) have a 180\% worst-case energy overhead.
                 Furthermore, we show practical cases where designs
                 mapped for optimal parallelism use $ 4.7 \times $ less
                 energy than designs using a single processing
                 element.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Rodionov:2016:FGI,
  author =       "Alex Rodionov and David Biancolin and Jonathan Rose",
  title =        "Fine-Grained Interconnect Synthesis",
  journal =      j-TRETS,
  volume =       "9",
  number =       "4",
  pages =        "31:1--31:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2892641",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:08 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "One of the key challenges for the FPGA industry going
                 forward is to make the task of designing hardware
                 easier. A significant portion of that design task is
                 the creation of the interconnect pathways between
                 functional structures. We present a synthesis tool that
                 automates this process and focuses on the interconnect
                 needs in the fine-grained (sub-IP-block) design space.
                 Here there are several issues that prior research and
                 tools do not address well: the need to have fixed,
                 deterministic latency between communicating units (to
                 enable high-performance local communication without the
                 area overheads of latency insensitivity), and the
                 ability to avoid generating unnecessary arbitration
                 hardware when the application design can avoid it.
                 Using a design example, our tool generates interconnect
                 that requires 69\% fewer lines of specification code
                 than a handwritten Verilog implementation, which is a
                 32\% overall reduction for the entire application. The
                 resulting system, while requiring 6\% more total
                 functional and interconnect area, achieves the same
                 performance. We also show a quantitative and
                 qualitative advantages against an existing commercial
                 interconnect synthesis tool, over which we achieve a
                 25\% performance advantage and 15\%/57\% logic/memory
                 area savings.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Wulf:2016:FEO,
  author =       "Nicholas Wulf and Alan D. George and Ann Gordon-Ross",
  title =        "A Framework for Evaluating and Optimizing {FPGA}-Based
                 {SoCs} for Aerospace Computing",
  journal =      j-TRETS,
  volume =       "10",
  number =       "1",
  pages =        "1:1--1:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2888400",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:09 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "On-board processing systems are often deployed in
                 harsh aerospace environments and must therefore adhere
                 to stringent constraints such as low power, small size,
                 and high dependability in the presence of faults.
                 Field-programmable gate arrays (FPGAs) are often an
                 attractive option for designers seeking low-power,
                 high-performance devices. However, unlike
                 nonreconfigurable devices, radiation effects can alter
                 an FPGA's functionality instead of just the device's
                 data, requiring designers to consider fault-tolerant
                 strategies to mitigate these effects. In this article,
                 we present a framework to ease these system design
                 challenges and aid designers in considering a broad
                 range of devices and fault-tolerant strategies for
                 on-board processing, highlighting the most promising
                 options and tradeoffs early in the design process. This
                 article focuses on the power, dependability, and
                 lifetime evaluation metrics, which our framework
                 calculates and leverages to evaluate the effectiveness
                 of varying system-on-chip (SoC) designs. Finally, we
                 use our framework to evaluate SoC designs for a case
                 study on a hyperspectral-imaging (HSI) mission to
                 demonstrate our framework's ability to identify
                 efficient and effective SoC designs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Richardson:2016:AFR,
  author =       "Justin Richardson and Alan George and Kevin Cheng and
                 Herman Lam",
  title =        "Analysis of Fixed, Reconfigurable, and Hybrid Devices
                 with Computational, Memory, {I/O}, \&
                 Realizable-Utilization Metrics",
  journal =      j-TRETS,
  volume =       "10",
  number =       "1",
  pages =        "2:1--2:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2888401",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:09 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The modern processor landscape is a varied and diverse
                 community. As such, developers need a way to quickly
                 and fairly compare various devices for use with
                 particular applications. This article expands the
                 authors' previously published computational-density
                 metrics and presents an analysis of a new generation of
                 various device architectures, including CPU, DSP, FPGA,
                 GPU, and hybrid architectures. Also, new memory metrics
                 are added to expand the existing suite of metrics to
                 characterize the memory resources on various processing
                 devices. Finally, a new relational metric, realizable
                 utilization (RU), is introduced, which quantifies the
                 fraction of the computational density metric that an
                 application achieves within an individual
                 implementation. The RU metric can be used to provide
                 valuable feedback to application developers and
                 architecture designers by highlighting the upper bound
                 on specific application optimization and providing a
                 quantifiable measure of theoretical and realizable
                 performance. Overall, the analysis in this article
                 quantifies the performance tradeoffs among the
                 architectures studied, the memory characteristics of
                 different device types, and the efficiency of device
                 architectures.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Chao:2016:DTM,
  author =       "Hung-Lin Chao and Sheng-Ya Tung and Pao-Ann Hsiung",
  title =        "Dynamic Task Mapping with Congestion Speculation for
                 Reconfigurable Network-on-Chip",
  journal =      j-TRETS,
  volume =       "10",
  number =       "1",
  pages =        "3:1--3:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2892633",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:09 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Network-on-Chip (NoC) has been proposed as a promising
                 communication architecture to replace the dedicated
                 interconnections and shared buses for future embedded
                 system platforms. In such a parallel platform, mapping
                 application tasks to the NoC is a key issue because it
                 affects throughput significantly due to the problem of
                 communication congestion. Increased communication
                 latency, low system performance, and low resource
                 utilization are some side-effects of a bad mapping.
                 Current mapping algorithms either do not consider link
                 utilizations or consider only the current utilizations.
                 Besides, to design an efficient NoC platform, mapping
                 task to computation nodes and scheduling communication
                 should be taken into consideration. In this work, we
                 propose an efficient algorithm for dynamic task mapping
                 with congestion speculation (DTMCS) that not only
                 includes the conventional application mapping, but also
                 further considers future traffic patterns based on the
                 link utilization. The proposed algorithm can reduce
                 overall congestion, instead of only improving the
                 current packet blocking situation. Our experiment
                 results have demonstrated that compared to the
                 state-of-the-art congestion-aware Path Load algorithm,
                 the proposed DTMCS algorithm can reduce up to 40.5\% of
                 average communication latency, while the maximal
                 communication latency can be reduced by up to 67.7\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{LeGal:2016:FSM,
  author =       "Bertrand {Le Gal} and Y{\'e}rom-David Bromberg and
                 Laurent R{\'e}veill{\`e}re and Jigar Solanki",
  title =        "A Flexible {SoC} and Its Methodology for Parser-Based
                 Applications",
  journal =      j-TRETS,
  volume =       "10",
  number =       "1",
  pages =        "4:1--4:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2939379",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:09 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Embedded systems are being increasingly network
                 interconnected. They are required to interact with
                 their environment through text-based protocol messages.
                 Parsing such messages is control dominated. The work
                 presented in this article attempts to accelerate
                 message parsers using a codesign-based approach. We
                 propose a generic architecture associated with an
                 automated design methodology that enables SoC/SoPC
                 system generation from high-level specifications of
                 message protocols. Experimental results obtained on a
                 Xilinx ML605 board show acceleration factors ranging
                 from four to 11. Both static and dynamic
                 reconfigurations of coprocessors are discussed and then
                 evaluated so as to reduce the system hardware
                 complexity.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Pang:2016:MKR,
  author =       "Yeyong Pang and Shaojun Wang and Yu Peng and Xiyuan
                 Peng and Nicholas J. Fraser and Philip H. W. Leong",
  title =        "A Microcoded Kernel Recursive Least Squares Processor
                 Using {FPGA} Technology",
  journal =      j-TRETS,
  volume =       "10",
  number =       "1",
  pages =        "5:1--5:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2950061",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:09 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Kernel methods utilize linear methods in a nonlinear
                 feature space and combine the advantages of both.
                 Online kernel methods, such as kernel recursive least
                 squares (KRLS) and kernel normalized least mean squares
                 (KNLMS), perform nonlinear regression in a recursive
                 manner, with similar computational requirements to
                 linear techniques. In this article, an architecture for
                 a microcoded kernel method accelerator is described,
                 and high-performance implementations of sliding-window
                 KRLS, fixed-budget KRLS, and KNLMS are presented. The
                 architecture utilizes pipelining and vectorization for
                 performance, and microcoding for reusability. The
                 design can be scaled to allow tradeoffs between
                 capacity, performance, and area. The design is compared
                 with a central processing unit (CPU), digital signal
                 processor (DSP), and Altera OpenCL implementations. In
                 different configurations on an Altera Arria 10 device,
                 our SW-KRLS implementation delivers floating-point
                 throughput of approximately 16 GFLOPs, latency of 5.5 $
                 \mu $ s, and energy consumption of $ 10^{- 4} $ J,
                 these being improvements over a CPU by factors of 12,
                 17, and 24, respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Tang:2016:AKM,
  author =       "Qing Y. Tang and Mohammed A. S. Khalid",
  title =        "Acceleration of {$k$}-Means Algorithm Using {Altera
                 SDK} for {OpenCL}",
  journal =      j-TRETS,
  volume =       "10",
  number =       "1",
  pages =        "6:1--6:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2964910",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:09 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "A K-means clustering algorithm involves partitioning
                 of data iteratively into k clusters. It is one of the
                 most popular data-mining algorithms [Wu et al. 2007],
                 and is widely used in other applications, such as image
                 processing and machine learning. However, k-means is
                 highly time-consuming when data or cluster size is
                 large. Traditionally, FPGAs have shown great promise
                 for accelerating computationally intensive algorithms,
                 but they are harder to use for acceleration if we rely
                 on traditional HDL-based design methods. The recent
                 introduction of Altera SDK for the OpenCL high-level
                 synthesis tool allows developers to utilize FPGA's
                 potential without long development periods and
                 extensive hardware knowledge. This article presents an
                 optimized implementation of a k-means clustering
                 algorithm on an FPGA using Altera SDK for OpenCL.
                 Performance and power consumption is measured with
                 various data, cluster, and dimension sizes. When
                 compared to state-of-the-art solutions, this
                 implementation supports larger cluster sizes, offers up
                 to 21x speed over a CPU and is more power efficient
                 than a GPU. Unlike previous implementations, it can
                 deliver consistently high throughput across large or
                 small feature dimensions given reasonable cluster sizes
                 and large enough data size.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Wong:2016:MCM,
  author =       "Henry Wong and Vaughn Betz and Jonathan Rose",
  title =        "Microarchitecture and Circuits for a {200 MHz}
                 Out-of-Order Soft Processor Memory System",
  journal =      j-TRETS,
  volume =       "10",
  number =       "1",
  pages =        "7:1--7:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2974022",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:09 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Although FPGAs have grown in capacity, FPGA-based soft
                 processors have grown very little because of the
                 difficulty of achieving higher performance in exchange
                 for area. Superscalar out-of-order processors promise
                 large performance gains, and the memory subsystem is a
                 key part of such a processor that must help supply
                 increased performance. In this article, we describe and
                 explore microarchitectural and circuit-level tradeoffs
                 in the design of such a memory system. We show the
                 significant instructions-per-cycle wins for providing
                 various levels of out-of-order memory access and memory
                 dependence speculation ($ 1.32 \times $ SPECint2000)
                 and for the addition of a second-level cache (another $
                 1.60 \times $ ). With careful microarchitecture and
                 circuit design, we also achieve a L1 translation
                 lookaside buffers and cache lookup with 29\% less logic
                 delay than the simpler Nios II/f memory system.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Rouhani:2016:ART,
  author =       "Bita Darvish Rouhani and Azalia Mirhoseini and Ebrahim
                 M. Songhori and Farinaz Koushanfar",
  title =        "Automated Real-Time Analysis of Streaming Big and
                 Dense Data on Reconfigurable Platforms",
  journal =      j-TRETS,
  volume =       "10",
  number =       "1",
  pages =        "8:1--8:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2974023",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:09 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "We propose SSketch, a novel automated framework for
                 efficient analysis of dynamic big data with dense
                 (non-sparse) correlation matrices on reconfigurable
                 platforms. SSketch targets streaming applications where
                 each data sample can be processed only once and storage
                 is severely limited. Our framework adaptively learns
                 from the stream of input data and updates a
                 corresponding ensemble of lower-dimensional data
                 structures, a.k.a., a sketch matrix. A new sketching
                 methodology is introduced that tailors the problem of
                 transforming the big data with dense correlations to an
                 ensemble of lower-dimensional subspaces such that it is
                 suitable for hardware-based acceleration performed by
                 reconfigurable hardware. The new method is scalable,
                 while it significantly reduces costly memory
                 interactions and enhances matrix computation
                 performance by leveraging coarse-grained parallelism
                 existing in the dataset. SSketch provides an automated
                 optimization methodology for creating the most accurate
                 data sketch for a given set of user-defined
                 constraints, including runtime and power as well as
                 platform constraints such as memory. To facilitate
                 automation, SSketch takes advantage of a
                 Hardware/Software (HW/SW) co-design approach: It
                 provides an Application Programming Interface that can
                 be customized for rapid prototyping of an arbitrary
                 matrix-based data analysis algorithm. Proof-of-concept
                 evaluations on a variety of visual datasets with more
                 than 11 million non-zeros demonstrate up to a 200-fold
                 speedup on our hardware-accelerated realization of
                 SSketch compared to a software-based deployment on a
                 general-purpose processor.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Bourge:2016:GEC,
  author =       "Alban Bourge and Olivier Muller and Fr{\'e}d{\'e}ric
                 Rousseau",
  title =        "Generating Efficient Context-Switch Capable Circuits
                 through Autonomous Design Flow",
  journal =      j-TRETS,
  volume =       "10",
  number =       "1",
  pages =        "9:1--9:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2996199",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:09 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Commercial off-the-shelf (COTS) Field-Programmable
                 Gate Arrays (FPGAs) are becoming increasingly powerful.
                 In addition to their huge hardware resources, they are
                 also integrated into complete systems on chips (SOCs),
                 e.g., in the latest Xilinx Zynq or Altera Stratix
                 platforms. However, cooperation between FPGAs and their
                 surroundings, and the flexibility of hardware task
                 management could still be improved. For instance,
                 mechanisms have yet to be automated to allow multi-user
                 approaches. A reconfigurable resource can be shared
                 between applications or users only if it has a
                 context-switch ability allowing applications to be
                 paused and resumed in response to system demands. Here,
                 we present a high-level synthesis (HLS) design flow
                 producing a context-switch-capable circuit. The design
                 flow manipulates the intermediate representation of an
                 HLS tool to build the context extraction mechanism and
                 to optimize performance for the circuit produced. The
                 method is based on efficient checkpoint selection and
                 insertion of a powerful scan-chain into the initial
                 circuit. This scan-chain can extract flip-flops or
                 memory content. Experiments with the system produced
                 show that it has a low hardware overhead for many
                 benchmark applications, and that the hardware added has
                 a negligible impact on application performance.
                 Comparisons with current standard methods highlight the
                 efficiency of our contributions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Cardoso:2017:ISS,
  author =       "Jo{\~a}o M. P. Cardoso and Cristina Silvano",
  title =        "Introduction to the Special Section on {FPL 2015}",
  journal =      j-TRETS,
  volume =       "10",
  number =       "2",
  pages =        "10:1--10:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3041224",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Dec 23 10:23:01 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Kim:2017:SSC,
  author =       "Jin Hee Kim and Jason H. Anderson",
  title =        "Synthesizable Standard Cell {FPGA} Fabrics Targetable
                 by the {Verilog}-to-Routing {CAD} Flow",
  journal =      j-TRETS,
  volume =       "10",
  number =       "2",
  pages =        "11:1--11:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3024063",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Dec 23 10:23:01 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In this article, we consider implementing
                 field-programmable gate arrays (FPGAs) using a standard
                 cell design methodology and present a framework for the
                 automated generation of synthesizable FPGA fabrics. The
                 open-source Verilog-to-Routing (VTR) FPGA architecture
                 evaluation framework [Rose et al. 2012] is extended to
                 generate synthesizable Verilog for its in-memory FPGA
                 architectural device model. The Verilog can
                 subsequently be synthesized into standard cells, placed
                 and routed using an ASIC design flow. A second
                 extension to VTR generates a configuration bitstream
                 for the FPGA, where the bitstream configures the FPGA
                 to realize a user-provided placed and routed design.
                 The proposed framework and methodology makes possible
                 the silicon implementation of a wide range of
                 VTR-modeled FPGA fabrics. In an experimental study,
                 area and timing-optimized FPGA implementations in 65nm
                 TSMC standard cells are compared to a 65nm Altera
                 commercial FPGA. In addition, we consider augmenting
                 the generic standard-cell library from TSMC with a
                 manually designed and laid-out FPGA-specific cell. We
                 demonstrate the utility of the custom cell in reducing
                 the area of the synthesized FPGA fabric.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Burovskiy:2017:EAH,
author = "Pavel Burovskiy and Paul Grigoras and Spencer Sherwin
and Wayne Luk",
title = "Efficient Assembly for High-Order Unstructured {FEM}
Meshes {(FPL 2015)}",
journal = j-TRETS,
volume = "10",
number = "2",
pages = "12:1--12:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3024064",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Dec 23 10:23:01 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "The Finite Element Method (FEM) is a common numerical
technique used for solving Partial Differential
Equations on large and unstructured domain geometries.
Numerical methods for FEM typically use algorithms and
data structures which exhibit an unstructured memory
access pattern. This makes acceleration of FEM on
Field-Programmable Gate Arrays using an efficient,
deeply pipelined architecture particularly challenging.
In this work, we focus on implementing and optimising a
vector assembly operation which, in the context of FEM,
induces the unstructured memory access. We propose a
dataflow architecture, graph-based theoretical model,
and design flow for optimising the assembly operation
for spectral/hp finite element method on reconfigurable
accelerators. We evaluate the proposed approach on two
benchmark meshes and show that the graph-theoretic
method of generating a static data access schedule
results in a significant improvement in resource
utilisation compared to prior work. This enables
supporting larger FEM meshes on FPGA than previously
possible.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "12",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Yang:2017:FSA,
author = "Hsin-Jung Yang and Kermin Fleming and Felix
Winterstein and Michael Adler and Joel Emer",
title = "{(FPL 2015) Scavenger}: Automating the Construction of
Application-Optimized Memory Hierarchies",
journal = j-TRETS,
volume = "10",
number = "2",
pages = "13:1--13:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3009971",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Dec 23 10:23:01 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "High-level abstractions separate algorithm design from
platform implementation, allowing programmers to focus
on algorithms while building complex systems. This
separation also provides system programmers and
compilers an opportunity to optimize platform services
on an application-by-application basis. In
field-programmable gate arrays (FPGAs), platform-level
malleability extends to the memory system: Unlike
general-purpose processors, in which memory hardware is
fixed at design time, the capacity, associativity, and
topology of FPGA memory systems may all be tuned to
improve application performance. Since application
kernels may only explicitly use few memory resources,
substantial memory capacity may be available to the
platform for use on behalf of the user program. In this
work, we present Scavenger, which utilizes spare
resources to construct program-optimized memories, and
we also perform an initial exploration of methods for
automating the construction of these
application-specific memory hierarchies. Although
exploiting spare resources can be beneficial,
na{\"\i}vely consuming all memory resources may cause
frequency degradation. To relieve timing pressure in
large block RAM (BRAM) structures, we provide
microarchitectural techniques to trade memory latency
for design frequency. We demonstrate, by examining a
set of benchmarks, that our scalable cache
microarchitecture achieves performance gains of 7\% to
74\% (with a 26\% geometric mean on average) over the
baseline cache microarchitecture when scaling the size
of first-level caches to the maximum.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "13",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Kapre:2017:HDR,
author = "Nachiket Kapre and Jan Gray",
title = "{Hoplite}: a Deflection-Routed Directional Torus {NoC}
for {FPGAs}",
journal = j-TRETS,
volume = "10",
number = "2",
pages = "14:1--14:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3027486",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Dec 23 10:23:01 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "We can design an FPGA-optimized lightweight
network-on-chip (NoC) router for flit-oriented
packet-switched communication that is an order of
magnitude smaller (in terms of LUTs and FFs) than
state-of-the-art FPGA overlay routers available today.
We present Hoplite, an efficient, lightweight, and fast
FPGA overlay NoC that is designed to be small and
compact by (1) using deflection routing instead of
buffered switching to eliminate expensive FIFO buffers
and (2) using a torus topology to reduce the cost of
switch crossbar. Buffering and crossbar implementation
complexities have traditionally limited speeds and
imposed heavy resource costs in conventional FPGA
overlay NoCs. We take care to exploit the fracturable
lookup tables (LUT) organization of the FPGA to further
improve the resource efficiency of mapping the
expensive crossbar multiplexers. Hoplite can outperform
classic, bidirectional, buffered mesh networks for
single-flit-oriented FPGA applications by as much as $
1.5 \times $ (best achievable throughputs for a $ 10
\times 10 $ system) or $ 2.5 \times $ (allocating same
amount of FPGA resources to both NoCs) for uniform
random traffic. When compared to buffered mesh
switches, FPGA-based deflection routers are $ \approx
3.5 \times $ smaller (HLS-generated switch) and $ 2.5
\times $ faster (clock period) for 32b payloads. In a
separate experiment, we hand-crafted an RTL version of
our switch with location constraints that requires only
60 LUTs and 100 FFs per router and runs at 2.9ns. We
conduct additional layout experiments on modern Xilinx
and Altera FPGAs and demonstrate wide-channel
chip-spanning layouts that run in excess of 300MHz
while consuming 10--15\% of overall chip resources. We
also demonstrate a clustered RISC-V multiprocessor
organization that uses Hoplite to help deliver the high
processing throughputs of the FPGA architecture to user
applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "14",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Leong:2017:FYF,
author = "Philip H. W. Leong and Hideharu Amano and Jason
Anderson and Koen Bertels and Jo{\~a}o M. P. Cardoso
and Oliver Diessel and Guy Gogniat and Mike Hutton and
Junkyu Lee and Wayne Luk and Patrick Lysaght and Marco
Platzner and Viktor K. Prasanna and Tero Rissa and
Cristina Silvano and Hayden Kwok-Hay So and Yu Wang",
title = "The First 25 Years of the {FPL} Conference:
Significant Papers",
journal = j-TRETS,
volume = "10",
number = "2",
pages = "15:1--15:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/2996468",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Dec 23 10:23:01 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "A summary of contributions made by significant papers
from the first 25 years of the Field-Programmable Logic
and Applications conference (FPL) is presented. The 27
papers chosen represent those which have most strongly
influenced theory and practice in the field.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "15",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Takano:2017:PSA,
author = "Shigeyuki Takano",
title = "Performance Scalability of Adaptive Processor
Architecture",
journal = j-TRETS,
volume = "10",
number = "2",
pages = "16:1--16:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3007902",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Dec 23 10:23:01 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "In this article, we evaluate the performance
scalability of architectures called adaptive
processors, which dynamically configure an
application-specific pipelined datapath and perform a
data-flow streaming execution. Previous works have
examined the basics of the following: (1) a
computational model that supports the swap-in/out of a
partial datapath---namely, a virtual hardware is realized
by hardware, without a host processor and its software;
(2) an architecture that has shown a minimum pipeline
requirement and a minimum component requirement; and
(3) the characteristics of the execution phase and a
stack shift that realizes the swap-in/out. However,
these works did not explore the design space,
particularly with respect to the following: (1) the
clock cycle time on the adaptive processor, which must
depend on a wire delay that is primarily used for the
global communication of requests, acknowledgments,
acquirements, releases, and so forth, and (2) a revised
control system that can handle the out-of-order
acknowledgment and in-order acquirement that guarantee
the correct datapath configuration with a conditional
branch for the configurations. This article explores
the scaling of the ALU resources versus pipelining of
the wires.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "16",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Liu:2017:TOF,
author = "Zhiqiang Liu and Yong Dou and Jingfei Jiang and Jinwei
Xu and Shijie Li and Yongmei Zhou and Yingnan Xu",
title = "Throughput-Optimized {FPGA} Accelerator for Deep
Convolutional Neural Networks",
journal = j-TRETS,
volume = "10",
number = "3",
pages = "17:1--17:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3079758",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Dec 23 10:23:02 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Deep convolutional neural networks (CNNs) have gained
great success in various computer vision applications.
State-of-the-art CNN models for large-scale
applications are computation intensive and memory
expensive and, hence, are mainly processed on
high-performance processors like server CPUs and GPUs.
However, there is an increasing demand of high-accuracy
or real-time object detection tasks in large-scale
clusters or embedded systems, which requires
energy-efficient accelerators because of the green
computation requirement or the limited battery
restriction. Due to the advantages of energy efficiency
and reconfigurability, Field-Programmable Gate Arrays
(FPGAs) have been widely explored as CNN accelerators.
In this article, we present an in-depth analysis of
computation complexity and the memory footprint of each
CNN layer type. Then a scalable parallel framework is
proposed that exploits four levels of parallelism in
hardware acceleration. We further put forward a
systematic design space exploration methodology to
search for the optimal solution that maximizes
accelerator throughput under the FPGA constraints such
as on-chip memory, computational resources, external
memory bandwidth, and clock frequency. Finally, we
demonstrate the methodology by optimizing three
representative CNNs (LeNet, AlexNet, and VGG-S) on a
Xilinx VC709 board. The average performance of the
three accelerators is 424.7, 445.6, and 473.4GOP/s
under 100MHz working frequency, which outperforms the
CPU and previous work significantly.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "17",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Ueno:2017:BCF,
author = "Tomohiro Ueno and Kentaro Sano and Satoru Yamamoto",
title = "Bandwidth Compression of Floating-Point Numerical Data
Streams for {FPGA}-Based High-Performance Computing",
journal = j-TRETS,
volume = "10",
number = "3",
pages = "18:1--18:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3053688",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Dec 23 10:23:02 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Although computational performance is often limited by
insufficient bandwidth to/from an external memory, it
is not easy to physically increase off-chip memory
bandwidth. In this study, we propose a hardware-based
bandwidth compression technique that can be applied to
field-programmable gate array-- (FPGA) based
high-performance computation with a logically wider
effective memory bandwidth. Our proposed hardware
approach can boost the performance of FPGA-based stream
computations by applying a data compression technique
to effectively transfer more data streams. To apply
this data compression technique to bandwidth
compression via hardware, several requirements must
first be satisfied, including an acceptable level of
compression performance and a sufficiently small
hardware footprint. Our proposed hardware-based
bandwidth compressor utilizes an efficient
prediction-based data compression algorithm. Moreover,
we propose a multichannel serializer and deserializer
that enable applications to use multiple channels of
computational data with the bandwidth compression. The
serializer encodes compressed data blocks of multiple
channels into a data stream, which is efficiently
written to an external memory. Based on preliminary
evaluation, we define an encoding format considering
both high compression ratio and small hardware area. As
a result, we demonstrate that our area saving bandwidth
compressor increases performance of an FPGA-based fluid
dynamics simulation by deploying more processing
elements to exploit spatial parallelism with the
enhanced memory bandwidth.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "18",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Laforest:2017:MCM,
author = "Charles Eric LaForest and Jason H. Anderson",
title = "Microarchitectural Comparison of the {MXP} and
{Octavo} Soft-Processor {FPGA} Overlays",
journal = j-TRETS,
volume = "10",
number = "3",
pages = "19:1--19:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3053679",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Dec 23 10:23:02 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Field-Programmable Gate Arrays (FPGAs) can yield
higher performance and lower power than software
solutions on CPUs or GPUs. However, designing with
FPGAs requires specialized hardware design skills and
hours-long CAD processing times. To reduce and
accelerate the design effort, we can implement an
overlay architecture on the FPGA, on which we then more
easily construct the desired system but at a large cost
in performance and area relative to a direct FPGA
implementation. In this work, we compare the
micro-architecture, performance, and area of two
soft-processor overlays: the Octavo multi-threaded
soft-processor and the MXP soft vector processor. To
measure the area and performance penalties of these
overlays relative to the underlying FPGA hardware, we
compare direct FPGA implementations of the
micro-benchmarks written in C synthesized with the
LegUp HLS tool and also written in the Verilog HDL.
Overall, Octavo's higher operating frequency and MXP's
more efficient code execution results in similar
performance from both, within an order of magnitude of
direct FPGA implementations, but with a penalty of an
order of magnitude greater area.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "19",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Gu:2017:IRF,
author = "Chongyan Gu and Neil Hanley and M{\'a}ire O'Neill",
title = "Improved Reliability of {FPGA}-Based {PUF}
Identification Generator Design",
journal = j-TRETS,
volume = "10",
number = "3",
pages = "20:1--20:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3053681",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Dec 23 10:23:02 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Physical unclonable functions (PUFs), a form of
physical security primitive, enable digital identifiers
to be extracted from devices, such as field
programmable gate arrays (FPGAs). Many PUF
implementations have been proposed to generate these
unique $n$-bit binary strings. However, they often offer
insufficient uniqueness and reliability when
implemented on FPGAs and can consume excessive
resources. To address these problems, in this article
we present an efficient, lightweight, and scalable PUF
identification (ID) generator circuit that offers a
compact design with good uniqueness and reliability
properties and is specifically designed for FPGAs. A
novel post-characterisation methodology is also
proposed that improves the reliability of a PUF without
the need for any additional hardware resources.
Moreover, the proposed post-characterisation method can
be generally used for any FPGA-based PUF designs. The
PUF ID generator consumes 8.95\% of the hardware
resources of a low-cost Xilinx Spartan-6 LX9 FPGA and
0.81\% of a Xilinx Artix-7 FPGA. Experimental results
show good uniqueness, reliability, and uniformity with
no occurrence of bit-aliasing. In particular, the
reliability of the PUF is close to 100\% over an
environmental temperature range of 25${}^\circ $C to
70${}^\circ $C with $ \pm 10 \% $ variation in the supply
voltage.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "20",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Prost-Boucle:2017:EVF,
author = "Adrien Prost-Boucle and Fr{\'e}d{\'e}ric P{\'e}trot
and Vincent Leroy and Hande Alemdar",
title = "Efficient and Versatile {FPGA} Acceleration of Support
Counting for Stream Mining of Sequences and Frequent
Itemsets",
journal = j-TRETS,
volume = "10",
number = "3",
pages = "21:1--21:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3027485",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Dec 23 10:23:02 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Stream processing has become extremely popular for
analyzing huge volumes of data for a variety of
applications, including IoT, social networks, retail,
and software logs analysis. Streams of data are
produced continuously and are mined to extract patterns
characterizing the data. A class of data mining
algorithm, called generate-and-test, produces a set of
candidate patterns that are then evaluated over data.
The main challenges of these algorithms are to achieve
high throughput, low latency, and reduced power
consumption. In this article, we present a novel
power-efficient, fast, and versatile hardware
architecture whose objective is to monitor a set of
target patterns to maintain their frequency over a
stream of data. This accelerator can be used to
accelerate data-mining algorithms, including itemsets
and sequences mining. The massive fine-grain
reconfiguration capability of field-programmable gate
array (FPGA) technologies is ideal to implement the
high number of pattern-detection units needed for these
intensive data-mining applications. We have thus
designed and implemented an IP that features
high-density FPGA occupation and high working
frequency. We provide detailed description of the IP
internal micro-architecture and its actual
implementation and optimization for the targeted FPGA
resources. We validate our architecture by developing a
co-designed implementation of the Apriori Frequent
Itemset Mining (FIM) algorithm, and perform numerous
experiments against existing hardware and software
solutions. We demonstrate that FIM hardware
acceleration is particularly efficient for large and
low-density datasets (i.e., long-tailed datasets). Our
IP reaches a data throughput of 250 million items/s and
monitors up to 11.6k patterns simultaneously, on a
prototyping board that overall consumes 24W in the
worst case. Furthermore, our hardware accelerator
remains generic and can be integrated to other generate
and test algorithms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "21",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Tili:2017:RPG,
author = "Ilian Tili and Kalin Ovtcharov and J. Gregory
Steffan",
title = "Reducing the Performance Gap between Soft Scalar
{CPUs} and Custom Hardware with {TILT}",
journal = j-TRETS,
volume = "10",
number = "3",
pages = "22:1--22:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3079757",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Dec 23 10:23:02 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "By using resource sharing field-programmable gate
array (FPGA) compute engines, we can reduce the
performance gap between soft scalar CPUs and
resource-intensive custom datapath designs. This
article demonstrates that Thread- and Instruction-Level
parallel Template architecture (TILT), a programmable
FPGA-based horizontally microcoded compute engine
designed to highly utilize floating point (FP)
functional units (FUs), can improve significantly the
average throughput of eight FP-intensive applications
compared to a soft scalar CPU (similar to a FP-extended
Nios). For eight benchmark applications, we show that:
(i) a base TILT configuration having a single instance
for each FU type can improve the performance over a
soft scalar CPU by 15.8 $ \times $ , while requiring on
average 26\% of the custom datapaths' area; (ii)
selectively increasing the number of FUs can more than
double TILT's average throughput, reducing the
custom-datapath-throughput-gap from 576 $ \times $ to
14 $ \times $ ; and (iii) replicated instances of the
most computationally dense TILT configuration that fit
within the area of each custom datapath design can
reduce the gap to 8.27 $ \times $ , while replicated
instances of application-tuned configurations of TILT
can reduce the custom-datapath-throughput-gap to an
average of 5.22 $ \times $ , and up to 3.41 $ \times $
for the Matrix Multiply benchmark. Last, we present
methods for design space reduction, and we correctly
predict the computationally densest design for seven
out of eight benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "22",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Wulf:2017:OFP,
author = "Nicholas Wulf and Alan D. George and Ann Gordon-Ross",
title = "Optimizing {FPGA} Performance, Power, and
Dependability with Linear Programming",
journal = j-TRETS,
volume = "10",
number = "3",
pages = "23:1--23:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3079756",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Dec 23 10:23:02 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Field-programmable gate arrays (FPGA) are an
increasingly attractive alternative to traditional
microprocessor-based computing architectures in
extreme-computing domains, such as aerospace and
supercomputing. FPGAs offer several resource types that
offer different tradeoffs between speed, power, and
area, which make FPGAs highly flexible for varying
application computational requirements. However, since
an application's computational operations can map to
different resource types, a major challenge in
leveraging resource-diverse FPGAs is determining the
optimal distribution of these operations across the
device's available resources for varying FPGA devices,
resulting in an extremely large design space. In order
to facilitate fast design-space exploration, this
article presents a method based on linear programming
(LP) that determines the optimal operation distribution
for a particular device and application with respect to
performance, power, or dependability metrics. Our LP
method is an effective tool for exploring early designs
by quickly analyzing thousands of FPGAs to determine
the best FPGA devices and operation distributions,
which significantly reduces design time. We demonstrate
our LP method's effectiveness with two case studies
involving dot-product and distance-calculation kernels
on a range of Virtex-5 FPGAs. Results show that our LP
method selects optimal distributions of operations to
within an average of 4\% of actual values.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "23",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Riebler:2017:EBB,
author = "Heinrich Riebler and Michael Lass and Robert
Mittendorf and Thomas L{\"o}cke and Christian Plessl",
title = "Efficient Branch and Bound on {FPGAs} Using Work
Stealing and Instance-Specific Designs",
journal = j-TRETS,
volume = "10",
number = "3",
pages = "24:1--24:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3053687",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Dec 23 10:23:02 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Branch and bound (B\&B) algorithms structure the search
space as a tree and eliminate infeasible solutions
early by pruning subtrees that cannot lead to a valid
or optimal solution. Custom hardware designs
significantly accelerate the execution of these
algorithms. In this article, we demonstrate a
high-performance B\&B implementation on FPGAs. First, we
identify general elements of B\&B algorithms and
describe their implementation as a finite state
machine. Then, we introduce workers that autonomously
cooperate using work stealing to allow parallel
execution and full utilization of the target FPGA.
Finally, we explore advantages of instance-specific
designs that target a specific problem instance to
improve performance. We evaluate our concepts by
applying them to a branch and bound problem, the
reconstruction of corrupted AES keys obtained from
cold-boot attacks. The evaluation shows that our work
stealing approach is scalable with the available
resources and provides speedups proportional to the
number of workers. Instance-specific designs allow us
to achieve an overall speedup of 47 $ \times $ compared
to the fastest implementation of AES key reconstruction
so far. Finally, we demonstrate how instance-specific
designs can be generated just-in-time such that the
provided speedups outweigh the additional time required
for design synthesis.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "24",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Gerlein:2017:NCA,
author = "Eduardo A. Gerlein and T. M. McGinnity and Ammar
Belatreche and Sonya Coleman",
title = "Network on Chip Architecture for Multi-Agent Systems
in {FPGA}",
journal = j-TRETS,
volume = "10",
number = "4",
pages = "25:1--25:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3121112",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Dec 29 07:28:53 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "A system of interacting agents is, by definition, very
demanding in terms of computational resources. Although
multi-agent systems have been used to solve complex
problems in many areas, it is usually very difficult to
perform large-scale simulations in their targeted
serial computing platforms. Reconfigurable hardware, in
particular Field Programmable Gate Arrays devices, have
been successfully used in High Performance Computing
applications due to their inherent flexibility, data
parallelism, and algorithm acceleration capabilities.
Indeed, reconfigurable hardware seems to be the next
logical step in the agency paradigm, but only a few
attempts have been successful in implementing
multi-agent systems in these platforms. This article
discusses the problem of inter-agent communications in
Field Programmable Gate Arrays. It proposes a
Network-on-Chip in a hierarchical star topology to
enable agents' transactions through message
broadcasting using the Open Core Protocol as an
interface between hardware modules. A customizable
router microarchitecture is described and a multi-agent
system is created to simulate and analyse message
exchanges in a generic heavy traffic load agent-based
application. Experiments have shown a throughput of
1.6Gbps per port at 100MHz without packet loss and
seamless scalability characteristics.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "25",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Fraser:2017:FIK,
author = "Nicholas J. Fraser and Junkyu Lee and Duncan J. M.
Moss and Julian Faraone and Stephen Tridgell and Craig
T. Jin and Philip H. W. Leong",
title = "{FPGA} Implementations of Kernel Normalised Least Mean
Squares Processors",
journal = j-TRETS,
volume = "10",
number = "4",
pages = "26:1--26:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3106744",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Dec 29 07:28:53 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Kernel adaptive filters (KAFs) are online machine
learning algorithms which are amenable to highly
efficient streaming implementations. They require only
a single pass through the data and can act as universal
approximators, i.e. approximate any continuous function
with arbitrary accuracy. KAFs are members of a family
of kernel methods which apply an implicit non-linear
mapping of input data to a high dimensional feature
space, permitting learning algorithms to be expressed
entirely as inner products. Such an approach avoids
explicit projection into the feature space, enabling
computational efficiency. In this paper, we propose the
first fully pipelined implementation of the kernel
normalised least mean squares algorithm for regression.
Independent training tasks necessary for hyperparameter
optimisation fill pipeline stages, so no stall cycles
to resolve dependencies are required. Together with
other optimisations to reduce resource utilisation and
latency, our core achieves 161 GFLOPS on a Virtex 7
XC7VX485T FPGA for a floating point implementation and
211 GOPS for fixed point. Our PCI Express based
floating-point system implementation achieves 80\% of
the core's speed, this being a speedup of 10$ \times $
over an optimised implementation on a desktop processor
and 2.66$ \times $ over a GPU.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "26",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Chu:2017:FCA,
author = "Thiem Van Chu and Shimpei Sato and Kenji Kise",
title = "Fast and Cycle-Accurate Emulation of Large-Scale
Networks-on-Chip Using a Single {FPGA}",
journal = j-TRETS,
volume = "10",
number = "4",
pages = "27:1--27:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3151758",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Dec 29 07:28:53 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Modeling and simulation/emulation play a major role in
research and development of novel Networks-on-Chip
(NoCs). However, conventional software simulators are
so slow that studying NoCs for emerging many-core
systems with hundreds to thousands of cores is
challenging. State-of-the-art FPGA-based NoC emulators
have shown great potential in speeding up the NoC
simulation, but they cannot emulate large-scale NoCs
due to the FPGA capacity constraints. Moreover,
emulating large-scale NoCs under synthetic workloads on
FPGAs typically requires a large amount of memory and
thus involves the use of off-chip memory, which makes
the overall design much more complicated and may
substantially degrade the emulation speed. This article
presents methods for fast and cycle-accurate emulation
of NoCs with up to thousands of nodes using a single
FPGA. We first describe how to emulate a NoC under a
synthetic workload using only FPGA on-chip memory
(BRAMs). We next present a novel use of time-division
multiplexing where BRAMs are effectively used for
emulating a network using a small number of nodes,
thereby overcoming the FPGA capacity constraints. We
propose methods for emulating both direct and indirect
networks, focusing on the commonly used meshes and
fat-trees ($k$-ary $n$-trees). This is different from
prior work that considers only direct networks. Using
the proposed methods, we build a NoC emulator, called
FNoC, and demonstrate the emulation of some mesh-based
and fat-tree-based NoCs with canonical router
architectures. Our evaluation results show that (1) the
size of the largest NoC that can be emulated depends on
only the FPGA on-chip memory capacity; (2) a mesh-based
NoC with 16,384 nodes (128$ \times $128 NoC) and a
fat-tree-based NoC with 6,144 switch nodes and 4,096
terminal nodes (4-ary 6-tree NoC) can be emulated using
a single Virtex-7 FPGA; and (3) when emulating these
two NoCs, we achieve, respectively, 5,047$ \times $ and
232$ \times $ speedups over BookSim, one of the most
widely used software-based NoC simulators, while
maintaining the same level of accuracy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "27",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Yoshimi:2017:PPJ,
author = "Masato Yoshimi and Yasin Oge and Tsutomu Yoshinaga",
title = "Pipelined Parallel Join and Its {FPGA}-Based
Acceleration",
journal = j-TRETS,
volume = "10",
number = "4",
pages = "28:1--28:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3079759",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Dec 29 07:28:53 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "A huge amount of data is being generated and
accumulated in data centers, which leads to an
important increase in the required energy consumption
to analyze these data. Thus, we must consider the
redesign of current computer systems architectures to
be more friendly to applications based on distributed
algorithms that require a high data transfer rate.
Novel computer architectures that introduce dedicated
accelerators to enable near-data processing have been
discussed and developed for high-speed big-data
analysis. In this work, we propose a computer system
with an FPGA-based accelerator, namely,
interconnected-FPGAs, which offers two advantages: (1)
direct data transmission and (2) offloading computation
into data-flow in the FPGA. In this article, we
demonstrate the capability of the proposed
interconnected-FPGAs system to accelerate join
operations in a relational database. We developed a new
parallel join algorithm, PPJoin, targeted to big-data
analysis in a shared-nothing architecture. PPJoin is an
extended version of the NUMA-based parallel join
algorithm, created by overlapping computation by
multicore processors and data communication. The data
communication between computational nodes can be
accelerated by direct data transmission without passing
through the main memory of the hosts. To confirm the
performance of the PPJoin algorithm and its
acceleration process using an interconnected-FPGA
platform, we evaluated a simple query for large tables.
Additionally, to support availability, we also
evaluated the actual benchmark query. Our evaluation
results confirm that the PPJoin algorithm is faster
than a software-based query engine by 1.5--5 times.
Moreover, we experimentally confirmed that the direct
data transmission by interconnected FPGAs reduces
computational time around 20\% for PPJoin.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "28",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Fabry:2017:ERA,
author = "Pieter Fabry and David Thomas",
title = "Efficient Reconfigurable Architecture for Pricing
Exotic Options",
journal = j-TRETS,
volume = "10",
number = "4",
pages = "29:1--29:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3158228",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Dec 29 07:28:53 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "This article presents a new method for Monte Carlo
(MC) option pricing using field-programmable gate
arrays (FPGAs), which use a discrete-space random walk
over a binomial lattice, rather than the continuous
space-walks used by existing approaches. The underlying
hypothesis is that the discrete-space walk will
significantly reduce the area needed for each MC
engine, and the resulting increase in parallelisation
and raw performance outweighs any accuracy losses
introduced by the discretisation. Experimental results
support this hypothesis, showing that for a given MC
simulation size, there is no significant loss in
accuracy by using a discrete space model for the
path-dependent exotic financial options. Analysis of
the binomial simulation model shows that only
limited-precision fixed-point arithmetic is needed, and
also shows that pairs of MC kernels are able to share
RAM resources. When using realistic constraints on
pricing problems, it was found that the size of a
discrete-space MC engine can be kept to 370 Flip-Flops
and 233 Lookup Tables, allowing up to 3,000
variance-reduced MC cores in one FPGA. The combination
of a highly parallelisable architecture and
model-specific optimisations means that the binomial
pricing technique allows for a 50$ \times $ improvement
in throughput compared to existing FPGA approaches,
without any reduction in accuracy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "29",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Bakos:2018:ISS,
author = "Jason D. Bakos",
title = "Introduction to the Special Section on {FCCM'16}",
journal = j-TRETS,
volume = "11",
number = "1",
pages = "1e:1--1e:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3183572",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:42:59 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "1e",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Wong:2018:HPI,
author = "Henry Wong and Vaughn Betz and Jonathan Rose",
title = "High-Performance Instruction Scheduling Circuits for
Superscalar Out-of-Order Soft Processors",
journal = j-TRETS,
volume = "11",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3093741",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:42:59 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Soft processors have a role to play in simplifying
field-programmable gate array (FPGA) application design
as they can be deployed only when needed, and it is
easier to write and debug single-threaded software code
than create hardware. The breadth of this second role
increases when the performance of the soft processor
increases, yet the sophisticated out-of-order
superscalar approaches that arrived in the mid-1990s
are not employed, despite their area cost now being
easily tolerable. In this article, we take an important
step toward out-of-order execution in soft processors
by exploring instruction scheduling in an FPGA
substrate. This differs from the hard-processor design
problem because the logic substrate is restricted to
LUTs, whereas hard processor scheduling circuits employ
CAM and wired-OR structures to great benefit. We
discuss both circuit and microarchitectural trade-offs
and compare three circuit structures for the scheduler,
including a new structure called a fused-logic matrix
scheduler. Using our optimized circuits, we show that
four-issue distributed schedulers with up to 54 entries
can be built with the same cycle time as the commercial
Nios II/f soft processor (240MHz). This careful design
has the potential to significantly increase both the
IPC and raw compute performance of a soft processor,
compared to current commercial soft processors.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "1",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Davis:2018:KHA,
author = "James J. Davis and Eddie Hung and Joshua M. Levine and
Edward A. Stott and Peter Y. K. Cheung and George A.
Constantinides",
title = "{KAPow}: High-Accuracy, Low-Overhead Online Per-Module
Power Estimation for {FPGA} Designs",
journal = j-TRETS,
volume = "11",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3129789",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:42:59 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "In an FPGA system-on-chip design, it is often
insufficient to merely assess the power consumption of
the entire circuit by compile-time estimation or
runtime power measurement. Instead, to make better
decisions, one must understand the power consumed by
each module in the system. In this work, we combine
measurements of register-level switching activity and
system-level power to build an adaptive online model
that produces live breakdowns of power consumption
within the design. Online model refinement avoids
time-consuming characterization while also allowing the
model to track long-term operating condition changes.
Central to our method is an automated flow that selects
signals predicted to be indicative of high power
consumption, instrumenting them for monitoring. We
named this technique KAPow, for `K'ounting Activity for
Power estimation, which we show to be accurate and to
have low overheads across a range of representative
benchmarks. We also propose a strategy allowing for the
identification and subsequent elimination of counters
found to be of low significance at runtime, reducing
algorithmic complexity without sacrificing significant
accuracy. Finally, we demonstrate an application
example in which a module-level power breakdown can be
used to determine an efficient mapping of tasks to
modules and reduce system-wide power consumption by up
to 7\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "2",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Giesen:2018:COS,
author = "Hans Giesen and Benjamin Gojman and Raphael Rubin and
Ji Kim and Andr{\'e} DeHon",
title = "Continuous Online Self-Monitoring Introspection
Circuitry for Timing Repair by Incremental
Partial-Reconfiguration {(COSMIC TRIP)}",
journal = j-TRETS,
volume = "11",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3158229",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:42:59 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "We show that continuously monitoring on-chip delays at
the LUT-to-LUT link level during operation allows a
field-programmable gate array to detect and self-adapt
to aging and environmental timing effects. Using a
lightweight ($ < 4 \% $ added area) mechanism for
monitoring transition timing, a Difference Detector
with First-Fail Latch, we can estimate the timing
margin on circuits and identify the individual links
that have degraded and whose delay is determining the
worst-case circuit delay. Combined with
Choose-Your-own-Adventure precomputed, fine-grained
repair alternatives, we introduce a strategy for rapid,
in-system incremental repair of links with degraded
timing. We show that these techniques allow us to
respond to a single aging event in less than 190ms for
the toronto20 benchmarks. The result is a step toward
systems where adaptive reconfiguration on the
time-scale of seconds is viable and beneficial.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "3",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Zhao:2018:FGM,
author = "Zhuoran Zhao and Nguyen T. H. Nguyen and Dimitris
Agiakatsikas and Ganghee Lee and Ediz Cetin and Oliver
Diessel",
title = "Fine-Grained Module-Based Error Recovery in
{FPGA}-Based {TMR} Systems",
journal = j-TRETS,
volume = "11",
number = "1",
pages = "4:1--4:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3173549",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:42:59 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Space processing applications deployed on SRAM-based
Field Programmable Gate Arrays (FPGAs) are vulnerable
to radiation-induced Single Event Upsets (SEUs).
Compared with the well-known SEU mitigation
solution---Triple Modular Redundancy (TMR) with
configuration memory scrubbing---TMR with module-based
error recovery (MER) is notably more energy efficient
and responsive in repairing soft-errors in the system.
Unfortunately, TMR-MER systems also need to resort to
scrubbing when errors occur between sub-components,
such as in interconnection nets, which are not
recovered by MER. This article addresses this problem
by proposing a fine-grained module-based error recovery
technique, which can localize and correct errors that
classic MER fails to do without additional system
hardware. We evaluate our proposal via fault-injection
campaigns on three types of circuits implemented in
Xilinx 7-Series devices. With respect to scrubbing, we
observed reductions in the mean time to repair
configuration memory errors of between 48.5\% and
89.4\%, while reductions in energy used recovering from
configuration memory errors were estimated at between
77.4\% and 96.1\%. These improvements result in higher
reliability for systems employing TMR with fine-grained
reconfiguration than equivalent systems relying on
scrubbing for configuration error recovery.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "4",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{AlKadi:2018:GPC,
author = "Muhammed {Al Kadi} and Benedikt Janssen and Jones Yudi
and Michael Huebner",
title = "General-Purpose Computing with Soft {GPUs} on
{FPGAs}",
journal = j-TRETS,
volume = "11",
number = "1",
pages = "5:1--5:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3173548",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:42:59 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/python.bib;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Using field-programmable gate arrays (FPGAs) as a
substrate to deploy soft graphics processing units
(GPUs) would enable offering the FPGA compute power in
a very flexible GPU-like tool flow.
Application-specific adaptations like selective
hardening of floating-point operations and instruction
set subsetting would mitigate the high area and power
demands of soft GPUs. This work explores the
capabilities and limitations of soft General Purpose
Computing on GPUs (GPGPU) for both fixed- and floating
point arithmetic. For this purpose, we have developed
FGPU: a configurable, scalable, and portable GPU
architecture designed especially for FPGAs. FGPU is
open-source and implemented entirely in RTL. It can be
programmed in OpenCL and controlled through a Python
API. This article introduces its hardware architecture
as well as its tool flow. We evaluated the proposed
GPGPU approach against multiple other solutions. In
comparison to homogeneous Multi-Processor
System-On-Chips (MPSoCs), we found that using a soft
GPU is a Pareto-optimal solution regarding throughput
per area and energy consumption. On average, FGPU has a
2.9$ \times $ better compute density and 11.2$ \times $
less energy consumption than a single MicroBlaze
processor when computing in IEEE-754 floating-point
format. An average speedup of about 4$ \times $ over
the ARM Cortex-A9 supported with the NEON vector
co-processor has been measured for fixed- or
floating-point benchmarks. In addition, the biggest
FGPU cores we could implement on a Xilinx Zynq-7000
System-On-Chip (SoC) can deliver similar performance to
equivalent implementations with High-Level Synthesis
(HLS).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "5",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Tatsumura:2018:EFM,
author = "Kosuke Tatsumura and Sadegh Yazdanshenas and Vaughn
Betz",
title = "Enhancing {FPGAs} with Magnetic Tunnel Junction-Based
Block {RAMs}",
journal = j-TRETS,
volume = "11",
number = "1",
pages = "6:1--6:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3154425",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:42:59 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "While plentiful on-chip memory is necessary for many
designs to fully utilize an FPGA's computational
capacity, SRAM scaling is becoming more difficult
because of increasing device variation. An alternative
is to build FPGA block RAM (BRAM) from magnetic tunnel
junctions (MTJ), as this emerging embedded memory has a
small cell size, low energy usage, and good
scalability. We conduct a detailed comparison study of
SRAM and MTJ BRAMs that includes cell designs that are
robust with device variation, transistor-level design
and optimization of all the required BRAM-specific
circuits, and variation-aware simulation at the 22nm
node. At a 256Kb block size, MTJ-BRAM is 3.06$ \times $
denser and 55\% more energy efficient and its $ F_{\rm max} $
is 274MHz, which is adequate for most FPGA system clock
domains. We also detail further enhancements that allow
these 256 Kb MTJ BRAMs to operate at a higher speed of
353MHz for the streaming FIFOs, which are very common
in FPGA designs and describe how the non-volatility of
MTJ BRAM enables novel on-chip configuration and
power-down modes. For a RAM architecture similar to the
latest commercial FPGAs, MTJ-BRAMs could expand FPGA
memory capacity by 2.95$ \times $ with no die size
increase.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "6",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Stewart:2018:RPI,
author = "Robert Stewart and Kirsty Duncan and Greg Michaelson
and Paulo Garcia and Deepayan Bhowmik and Andrew
Wallace",
title = "{RIPL}: a Parallel Image Processing Language for
{FPGAs}",
journal = j-TRETS,
volume = "11",
number = "1",
pages = "7:1--7:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3180481",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:42:59 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "Specialized FPGA implementations can deliver higher
performance and greater power efficiency than embedded
CPU or GPU implementations for real-time image
processing. Programming challenges limit their wider
use, because the implementation of FPGA architectures
at the register transfer level is time consuming and
error prone. Existing software languages supported by
high-level synthesis (HLS), although providing a
productivity improvement, are too general purpose to
generate efficient hardware without the use of
hardware-specific code optimizations. Such
optimizations leak hardware details into the
abstractions that software languages are there to
provide, and they require knowledge of FPGAs to
generate efficient hardware, such as by using language
pragmas to partition data structures across memory
blocks. This article presents a thorough account of the
Rathlin image processing language (RIPL), a high-level
image processing domain-specific language for FPGAs. We
motivate its design, based on higher-order algorithmic
skeletons, with requirements from the image processing
domain. RIPL's skeletons suffice to elegantly describe
image processing stencils, as well as recursive
algorithms with nonlocal random access patterns. At its
core, RIPL employs a dataflow intermediate
representation. We give a formal account of the
compilation scheme from RIPL skeletons to static and
cyclostatic dataflow models to describe their data
rates and static scheduling on FPGAs. RIPL compares
favorably to the Vivado HLS OpenCV library and C++
compiled with Vivado HLS. RIPL achieves between 54 and
191 frames per second (FPS) at 100MHz for four
synthetic benchmarks, faster than HLS OpenCV in three
cases. Two real-world algorithms are implemented in
RIPL: visual saliency and mean shift segmentation. For
the visual saliency algorithm, RIPL achieves 71 FPS
compared to optimized C++ at 28 FPS. RIPL is also
concise, being 5x shorter than C++ and 111x shorter
than an equivalent direct dataflow implementation. For
mean shift segmentation, RIPL achieves 7 FPS compared
to optimized C++ on 64 CPU cores at 1.1, and RIPL is
10x shorter than the direct dataflow FPGA
implementation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "7",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Khan:2018:EAM,
author = "Farheen Fatima Khan and Andy Ye",
title = "An Evaluation on the Accuracy of the Minimum-Width
Transistor Area Models in Ranking the Layout Area of
{FPGA} Architectures",
journal = j-TRETS,
volume = "11",
number = "1",
pages = "8:1--8:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3182394",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:42:59 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "This work provides an evaluation on the accuracy of
the minimum-width transistor area models in ranking the
actual layout area of FPGA architectures. Both the
original VPR area model and the new COFFE area model
are compared against the actual layouts with up to
three metal layers for the various FPGA building
blocks. We found that both models have significant
variations with respect to the accuracy of their
predictions across the building blocks. In particular,
the original VPR model overestimates the layout area of
larger buffers, full adders, and multiplexers by as
much as 38\%, while they underestimate the layout area
of smaller buffers and multiplexers by as much as 58\%,
for an overall prediction error variation of 96\%. The
newer COFFE model also significantly overestimates the
layout area of full adders by 13\% and underestimates
the layout area of multiplexers by a maximum of 60\%
for a prediction error variation of 73\%. Such
variations are particularly significant considering
sensitivity analyses are not routinely performed in
FPGA architectural studies. Our results suggest that
such analyses are extremely important in studies that
employ the minimum-width area models so the tolerance
of the architectural conclusions against the prediction
error variations can be quantified. Furthermore, an
open-source version of the layouts of the actual FPGA
building blocks should be created so their actual
layout area can be used to achieve a highly accurate
ranking of the implementation area of FPGA
architectures built upon these layouts.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "8",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Wijesundera:2018:FRP,
author = "Deshya Wijesundera and Alok Prakash and Thambipillai
Srikanthan and Achintha Ihalage",
title = "Framework for Rapid Performance Estimation of Embedded
Soft Core Processors",
journal = j-TRETS,
volume = "11",
number = "2",
pages = "9:1--9:??",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3195801",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:43:00 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "The large number of embedded soft core processors
available today make it tedious and time consuming to
select the best processor for a given application. This
task is even more challenging due to the numerous
configuration options available for a single soft core
processor while optimizing for contradicting design
requirements such as performance and area. In this
article, we propose a generic framework for rapid
performance estimation of applications on soft core
processors. The proposed technique is scalable to the
large number of configuration options available in
modern soft core processors by relying on rapid and
accurate estimation models instead of time-consuming
FPGA synthesis and execution-based techniques.
Experimental results on two leading commercial soft
core processors executing applications from the widely
used CHStone benchmark suite show an average error of
less than 6\% while running in the order of minutes
when compared to hours taken by synthesis-based
techniques.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "9",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Rossi:2018:PPR,
author = "Enrico Rossi and Marvin Damschen and Lars Bauer and
Giorgio Buttazzo and J{\"o}rg Henkel",
title = "Preemption of the Partial Reconfiguration Process to
Enable Real-Time Computing With {FPGAs}",
journal = j-TRETS,
volume = "11",
number = "2",
pages = "10:1--10:??",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3182183",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:43:00 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
abstract = "To improve computing performance in real-time
applications, modern embedded platforms comprise
hardware accelerators that speed up the task's most
compute-intensive parts. A recent trend in the design
of real-time embedded systems is to integrate
field-programmable gate arrays (FPGA) that are
reconfigured with different accelerators at runtime, to
cope with dynamic workloads that are subject to timing
constraints. One of the major limitations when dealing
with partial FPGA reconfiguration in real-time systems
is that the reconfiguration port can only perform one
reconfiguration at a time: if a high-priority task
issues a reconfiguration request while the
reconfiguration port is already occupied by a
lower-priority task, the high-priority task has to wait
until the current reconfiguration is completed (a
phenomenon known as priority inversion), unless the
current reconfiguration is aborted (introducing
unbounded delays in low-priority tasks, a phenomenon
known as starvation). This article shows how priority
inversion and starvation can be solved by making the
reconfiguration process preemptive---that is, allowing
it to be interrupted at any time and resumed at a later
time without restarting it from scratch. Such a feature
is crucial for the design of runtime reconfigurable
real-time systems but not yet available in today's
platforms. Furthermore, the trade-off of achieving a
guaranteed bound on the reconfiguration delay for
low-priority tasks and the maximum delay induced for
high-priority tasks when preempting an ongoing
reconfiguration has been identified and analyzed.
Experimental results on the Xilinx Zynq-7000 platform
show that the proposed implementation of preemptive
reconfiguration introduces a low runtime overhead, thus
effectively solving priority inversion and
starvation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "10",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Petelin:2018:WEF,
  author =       {Oleg Petelin and Vaughn Betz},
  title =        {{Wotan}: Evaluating {FPGA} Architecture Routability
                 without Benchmarks},
  journal =      j-TRETS,
  volume =       {11},
  number =       {2},
  pages =        {11:1--11:??},
  month =        nov,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3195800},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Sat Oct 19 17:43:00 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {FPGA routing architectures consist of routing wires
                 and programmable switches that together account for the
                 majority of the fabric delay and area, making
                 evaluation and optimization of an FPGA's routing
                 architecture very important. Routing architectures have
                 traditionally been evaluated using a full synthesize,
                 pack, place and route CAD flow over a suite of
                 benchmark circuits. While the results are accurate, a
                 full CAD flow has a long runtime and is often tuned to
                 a specific FPGA architecture type, which limits
                 exploration of different architecture options early in
                 the design process. In this article, we present Wotan,
                 a tool to quickly estimate routability for a wide range
                 of architectures without the use of benchmark circuits.
                 At its core, our routability predictor efficiently
                 counts paths through the FPGA routing graph to (1)
                 estimate the probability of node congestion and (2)
                 estimate the probabilities to successfully route a
                 randomized subset of (source, sink) pairs, which are
                 then combined into an overall routability metric. We
                 describe our predictor and present routability
                 estimates for a range of 6-LUT and 4-LUT architectures
                 using mixes of wire types connected in complex ways,
                 showing a rank correlation of 0.91 with routability
                 results from the full VPR CAD flow while requiring 18$
                 \times $ less CPU effort.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {11},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Anandakumar:2018:RHA,
  author =       "N. Nalla Anandakumar and M. Prem Laxman Das and
                 Somitra K. Sanadhya and Mohammad S. Hashmi",
  title =        "Reconfigurable Hardware Architecture for Authenticated
                 Key Agreement Protocol Over Binary {Edwards} Curve",
  journal =      j-TRETS,
  volume =       "11",
  number =       "2",
  pages =        "12:1--12:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3231743",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In this article, we present a high-performance
                 hardware architecture for Elliptic curve based
                 (authenticated) key agreement protocol ``Elliptic Curve
                 Menezes, Qu and Vanstone'' (ECMQV) over Binary Edwards
                 Curve (BEC). We begin by analyzing inversion module on
                 a 251-bit binary field. Subsequently, we present Field
                 Programmable Gate Array (FPGA) implementations of the
                 unified formula for computing elliptic curve point
                 addition on BEC in affine and projective coordinates
                 and investigate the relative performance of these two
                 coordinates. Then, we implement the $w$-coordinate based
                 differential addition formulae suitable for usage in
                 Montgomery ladder. Next, we present a novel hardware
                 architecture of BEC point multiplication using mixed
                 $w$-coordinates of the Montgomery laddering algorithm and
                 analyze it in terms of resistance to Simple Power
                 Analysis (SPA) attack. In order to improve the
                 performance, the architecture utilizes registers
                 efficiently and uses efficient scheduling mechanisms
                 for the BEC arithmetic implementations. Our
                 implementation results show that the proposed
                 architecture is resistant against SPA attack and yields
                 a better performance when compared to the existing
                 state-of-the-art BEC designs for computing point
                 multiplication (PM). Finally, we present an FPGA design
                 of ECMQV key agreement protocol using BEC defined over
                 GF($ 2^{251} $). The execution of ECMQV protocol takes
                 66.47 $ \mu $s using 32,479 slices on Virtex-4 FPGA
                 and 52.34 $ \mu $s using 15,988 slices on Virtex-5
                 FPGA. To the best of our knowledge, this is the first
                 FPGA design of the ECMQV protocol using BEC.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Daigneault:2018:ASS,
  author =       {Marc-Andre Daigneault and Jean Pierre David},
  title =        {Automated Synthesis of Streaming Transfer Level
                 Hardware Designs},
  journal =      j-TRETS,
  volume =       {11},
  number =       {2},
  pages =        {13:1--13:??},
  month =        nov,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3243930},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Sat Oct 19 17:43:00 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {As modern field-programmable gate arrays (FPGA) enable
                 high computing performance and efficiency, their
                 programming with low-level hardware description
                 languages is time-consuming and remains a major
                 obstacle to their adoption. High-level synthesis
                 compilers are able to produce register-transfer-level
                 (RTL) designs from C/C++ algorithmic descriptions, but
                 despite allowing significant design-time improvements,
                 these tools are not always able to generate hardware
                 designs that compare to handmade RTL designs. In this
                 article, we consider synthesis from an
                 intermediate-level (IL) language that allows the
                 description of algorithmic state machines handling
                 connections between streaming sources and sinks.
                 However, the interconnection of streaming sources and
                 sinks can lead to cyclic combinational relations,
                 resulting in undesirable behaviors or un-synthesizable
                 designs. We propose a functional-level methodology to
                 automate the resolution of such cyclic relations into
                 acyclic combinational functions. The proposed IL
                 synthesis methodology has been applied to the design of
                 pipelined floating-point cores. The results obtained
                 show how the proposed IL methodology can simplify the
                 description of pipelined architectures while enabling
                 performances that are close to those achievable through
                 an RTL design methodology.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {13},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Chen:2018:ISS,
  author =       {Deming Chen and Andrew Putnam and Steve Wilton},
  title =        {Introduction to the Special Section on Deep Learning
                 in {FPGAs}},
  journal =      j-TRETS,
  volume =       {11},
  number =       {3},
  pages =        {14:1--14:??},
  month =        dec,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3294768},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Sat Oct 19 17:43:00 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {14},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Prost-Boucle:2018:HEC,
  author =       "Adrien Prost-Boucle and Alban Bourge and
                 Fr{\'e}d{\'e}ric P{\'e}trot",
  title =        "High-Efficiency Convolutional Ternary Neural Networks
                 with Custom Adder Trees and Weight Compression",
  journal =      j-TRETS,
  volume =       "11",
  number =       "3",
  pages =        "15:1--15:??",
  month =        dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3270764",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Although performing inference with artificial neural
                 networks (ANN) was until quite recently considered as
                 essentially compute intensive, the emergence of deep
                 neural networks coupled with the evolution of the
                 integration technology transformed inference into a
                 memory bound problem. This ascertainment being
                 established, many works have lately focused on
                 minimizing memory accesses, either by enforcing and
                 exploiting sparsity on weights or by using few bits for
                 representing activations and weights, to be able to use
                 ANNs inference in embedded devices. In this work, we
                 detail an architecture dedicated to inference using
                 ternary $ \{ -1, 0, 1 \} $ weights and activations. This
                 architecture is configurable at design time to provide
                 throughput vs. power trade-offs to choose from. It is
                 also generic in the sense that it uses information
                 drawn for the target technologies (memory geometries
                 and cost, number of available cuts, etc.) to adapt at
                 best to the FPGA resources. This allows to achieve up
                 to 5.2k frames per second per Watt for classification
                 on a VC709 board using approximately half of the
                 resources of the FPGA.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Blott:2018:FRE,
  author =       "Michaela Blott and Thomas B. Preu{\ss}er and Nicholas
                 J. Fraser and Giulio Gambardella and Kenneth O'Brien
                 and Yaman Umuroglu and Miriam Leeser and Kees Vissers",
  title =        "{FINN-R}: an End-to-End Deep-Learning Framework for
                 Fast Exploration of Quantized Neural Networks",
  journal =      j-TRETS,
  volume =       "11",
  number =       "3",
  pages =        "16:1--16:??",
  month =        dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3242897",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Convolutional Neural Networks have rapidly become the
                 most successful machine-learning algorithm, enabling
                 ubiquitous machine vision and intelligent decisions on
                 even embedded computing systems. While the underlying
                 arithmetic is structurally simple, compute and memory
                 requirements are challenging. One of the promising
                 opportunities is leveraging reduced-precision
                 representations for inputs, activations, and model
                 parameters. The resulting scalability in performance,
                 power efficiency, and storage footprint provides
                 interesting design compromises in exchange for a small
                 reduction in accuracy. FPGAs are ideal for exploiting
                 low-precision inference engines leveraging custom
                 precisions to achieve the required numerical accuracy
                 for a given application. In this article, we describe
                 the second generation of the FINN framework, an
                 end-to-end tool that enables design-space exploration
                 and automates the creation of fully customized
                 inference engines on FPGAs. Given a neural network
                 description, the tool optimizes for given platforms,
                 design targets, and a specific precision. We introduce
                 formalizations of resource cost functions and
                 performance predictions and elaborate on the
                 optimization algorithms. Finally, we evaluate a
                 selection of reduced precision neural networks ranging
                 from CIFAR-10 classifiers to YOLO-based object
                 detection on a range of platforms including PYNQ and
                 AWS F1, demonstrating new unprecedented measured
                 throughput at 50 TOp/s on AWS F1 and 5 TOp/s on
                 embedded devices.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Ding:2018:LLH,
  author =       {Ruizhou Ding and Zeye Liu and R. D. (Shawn) Blanton
                 and Diana Marculescu},
  title =        {Lightening the Load with Highly Accurate Storage- and
                 Energy-Efficient {LightNNs}},
  journal =      j-TRETS,
  volume =       {11},
  number =       {3},
  pages =        {17:1--17:??},
  month =        dec,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3270689},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Sat Oct 19 17:43:00 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {Hardware implementations of deep neural networks
                 (DNNs) have been adopted in many systems because of
                 their higher classification speed. However, while they
                 may be characterized by better accuracy, larger DNNs
                 require significant energy and area, thereby limiting
                 their wide adoption. The energy consumption of DNNs is
                 driven by both memory accesses and computation.
                 Binarized neural networks (BNNs), as a tradeoff between
                 accuracy and energy consumption, can achieve great
                 energy reduction and have good accuracy for large DNNs
                 due to their regularization effect. However, BNNs show
                 poor accuracy when a smaller DNN configuration is
                 adopted. In this article, we propose a new DNN
                 architecture, LightNN, which replaces the
                 multiplications to one shift or a constrained number of
                 shifts and adds. Our theoretical analysis for LightNNs
                 shows that their accuracy is maintained while
                 dramatically reducing storage and energy requirements.
                 For a fixed DNN configuration, LightNNs have better
                 accuracy at a slight energy increase than BNNs, yet are
                 more energy efficient with only slightly less accuracy
                 than conventional DNNs. Therefore, LightNNs provide
                 more options for hardware designers to trade off
                 accuracy and energy. Moreover, for large DNN
                 configurations, LightNNs have a regularization effect,
                 making them better in accuracy than conventional DNNs.
                 These conclusions are verified by experiment using the
                 MNIST and CIFAR-10 datasets for different DNN
                 configurations. Our FPGA implementation for
                 conventional DNNs and LightNNs confirms all theoretical
                 and simulation results and shows that LightNNs reduce
                 latency and use fewer FPGA resources compared to
                 conventional DNN architectures.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {17},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Meloni:2018:NEC,
  author =       "Paolo Meloni and Alessandro Capotondi and Gianfranco
                 Deriu and Michele Brian and Francesco Conti and Davide
                 Rossi and Luigi Raffo and Luca Benini",
  title =        "{NEURAghe}: Exploiting {CPU--FPGA} Synergies for
                 Efficient and Flexible {CNN} Inference Acceleration on
                 {Zynq SoCs}",
  journal =      j-TRETS,
  volume =       "11",
  number =       "3",
  pages =        "18:1--18:??",
  month =        dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3284357",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Deep convolutional neural networks (CNNs) obtain
                 outstanding results in tasks that require human-level
                 understanding of data, like image or speech
                 recognition. However, their computational load is
                 significant, motivating the development of
                 CNN-specialized accelerators. This work presents
                 NEURAghe, a flexible and efficient hardware/software
                 solution for the acceleration of CNNs on Zynq SoCs.
                 NEURAghe leverages the synergistic usage of Zynq ARM
                 cores and of a powerful and flexible
                 Convolution-Specific Processor deployed on the
                 reconfigurable logic. The Convolution-Specific
                 Processor embeds both a convolution engine and a
                 programmable soft core, releasing the ARM processors
                 from most of the supervision duties and allowing the
                 accelerator to be controlled by software at an
                 ultra-fine granularity. This methodology opens the way
                 for cooperative heterogeneous computing: While the
                 accelerator takes care of the bulk of the CNN workload,
                 the ARM cores can seamlessly execute hard-to-accelerate
                 parts of the computational graph, taking advantage of
                 the NEON vector engines to further speed up
                 computation. Through the companion NeuDNN SW stack,
                 NEURAghe supports end-to-end CNN-based classification
                 with a peak performance of 169 GOps/s, and an energy
                 efficiency of 17 GOps/W. Thanks to our heterogeneous
                 computing model, our platform improves upon the
                 state-of-the-art, achieving a frame rate of 5.5 frames
                 per second (fps) on the end-to-end execution of VGG-16
                 and 6.6 fps on ResNet-18.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Liu:2018:OCB,
  author =       "Shuanglong Liu and Hongxiang Fan and Xinyu Niu and
                 Ho-Cheung Ng and Yang Chu and Wayne Luk",
  title =        "Optimizing {CNN}-based Segmentation with Deeply
                 Customized Convolutional and Deconvolutional
                 Architectures on {FPGA}",
  journal =      j-TRETS,
  volume =       "11",
  number =       "3",
  pages =        "19:1--19:??",
  month =        dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3242900",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Convolutional Neural Networks (CNNs)-based
                 algorithms have been successful in solving image
                 recognition problems, showing very large accuracy
                 improvement. In recent years, deconvolution layers are
                 widely used as key components in the state-of-the-art
                 CNNs for end-to-end training and models to support
                 tasks such as image segmentation and super resolution.
                 However, the deconvolution algorithms are
                 computationally intensive, which limits their
                 applicability to real-time applications. Particularly,
                 there has been little research on the efficient
                 implementations of deconvolution algorithms on FPGA
                 platforms that have been widely used to accelerate CNN
                 algorithms by practitioners and researchers due to
                 their high performance and power efficiency. In this
                 work, we propose and develop deconvolution architecture
                 for efficient FPGA implementation. FPGA-based
                 accelerators are proposed for both deconvolution and
                 CNN algorithms. Besides, memory sharing between the
                 computation modules is proposed for the FPGA-based CNN
                 accelerator as well as for other optimization
                 techniques. A non-linear optimization model based on
                 the performance model is introduced to efficiently
                 explore the design space to achieve optimal processing
                 speed of the system and improve power efficiency.
                 Furthermore, a hardware mapping framework is developed
                 to automatically generate the low-latency hardware
                 design for any given CNN model on the target device.
                 Finally, we implement our designs on Xilinx Zynq ZC706
                 board and the deconvolution accelerator achieves a
                 performance of 90.1 giga operations per second (GOPS)
                 under 200 MHz working frequency and a performance
                 density of 0.10 GOPS/DSP using 32-bit quantization,
                 which significantly outperforms previous designs on
                 FPGAs. A real-time application of scene segmentation on
                 Cityscapes Dataset is used to evaluate our CNN
                 accelerator on Zynq ZC706 board, and the system
                 achieves a performance of 107 GOPS and 0.12 GOPS/DSP
                 using 16-bit quantization and supports up to 17 frames
                 per second for 512 $ \times $ 512 image inputs with a
                 power consumption of only 9.6 W.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Boutros:2018:YCI,
  author =       {Andrew Boutros and Sadegh Yazdanshenas and Vaughn
                 Betz},
  title =        {You Cannot Improve What You Do not Measure: {FPGA} vs.
                 {ASIC} Efficiency Gaps for Convolutional Neural Network
                 Inference},
  journal =      j-TRETS,
  volume =       {11},
  number =       {3},
  pages =        {20:1--20:??},
  month =        dec,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3242898},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Sat Oct 19 17:43:00 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {Recently, deep learning (DL) has become best-in-class
                 for numerous applications but at a high computational
                 cost that necessitates high-performance
                 energy-efficient acceleration. The reconfigurability of
                 FPGAs is appealing due to the rapid change in DL models
                 but also causes lower performance and area-efficiency
                 compared to ASICs. In this article, we implement three
                 state-of-the-art computing architectures (CAs) for
                 convolutional neural network (CNN) inference on FPGAs
                 and ASICs. By comparing the FPGA and ASIC
                 implementations, we highlight the area and performance
                 costs of programmability to pinpoint the inefficiencies
                 in current FPGA architectures. We perform our
                 experiments using three variations of these CAs for
                 AlexNet, VGG-16 and ResNet-50 to allow extensive
                 comparisons. We find that the performance gap varies
                 significantly from 2.8$ \times $ to 6.3$ \times $,
                 while the area gap is consistent across CAs with an 8.7
                 average FPGA-to-ASIC area ratio. Among different blocks
                 of the CAs, the convolution engine, constituting up to
                 60\% of the total area, has a high area ratio ranging
                 from 13 to 31. Motivated by our FPGA vs. ASIC
                 comparisons, we suggest FPGA architectural changes such
                 as increasing DSP block count, enhancing low-precision
                 support in DSP blocks and rethinking the on-chip
                 memories to reduce the programmability gap for DL
                 applications.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {20},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Rouhani:2018:RRT,
  author =       {Bita Darvish Rouhani and Siam Umar Hussain and Kristin
                 Lauter and Farinaz Koushanfar},
  title =        {{ReDCrypt}: Real-Time Privacy-Preserving Deep Learning
                 Inference in Clouds Using {FPGAs}},
  journal =      j-TRETS,
  volume =       {11},
  number =       {3},
  pages =        {21:1--21:??},
  month =        dec,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3242899},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Sat Oct 19 17:43:00 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {Artificial Intelligence (AI) is increasingly
                 incorporated into the cloud business in order to
                 improve the functionality (e.g., accuracy) of the
                 service. The adoption of AI as a cloud service raises
                 serious privacy concerns in applications where the risk
                 of data leakage is not acceptable. Examples of such
                 applications include scenarios where clients hold
                 potentially sensitive private information such as
                 medical records, financial data, and/or location. This
                 article proposes ReDCrypt, the first reconfigurable
                 hardware-accelerated framework that empowers
                 privacy-preserving inference of deep learning models in
                 cloud servers. ReDCrypt is well-suited for streaming
                 (a.k.a., real-time AI) settings where clients need to
                 dynamically analyze their data as it is collected over
                 time without having to queue the samples to meet a
                 certain batch size. Unlike prior work, ReDCrypt neither
                 requires to change how AI models are trained nor relies
                 on two non-colluding servers to perform. The
                 privacy-preserving computation in ReDCrypt is executed
                 using Yao's Garbled Circuit (GC) protocol. We break
                 down the deep learning inference task into two phases:
                 (i) privacy-insensitive (local) computation, and (ii)
                 privacy-sensitive (interactive) computation. We devise
                 a high-throughput and power-efficient implementation of
                 GC protocol on FPGA for the privacy-sensitive phase.
                 ReDCrypt's accompanying API provides support for
                 seamless integration of ReDCrypt into any deep learning
                 framework. Proof-of-concept evaluations for different
                 DL applications demonstrate up to 57-fold higher
                 throughput per core compared to the best prior solution
                 with no drop in the accuracy.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {21},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Yu:2018:IDC,
  author =       {Jincheng Yu and Guangjun Ge and Yiming Hu and Xuefei
                 Ning and Jiantao Qiu and Kaiyuan Guo and Yu Wang and
                 Huazhong Yang},
  title =        {Instruction Driven Cross-layer {CNN} Accelerator for
                 Fast Detection on {FPGA}},
  journal =      j-TRETS,
  volume =       {11},
  number =       {3},
  pages =        {22:1--22:??},
  month =        dec,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3283452},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Sat Oct 19 17:43:00 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {In recent years, Convolutional Neural Networks (CNNs)
                 have been widely applied in computer vision and have
                 achieved significant improvements in object detection
                 tasks. Although there are many optimizing methods to
                 speed up CNN-based detection algorithms, it is still
                 difficult to deploy detection algorithms on real-time
                 low-power systems. Field-Programmable Gate Array (FPGA)
                 has been widely explored as a platform for accelerating
                 CNN due to its promising performance, high energy
                 efficiency, and flexibility. Previous works show that
                 the energy consumption of CNN accelerators is dominated
                 by the memory access. By fusing multiple layers in CNN,
                 the intermediate data transfer can be reduced. However,
                 previous accelerators with the cross-layer scheduling
                 are designed for a particular CNN model. In addition to
                 the memory access optimization, the Winograd algorithm
                 can greatly improve the computational performance of
                 convolution. In this article, to improve the
                 flexibility of hardware, we design an
                 instruction-driven CNN accelerator, supporting the
                 Winograd algorithm and the cross-layer scheduling, for
                 object detection. We modify the loop unrolling order of
                 CNN, so that we can schedule a CNN across different
                 layers with instructions and eliminate the intermediate
                 data transfer. We propose a hardware architecture to
                 support the instructions with Winograd computation
                 units and reach the state-of-the-art energy efficiency.
                 To deploy image detection algorithms onto the proposed
                 accelerator with fixed-point computation units, we
                 adopt the fixed-point fine-tune method, which can
                 guarantee the accuracy of the detection algorithms. We
                 evaluate our accelerator and scheduling policy on the
                 Xilinx KU115 FPGA platform. The intermediate data
                 transfer can be reduced by more than 90\% on the VGG-D
                 CNN model with the cross-layer strategy. Thus, the
                 performance of our hardware accelerator reaches
                 1700GOP/s on the classification model VGG-D. We also
                 implement a framework for object detection algorithms,
                 which achieves 2.3$ \times $ and 50$ \times $ in energy
                 efficiency compared with GPU and CPU, respectively.
                 Compared with floating-point algorithms, the accuracy
                 of the fixed-point detection algorithms only drops by
                 less than 1\%.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {22},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Li:2018:EMP,
  author =       "Wensong Li and Fan Yang and Hengliang Zhu and Xuan
                 Zeng and Dian Zhou",
  title =        "An Efficient Memory Partitioning Approach for
                 Multi-Pattern Data Access via Data Reuse",
  journal =      j-TRETS,
  volume =       "12",
  number =       "1",
  pages =        "1:1--1:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3301296",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3301296",
  abstract =     "Memory bandwidth has become a bottleneck that impedes
                 performance improvement during the parallelism
                 optimization of the datapath. Memory partitioning is a
                 practical approach to reduce bank-level conflicts and
                 increase the bandwidth on a field-programmable gate
                 array. In this work, we propose a memory partitioning
                 approach for multi-pattern data access. First, we
                 propose to combine multiple patterns into a single
                 pattern to reduce the complexity of multi-pattern.
                 Then, we propose to perform data reuse analysis on the
                 combined pattern to find data reuse opportunities and
                 the non-reusable data pattern. Finally, an efficient
                 bank mapping algorithm with low complexity and low
                 overhead is proposed to find the optimal memory
                 partitioning solution. Experimental results
                 demonstrated that compared to the state-of-the-art
                 method, our proposed approach can reduce the number of
                 block RAMs by 58.9\% on average, with 79.6\% reduction
                 in SLICEs, 85.3\% reduction in LUTs, 67.9\% reduction
                 in Flip-Flops, 54.6\% reduction in DSP48Es,
                 83.9\% reduction in SRLs, 50.0\% reduction in storage
                 overhead, 95.0\% reduction in execution time, and
                 77.3\% reduction in dynamic power consumption on
                 average. Meanwhile, the performance can be improved by
                 14.0\% on average.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Guo:2018:DSF,
  author =       {Kaiyuan Guo and Shulin Zeng and Jincheng Yu and Yu
                 Wang and Huazhong Yang},
  title =        {{[DL]} A Survey of {FPGA}-based Neural Network
                 Inference Accelerators},
  journal =      j-TRETS,
  volume =       {12},
  number =       {1},
  pages =        {2:1--2:??},
  month =        apr,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3289185},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Sat Oct 19 17:43:01 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3289185},
  abstract =     {Recent research on neural networks has shown a
                 significant advantage in machine learning over
                 traditional algorithms based on handcrafted features
                 and models. Neural networks are now widely adopted in
                 regions like image, speech, and video recognition. But
                 the high computation and storage complexity of neural
                 network inference poses great difficulty on its
                 application. It is difficult for CPU platforms to offer
                 enough computation capacity. GPU platforms are the
                 first choice for neural network processes because of
                 its high computation capacity and easy-to-use
                 development frameworks. However, FPGA-based neural
                 network inference accelerator is becoming a research
                 topic. With specifically designed hardware, FPGA is the
                 next possible solution to surpass GPU in speed and
                 energy efficiency. Various FPGA-based accelerator
                 designs have been proposed with software and hardware
                 optimization techniques to achieve high speed and
                 energy efficiency. In this article, we give an overview
                 of previous work on neural network inference
                 accelerators based on FPGA and summarize the main
                 techniques used. An investigation from software to
                 hardware, from circuit level to system level is carried
                 out to complete analysis of FPGA-based neural network
                 inference accelerator design and serves as a guide to
                 future work.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {2},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Yazdanshenas:2018:CAM,
  author =       {Sadegh Yazdanshenas and Vaughn Betz},
  title =        {{COFFE 2}: Automatic Modelling and Optimization of
                 Complex and Heterogeneous {FPGA} Architectures},
  journal =      j-TRETS,
  volume =       {12},
  number =       {1},
  pages =        {3:1--3:??},
  month =        apr,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3301298},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Sat Oct 19 17:43:01 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3301298},
  abstract =     {FPGAs are becoming more heterogeneous to better adapt
                 to different markets, motivating rapid exploration of
                 different blocks/tiles for FPGAs. To evaluate a new
                 FPGA architectural idea, one should be able to
                 accurately obtain the area, delay, and energy
                 consumption of the block of interest. However, current
                 FPGA circuit design tools can only model simple,
                 homogeneous FPGA architectures with basic logic blocks
                 and also lack DSP and other heterogeneous block
                 support. Modern FPGAs are instead composed of many
                 different tiles, some of which are designed in a full
                 custom style and some of which mix standard cell and
                 full custom styles. To fill this modelling gap, we
                 introduce COFFE 2, an open-source FPGA design toolset
                 for automatic FPGA circuit design. COFFE 2 uses a mix
                 of full custom and standard cell flows and supports not
                 only complex logic blocks with fracturable lookup
                 tables and hard arithmetic but also arbitrary
                 heterogeneous blocks. To validate COFFE 2 and
                 demonstrate its features, we design and evaluate a
                 multi-mode Stratix III-like DSP block and several logic
                 tiles with fracturable LUTs and hard arithmetic. We
                 also demonstrate how COFFE 2's interface to VTR allows
                 full evaluation of block-routing interfaces and various
                 fracturable 6-LUT architectures.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {3},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}
@Article{Choi:2018:DAM,
author = "Young-Kyu Choi and Jason Cong and Zhenman Fang and
Yuchen Hao and Glenn Reinman and Peng Wei",
title = "In-Depth Analysis on Microarchitectures of Modern
Heterogeneous {CPU--FPGA} Platforms",
journal = j-TRETS,
volume = "12",
number = "1",
pages = "4:1--4:??",
month = apr,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3294054",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:43:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3294054",
abstract = "Conventional homogeneous multicore processors are not
able to provide the continued performance and energy
improvement that we have expected from past endeavors.
Heterogeneous architectures that feature specialized
hardware accelerators are widely considered a promising
paradigm for resolving this issue. Among different
heterogeneous devices, FPGAs that can be reconfigured
to accelerate a broad class of applications with
orders-of-magnitude performance/watt gains, are
attracting increased attention from both academia and
industry. As a consequence, a variety of CPU--FPGA
acceleration platforms with diversified
microarchitectural features have been supplied by
industry vendors. Such diversity, however, poses a
serious challenge to application developers in
selecting the appropriate platform for a specific
application or application domain. This article aims to
address this challenge by determining which
microarchitectural characteristics affect performance,
and in what ways. Specifically, we conduct a
quantitative comparison and an in-depth analysis on
five state-of-the-art CPU--FPGA acceleration platforms:
(1) the Alpha Data board and (2) the Amazon F1 instance
that represent the traditional PCIe-based platform with
private device memory; (3) the IBM CAPI that represents
the PCIe-based system with coherent shared memory; (4)
the first generation of the Intel Xeon+FPGA Accelerator
Platform that represents the QPI-based system with
coherent shared memory; and (5) the second generation
of the Intel Xeon+FPGA Accelerator Platform that
represents a hybrid PCIe-based (non-coherent) and
QPI-based (coherent) system with shared memory. Based
on the analysis of their CPU--FPGA communication
latency and bandwidth characteristics, we provide a
series of insights for both application developers and
platform designers. Furthermore, we conduct two case
studies to demonstrate how these insights can be
leveraged to optimize accelerator designs. The
microbenchmarks used for evaluation have been released
for public use.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "4",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Cao:2018:FRA,
author = "Shijie Cao and Lanshun Nie and Dechen Zhan and
Wenqiang Wang and Ningyi Xu and Ramashis Das and Ming
Wu and Lintao Zhang and Derek Chiou",
title = "{FlexSaaS}: a Reconfigurable Accelerator for {Web}
Search Selection",
journal = j-TRETS,
volume = "12",
number = "1",
pages = "5:1--5:??",
month = apr,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3301409",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:43:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3301409",
abstract = "Web search engines deploy large-scale selection
services on CPUs to identify a set of web pages that
match user queries. An FPGA-based accelerator can
exploit various levels of parallelism and provide a
lower latency, higher throughput, more energy-efficient
solution than commodity CPUs. However, maintaining such
a customized accelerator in a commercial search engine
is challenging because selection services are changed
often. This article presents our design for FlexSaaS
(Flexible Selection as a Service), an FPGA-based
accelerator for web search selection. To address
efficiency and flexibility challenges, FlexSaaS
abstracts computing models and separates memory access
from computation. Specifically, FlexSaaS (i) contains a
reconfigurable number of matching processors that can
handle various possible query plans, (ii) decouples
index stream reading from matching computation to fetch
and decode index files, and (iii) includes a universal
memory accessor that hides the complex memory hierarchy
and reduces host data access latency. Evaluated on
FPGAs in the selection service of a commercial web
search---the Bing web search engine---FlexSaaS can be
evolved quickly to adapt to new updates. Compared to
the software baseline, FlexSaaS on Arria 10 reduces
average latency by 30\% and increases throughput by
1.5$ \times $.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "5",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Liu:2019:PFF,
author = "Gai Liu and Zhiru Zhang",
title = "{PIMap}: a Flexible Framework for Improving
{LUT}-Based Technology Mapping via Parallelized
Iterative Optimization",
journal = j-TRETS,
volume = "11",
number = "4",
pages = "23:1--23:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3268344",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:43:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3268344",
abstract = "Modern FPGA synthesis tools typically apply a
predetermined sequence of logic optimizations on the
input logic network before carrying out technology
mapping. While the ``known recipes'' of logic
transformations often lead to improved mapping results,
there remains a nontrivial gap between the quality
metrics driving the pre-mapping logic optimizations and
those targeted by the actual technology mapping.
Needless to mention, such miscorrelations would
eventually result in suboptimal quality of results. In
this article, we propose PIMap, which couples logic
transformations and technology mapping under an
iterative improvement framework for LUT-based FPGAs. In
each iteration, PIMap randomly proposes a
transformation on the given logic network from an
ensemble of candidate optimizations; it then invokes
technology mapping and makes use of the mapping result
to determine the likelihood of accepting the proposed
transformation. By adjusting the optimization objective
and incorporating required time constraints during the
iterative process, PIMap can flexibly optimize for
different objectives including area minimization, delay
optimization, and delay-constrained area reduction. To
mitigate the runtime overhead, we further introduce
parallelization techniques to decompose a large design
into multiple smaller sub-netlists that can be
optimized simultaneously. Experimental results show
that PIMap achieves promising quality improvement over
a set of commonly used benchmarks, including improving
the majority of the best-known area and delay records
for the EPFL benchmark suite.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "23",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Wang:2019:FBA,
author = "Haomiao Wang and Prabu Thiagaraj and Oliver Sinnen",
title = "{FPGA}-based Acceleration of {FT} Convolution for
Pulsar Search Using {OpenCL}",
journal = j-TRETS,
volume = "11",
number = "4",
pages = "24:1--24:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3268933",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:43:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3268933",
abstract = "The Square Kilometre Array (SKA) project will be the
world's largest radio telescope array. With its large
number of antennas, the number of signals that need to
be processed is dramatic. One important element of the
SKA's Central Signal Processor package is pulsar
search. This article focuses on the FPGA-based
acceleration of the Frequency-Domain Acceleration
Search module, which is a part of SKA pulsar search
engine. In this module, the frequency-domain input
signals have to be processed by 85 Finite Impulse
Response (FIR) filters within a short period of
limitation and for thousands of input arrays. Because
of the large scale of the input length and FIR filter
size, even high-end FPGA devices cannot parallelise the
task completely. We start by investigating both
time-domain FIR filter (TDFIR) and frequency-domain FIR
filter (FDFIR) to tackle this task. We applied the
overlap-add algorithm to split the coefficient array of
TDFIR and the overlap-save algorithm to split the input
signals of FDFIR. To achieve fast prototyping design,
we employed OpenCL, which is a high-level FPGA
development technique. The performance and power
consumption are evaluated using multiple FPGA devices
simultaneously and compared with GPU results, which is
achieved by porting FPGA-based OpenCL kernels. The
experimental evaluation shows that the FDFIR solution
is very competitive in terms of performance, with a
clear energy consumption advantage over the GPU
solution.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "24",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Kroh:2019:EFG,
author = "Alexander Kroh and Oliver Diessel",
title = "Efficient Fine-grained Processor-logic Interactions on
the Cache-coherent {Zynq} Platform",
journal = j-TRETS,
volume = "11",
number = "4",
pages = "25:1--25:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3277506",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:43:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3277506",
abstract = "The introduction of cache-coherent processor-logic
interconnects in CPU--FPGA platforms promises
low-latency communication between CPU and FPGA fabrics.
This reduced latency improves the performance of
heterogeneous systems implemented on such devices and
gives rise to new software architectures that can
better use the available hardware. Via an extended
study accelerating the software task scheduler of a
microkernel operating system, this article reports on
the potential for accelerating applications that
exhibit fine-grained interactions. In doing so, we
evaluate the performance of direct and cache-coherent
communication methods for applications that involve
frequent, low-bandwidth transactions between CPU and
programmable logic. In the specific case we studied, we
found that replacing a highly optimised software
implementation of the task scheduler with an FPGA-based
scheduler reduces the cost of communication between two
software threads by 5.5\%. We also found that, while
hardware acceleration reduces cache footprint, we still
observe execution time variability because of other
non-deterministic features of the CPU.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "25",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Dumpala:2019:LUE,
author = "Naveen Kumar Dumpala and Shivukumar B. Patil and
Daniel Holcomb and Russell Tessier",
title = "Loop Unrolling for Energy Efficiency in Low-Cost
Field-Programmable Gate Arrays",
journal = j-TRETS,
volume = "11",
number = "4",
pages = "26:1--26:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3289186",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:43:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3289186",
abstract = "Field-programmable gate arrays (FPGAs) are used for a
wide variety of computations in low-cost embedded
systems. Although these systems often have modest
performance constraints, their energy consumption must
typically be limited. Many FPGA applications employ
repetitive loops that cannot be straightforwardly split
into parallel computations. Performing a loop
sequentially generally requires high-speed clocks that
consume considerable clock power and sometimes require
clock generation using a phase-locked loop (PLL). Loop
unrolling addresses the high-speed clock issue, but its
use often leads to significant combinational glitch
power. In this work, a computer-aided design (CAD)
approach that unrolls loops for designs targeted to
low-cost FPGAs is described. Our approach considers
latency constraints in an effort to minimize energy
consumption for loop-based computation. To reduce
glitch power, a glitch-filtering approach is introduced
that provides a balance between glitch reduction and
design performance. Glitch-filter enable signals are
generated and routed to the filters using resources
best suited to the target FPGA. Our approach
automatically inserts glitch filters and associated
control logic into a design prior to processing with
FPGA synthesis, place, and route tools. Our
energy-saving loop-unrolling approach has been
evaluated using five benchmarks often used in low-cost
FPGAs. The energy-saving capabilities of the approach
have been evaluated for an Intel Cyclone IV and a
Xilinx Artix-7 FPGA using board-level power
measurement. The use of unrolling and glitch filtering
is shown to reduce energy by at least 65\% for an
Artix-7 device and 50\% for a Cyclone IV device while
meeting design latency constraints.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "26",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Chen:2019:EMN,
author = "Deming Chen",
title = "Editorial: a Message from the New {Editor-in-Chief}",
journal = j-TRETS,
volume = "12",
number = "2",
pages = "6:1--6:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3326451",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:43:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3326451",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "6e",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Morcel:2019:FAC,
author = "Raghid Morcel and Hazem Hajj and Mazen A. R. Saghir
and Haitham Akkary and Hassan Artail and Rahul Khanna
and Anil Keshavamurthy",
title = "{FeatherNet}: an Accelerated Convolutional Neural
Network Design for Resource-constrained {FPGAs}",
journal = j-TRETS,
volume = "12",
number = "2",
pages = "6:1--6:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3306202",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:43:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3306202",
abstract = "Convolutional Neural Network (ConvNet or CNN)
algorithms are characterized by a large number of model
parameters and high computational complexity. These two
requirements have made it challenging for
implementations on resource-limited FPGAs. The
challenges are magnified when considering designs for
low-end FPGAs. While previous work has demonstrated
successful ConvNet implementations with high-end FPGAs,
this article presents a ConvNet accelerator design that
enables the implementation of complex deep ConvNet
architectures on resource-constrained FPGA platforms
aimed at the IoT market. We call the design
``FeatherNet'' for its light resource utilization. The
implementations are VHDL-based providing flexibility in
design optimizations. As part of the design process,
new methods are introduced to address several design
challenges. The first method is a novel stride-aware
graph-based method targeted at ConvNets that aims at
achieving efficient signal processing with reduced
resource utilization. The second method addresses the
challenge of determining the minimal precision
arithmetic needed while preserving high accuracy. For
this challenge, we propose variable-width dynamic
fixed-point representations combined with a
layer-by-layer design-space pruning heuristic across
the different layers of the deep ConvNet model. The
third method aims at achieving a modular design that
can support different types of ConvNet layers while
ensuring low resource utilization. For this challenge,
we propose the modules to be relatively small and
composed of computational filters that can be
interconnected to build an entire accelerator design.
These model elements can be easily configured through
HDL parameters (e.g., layer type, mask size, stride,
etc.) to meet the needs of specific ConvNet
implementations and thus they can be reused to
implement a wide variety of ConvNet architectures. The
fourth method addresses the challenge of design
portability between two different FPGA vendor
platforms, namely, Intel/Altera and Xilinx. For this
challenge, we propose to instantiate the
device-specific hardware blocks needed in each
computational filter, rather than relying on the
synthesis tools to infer these blocks, while keeping
track of the similarities and differences between the
two platforms. We believe that the solutions to these
design challenges further advance knowledge as they can
benefit designers and other researchers using similar
devices or facing similar challenges. Our results
demonstrated the success of addressing the design
challenges and achieving low (30\%) resource
utilization for the low-end FPGA platforms: Zedboard
and Cyclone V. The design overcame the limitation of
designs targeted for high-end platforms and that cannot
fit on low-end IoT platforms. Furthermore, our design
showed superior performance results (measured in terms
of [Frame/s/W] per Dollar) compared to high-end
optimized designs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "6",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Zhou:2019:FAN,
author = "Xuegong Zhou and Lingli Wang and Alan Mishchenko",
title = "Fast Adjustable {NPN} Classification Using Generalized
Symmetries",
journal = j-TRETS,
volume = "12",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3313917",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:43:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3313917",
abstract = "NPN classification of Boolean functions is a powerful
technique used in many logic synthesis and technology
mapping tools in both standard cell and FPGA design
flows. Computing the canonical form is the most common
approach of Boolean function classification. This
article proposes two different hybrid NPN canonical
forms and a new algorithm to compute them. By
exploiting symmetries under different phase assignment
as well as higher-order symmetries, the search space of
NPN canonical form computation is pruned and the
runtime is dramatically reduced. Nevertheless, the
runtime for some difficult functions remains high. Fast
heuristic method can be used for such functions to
compute semi-canonical forms in a reasonable time. The
proposed algorithm can be adjusted to be a slow exact
algorithm or a fast heuristic algorithm with lower
quality. For exact NPN classification, the proposed
algorithm is 40$ \times $ faster than state-of-the-art.
For heuristic classification, the proposed algorithm
has similar performance as state-of-the-art with a
possibility to trade runtime for quality.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "7",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Oppermann:2019:EPM,
author = "Julian Oppermann and Melanie Reuter-Oppermann and
Lukas Sommer and Andreas Koch and Oliver Sinnen",
title = "Exact and Practical Modulo Scheduling for High-Level
Synthesis",
journal = j-TRETS,
volume = "12",
number = "2",
pages = "8:1--8:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3317670",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:43:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3317670",
abstract = "Loop pipelining is an essential technique in
high-level synthesis to increase the throughput and
resource utilisation of field-programmable gate
array--based accelerators. It relies on modulo
schedulers to compute an operator schedule that allows
subsequent loop iterations to overlap partially when
executed while still honouring all precedence and
resource constraints. Modulo schedulers face a
bi-criteria problem: minimise the initiation interval
(II; i.e., the number of timesteps after which new
iterations are started) and minimise the schedule
length. We present Moovac, a novel exact formulation
that models all aspects (including the II minimisation)
of the modulo scheduling problem as a single integer
linear program, and discuss simple measures to prevent
excessive runtimes, to challenge the old preconception
that exact modulo scheduling is impractical. We
substantiate this claim by conducting an experimental
study covering 188 loops from two established
high-level synthesis benchmark suites, four different
time limits, and three bounds for the schedule length,
to compare our approach against a highly tuned exact
formulation and a state-of-the-art heuristic algorithm.
In the fastest configuration, an accumulated runtime of
under 16 minutes is spent on scheduling all loops, and
proven optimal IIs are found for 179 test instances.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "8",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Bo:2019:APR,
author = "Chunkun Bo and Vinh Dang and Ted Xie and Jack Wadden
and Mircea Stan and Kevin Skadron",
title = "Automata Processing in Reconfigurable Architectures:
In-the-Cloud Deployment, Cross-Platform Evaluation, and
Fast Symbol-Only Reconfiguration",
journal = j-TRETS,
volume = "12",
number = "2",
pages = "9:1--9:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3314576",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:43:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3314576",
abstract = "We present a general automata processing framework on
FPGAs, which generates an RTL kernel for automata
processing together with an AXI and PCIe based I/O
circuitry. We implement the framework on both local
nodes and cloud platforms (Amazon AWS and Nimbix) with
novel features. A full performance comparison of the
proposed framework is conducted against
state-of-the-art automata processing engines on CPUs,
GPUs, and Micron's Automata Processor using the ANMLZoo
benchmark suite and some real-world datasets. Results
show that FPGAs enable extremely high-throughput
automata processing compared to von Neumann
architectures. We also collect the resource utilization
and power consumption on the two cloud platforms, and
find that the I/O circuitry consumes most of the
hardware resources and power. Furthermore, we propose a
fast, symbol-only reconfiguration mechanism based on
the framework for large pattern sets that cannot fit on
a single device and need to be partitioned. The
proposed method supports multiple passes of the input
stream and reduces the re-compilation cost from hours
to seconds.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "9",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Dinh:2019:NFI,
author = "Van Luan Dinh and Xuan Truong Nguyen and Hyuk-Jae
Lee",
title = "A Novel {FPGA} Implementation of a Time-to-Digital
Converter Supporting Run-Time Estimation and
Compensation",
journal = j-TRETS,
volume = "12",
number = "2",
pages = "10:1--10:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3322482",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:43:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3322482",
abstract = "Time-to-digital converters (TDCs) are widely used in
applications that require the measurement of the time
interval between events. In previous designs using a
feedback loop and an extended delay line,
process-voltage-temperature (PVT) variation often
decreases the accuracy of measurements. To overcome the
loss of accuracy caused by PVT variation, this study
proposes a novel design of a synthesizable TDC that
employs run-time estimation and compensation of PVT
variation. A delay line consisting of a series of
buffers is used to detect the period of a ring
oscillator designed to measure the time interval
between two events. By comparing the detected period
and the system clock, the variation of the oscillation
period is compensated at run-time. The proposed TDC is
successfully implemented by using a low-cost Xilinx
Spartan-6 LX9 FPGA with a 50-MHz oscillator.
Experimental results show that the proposed TDC is
robust to PVT variation with a resolution of 19.1 ps.
In comparison with previous design, the proposed TDC
achieves about five times better tradeoff in the area,
resolution, and frequency of the reference clock.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "10",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Bobda:2019:ISS,
author = "Christophe Bobda and Ken Eguro",
title = "Introduction to the Special Section on Security in
{FPGA}-accelerated Cloud and Datacenters",
journal = j-TRETS,
volume = "12",
number = "3",
pages = "11:1--11:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3352060",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:43:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3352060",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "11e",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Giechaskiel:2019:LWE,
author = "Ilias Giechaskiel and Ken Eguro and Kasper B.
Rasmussen",
title = "Leakier Wires: Exploiting {FPGA} Long Wires for
Covert- and Side-channel Attacks",
journal = j-TRETS,
volume = "12",
number = "3",
pages = "11:1--11:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3322483",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:43:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3322483",
abstract = "In complex FPGA designs, implementations of algorithms
and protocols from third-party sources are common.
However, the monolithic nature of FPGAs means that all
sub-circuits share common on-chip infrastructure, such
as routing resources. This presents an attack vector
for all FPGAs that contain designs from multiple
vendors, especially for FPGAs used in multi-tenant
cloud environments, or integrated into multi-core
processors. In this article, we show that ``long''
routing wires present a new source of information
leakage on FPGAs, by influencing the delay of adjacent
long wires. We show that the effect is measurable for
both static and dynamic signals and that it can be
detected using small on-board circuits. We characterize
the channel in detail and show that it is measurable
even when multiple competing circuits (including
multiple long-wire transmitters) are present and can be
replicated on different generations and families of
Xilinx devices (Virtex 5, Virtex 6, Artix 7, and
Spartan 7). We exploit the leakage to create a covert
channel with 6kbps of bandwidth and 99.9\% accuracy,
and a side channel, which can recover signals kept
constant for only 1.3 $ \mu $s, with an accuracy of
more than 98.4\%. Finally, we propose countermeasures
to reduce the impact of this leakage.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "11",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Krautter:2019:MEL,
author = "Jonas Krautter and Dennis R. E. Gnad and Mehdi B.
Tahoori",
title = "Mitigating Electrical-level Attacks towards Secure
Multi-Tenant {FPGAs} in the Cloud",
journal = j-TRETS,
volume = "12",
number = "3",
pages = "12:1--12:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3328222",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:43:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3328222",
abstract = "A rising trend is the use of multi-tenant FPGAs,
particularly in cloud environments, where partial
access to the hardware is given to multiple third
parties. This leads to new types of attacks in FPGAs,
which operate not only on the logic level, but also on
the electrical level through the common power delivery
network. Since FPGAs are configured from the
software-side, attackers are enabled to launch hardware
attacks from software, impacting the security of an
entire system. In this article, we show the first
attempt of a countermeasure against attacks on the
electrical level, which is based on a bitstream
checking methodology. Bitstreams are translated back
into flat technology mapped netlists, which are then
checked for properties that indicate potential
malicious runtime behavior of FPGA logic. Our approach
can provide a metric of potential risk of the FPGA
bitstream being used in active fault or passive
side-channel attacks against other users of the FPGA
fabric or the entire SoC platform.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "12",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Elrabaa:2019:PPP,
  author          = {Muhammad E. S. Elrabaa and Mohamed A. Al-Asli and
                     Marwan H. Abu-Amara},
  title           = {A Protection and Pay-per-use Licensing Scheme for
                     On-cloud {FPGA} Circuit {IPs}},
  journal         = j-TRETS,
  volume          = {12},
  number          = {3},
  pages           = {13:1--13:??},
  month           = sep,
  year            = {2019},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3329861},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Oct 19 17:43:02 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/ft_gateway.cfm?id=3329861},
  abstract        = {Using security primitives, a novel scheme for
                     licensing hardware intellectual properties (HWIPs) on
                     Field Programmable Gate Arrays (FPGAs) in public clouds
                     is proposed. The proposed scheme enforces a pay-per-use
                     model, allows HWIP's installation only on specific
                     on-cloud FPGAs, and efficiently protects the HWIPs from
                     being cloned, reverse engineered, or used without the
                     owner's authorization by any party, including a cloud
                     insider. It also provides protection for the users'
                     designs integrated with the HWIP on the same FPGA. This
                     enables cloud tenants to license HWIPs in the cloud
                     from the HWIP vendors at a relatively low price based
                     on usage instead of paying the expensive unlimited HWIP
                     license fee. The scheme includes a protocol for FPGA
                     authentication, HWIP secure decryption, and usage by
                     the clients without the need for the HWIP vendor to be
                     involved or divulge their secret keys. A complete
                     prototype test-bed implementation showed that the
                     proposed scheme is very feasible with relatively low
                     resource utilization. Experiments also showed that a
                     HWIP could be licensed and set up in the on-cloud FPGA
                     in 0.9s. This is 15 times faster than setting up the
                     same HWIP from outside the cloud, which takes about 14s
                     based on the average global Internet speed.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {13},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Zhang:2019:RAD,
  author          = {Jiliang Zhang and Gang Qu},
  title           = {Recent Attacks and Defenses on {FPGA}-based Systems},
  journal         = j-TRETS,
  volume          = {12},
  number          = {3},
  pages           = {14:1--14:??},
  month           = sep,
  year            = {2019},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3340557},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Oct 19 17:43:02 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/ft_gateway.cfm?id=3340557},
  abstract        = {Field-programmable gate array (FPGA) is a kind of
                     programmable chip that is widely used in many areas,
                     including automotive electronics, medical devices,
                     military and consumer electronics, and is gaining more
                     popularity. Unlike the application specific integrated
                     circuits (ASIC) design, an FPGA-based system has its
                     own supply-chain model and design flow, which brings
                     interesting security and trust challenges. In this
                     survey, we review the security and trust issues related
                     to FPGA-based systems from the market perspective,
                     where we model the market with the following parties:
                     FPGA vendors, foundries, IP vendors, EDA tool vendors,
                     FPGA-based system developers, and end-users. For each
                     party, we show the security and trust problems they
                     need to be aware of and the associated solutions that
                     are available. We also discuss some challenges and
                     opportunities in the security and trust of FPGA-based
                     systems used in large-scale cloud and datacenters.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {14},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Umuroglu:2019:OBS,
  author          = {Yaman Umuroglu and Davide Conficconi and Lahiru
                     Rasnayake and Thomas B. Preusser and Magnus
                     Sj{\"a}lander},
  title           = {Optimizing Bit-Serial Matrix Multiplication for
                     Reconfigurable Computing},
  journal         = j-TRETS,
  volume          = {12},
  number          = {3},
  pages           = {15:1--15:??},
  month           = sep,
  year            = {2019},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3337929},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Oct 19 17:43:02 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/ft_gateway.cfm?id=3337929},
  abstract        = {Matrix--matrix multiplication is a key computational
                     kernel for numerous applications in science and
                     engineering, with ample parallelism and data locality
                     that lends itself well to high-performance
                     implementations. Many matrix multiplication-dependent
                     applications can use reduced-precision integer or
                     fixed-point representations to increase their
                     performance and energy efficiency while still offering
                     adequate quality of results. However, precision
                     requirements may vary between different application
                     phases or depend on input data, rendering
                     constant-precision solutions ineffective. BISMO, a
                     vectorized bit-serial matrix multiplication overlay for
                     reconfigurable computing, previously utilized the
                     excellent binary-operation performance of FPGAs to
                     offer a matrix multiplication performance that scales
                     with required precision and parallelism. We show how
                     BISMO can be scaled up on Xilinx FPGAs using an
                     arithmetic architecture that better utilizes six-input
                     LUTs. The improved BISMO achieves a peak performance of
                     15.4 binary TOPS on the Ultra96 board with a Xilinx
                     UltraScale+ MPSoC.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {15},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Al-Hyari:2019:NCE,
  author          = {Abeer Al-Hyari and Ziad Abuowaimer and Timothy Martin
                     and Gary Gr{\'e}wal and Shawki Areibi and Anthony
                     Vannelli},
  title           = {Novel Congestion-estimation and Routability-prediction
                     Methods based on Machine Learning for Modern {FPGAs}},
  journal         = j-TRETS,
  volume          = {12},
  number          = {3},
  pages           = {16:1--16:??},
  month           = sep,
  year            = {2019},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3337930},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Oct 19 17:43:02 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/ft_gateway.cfm?id=3337930},
  abstract        = {Effectively estimating and managing congestion during
                     placement can save substantial placement and routing
                     runtime. In this article, we present a machine-learning
                     model for accurately and efficiently estimating
                     congestion during FPGA placement. Compared with the
                     state-of-the-art machine-learning congestion-estimation
                     model, our results show a 25\% improvement in
                     prediction accuracy. This makes our model competitive
                     with congestion estimates produced using a global
                     router. However, our model runs, on average, 291$
                     \times $ faster than the global router. Overall, we are
                     able to reduce placement runtimes by 17\% and router
                     runtimes by 19\%. An additional machine-learning model
                     is also presented that uses the output of the first
                     congestion-estimation model to determine whether or not
                     a placement is routable. This second model has an
                     accuracy in the range of 93\% to 98\%, depending on the
                     classification algorithm used to implement the learning
                     model, and runtimes of a few milliseconds, thus making
                     it suitable for inclusion in any placer with no worry
                     of additional computational overhead.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {16},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Owaida:2019:DID,
  author          = {Muhsen Owaida and Amit Kulkarni and Gustavo Alonso},
  title           = {Distributed Inference over Decision Tree Ensembles on
                     Clusters of {FPGAs}},
  journal         = j-TRETS,
  volume          = {12},
  number          = {4},
  pages           = {17:1--17:??},
  month           = oct,
  year            = {2019},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3340263},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Oct 19 17:43:02 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/ft_gateway.cfm?id=3340263},
  abstract        = {Given the growth in data inputs and application
                     complexity, it is often the case that a single hardware
                     accelerator is not enough to solve a given problem. In
                     particular, the computational demands and I/O of many
                     tasks in machine learning often require a cluster of
                     accelerators to make a relevant difference in
                     performance. In this article, we explore the efficient
                     construction of FPGA clusters using inference over
                     Decision Tree Ensembles as the target application. The
                     article explores several levels of the problem: (1) a
                     lightweight inter-FPGA communication protocol and
                     routing layer to facilitate the communication between
                     the different FPGAs, (2) the data partitioning and
                     distribution strategies maximizing performance, (3) and
                     an in depth analysis on how applications can be
                     efficiently distributed over such a cluster. The
                     experimental analysis shows that the resulting system
                     can support inference over decision tree ensembles at a
                     significantly higher throughput than that achieved by
                     existing systems.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {17},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Ahmed:2019:FAB,
  author          = {Ibrahim Ahmed and Shuze Zhao and James Meijers and
                     Olivier Trescases and Vaughn Betz},
  title           = {{FRoC 2.0}: Automatic {BRAM} and Logic Testing to
                     Enable Dynamic Voltage Scaling for {FPGA}
                     Applications},
  journal         = j-TRETS,
  volume          = {12},
  number          = {4},
  pages           = {20:1--20:??},
  month           = oct,
  year            = {2019},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3354188},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Oct 19 17:43:02 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/ft_gateway.cfm?id=3354188},
  abstract        = {In earlier technology nodes, FPGAs had low power
                     consumption compared to other compute chips such as
                     CPUs and GPUs. However, in the 14nm technology node,
                     FPGAs are consuming unprecedented power in the 100+W
                     range, making power consumption a pressing concern. To
                     reduce FPGA power consumption, several researchers have
                     proposed deploying dynamic voltage scaling. While the
                     previously proposed solutions show promising results,
                     they have difficulty guaranteeing safe operation at
                     reduced voltages for applications that use the FPGA
                     hard blocks. In this work, we present the first DVS
                     solution that is able to fully handle FPGA applications
                     that use BRAMs. Our solution not only robustly tests
                     the soft logic component of the application but also
                     tests all components connected to the BRAMs. We extend
                     a previously proposed CAD tool, FRoC, to automatically
                     generate calibration bitstreams that are used to
                     measure the application's critical path delays on
                     silicon. The calibration bitstreams also include
                     testers that ensure all used SRAM cells operate safely
                     while scaling V$_{dd}$. We experimentally show that
                     using our DVS solution we can save 32\% of the total
                     power consumed by a discrete Fourier transform
                     application running with the fixed nominal supply
                     voltage and clocked at the F$_{max}$ reported by static
                     timing analysis.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {20},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Tridgell:2019:UTN,
author = "Stephen Tridgell and Martin Kumm and Martin Hardieck
and David Boland and Duncan Moss and Peter Zipf and
Philip H. W. Leong",
title = "Unrolling Ternary Neural Networks",
journal = j-TRETS,
volume = "12",
number = "4",
pages = "22:1--22:??",
month = oct,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3359983",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Oct 19 17:43:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3359983",
abstract = "The computational complexity of neural networks for
large-scale or real-time applications necessitates
hardware acceleration. Most approaches assume that the
network architecture and parameters are unknown at
design time, permitting usage in a large number of
applications. This article demonstrates, for the case
where the neural network architecture and ternary
weight values are known a priori, that extremely high
throughput implementations of neural network inference
can be made by customising the datapath and routing to
remove unnecessary computations and data movement. This
approach is ideally suited to FPGA implementations as a
specialized implementation of a trained network
improves efficiency while still retaining generality
with the reconfigurability of an FPGA. A VGG-style
network with ternary weights and fixed point
activations is implemented for the CIFAR10 dataset on
Amazon's AWS F1 instance. This article demonstrates how
to remove 90\% of the operations in convolutional
layers by exploiting sparsity and compile-time
optimizations. The implementation in hardware achieves
$ 90.9 \pm 0.1 $\% accuracy and 122k frames per second,
with a latency of only 29 $ \mu $ s, which is the fastest
CNN inference implementation reported so far on an
FPGA.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "22",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Serre:2020:DBH,
  author          = {Fran{\c{c}}ois Serre and Markus P{\"u}schel},
  title           = {{DSL}-Based Hardware Generation with {Scala}: Example
                     {Fast Fourier Transforms} and Sorting Networks},
  journal         = j-TRETS,
  volume          = {13},
  number          = {1},
  pages           = {1:1--1:23},
  month           = feb,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3359754},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Thu Feb 6 08:37:52 MST 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3359754},
  abstract        = {We present a hardware generator for computations with
                     regular structure including the fast Fourier transform
                     (FFT), sorting networks, and others. The input of the
                     generator is a high-level description of the algorithm;
                     the output is a token-based, \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {1},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Alachiotis:2020:RXF,
  author          = {Nikolaos Alachiotis and Charalampos Vatsolakis and
                     Grigorios Chrysos and Dionisios Pnevmatikatos},
  title           = {{RAiSD-X}: a Fast and Accurate {FPGA} System for the
                     Detection of Positive Selection in Thousands of
                     Genomes},
  journal         = j-TRETS,
  volume          = {13},
  number          = {1},
  pages           = {2:1--2:30},
  month           = feb,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3364225},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Thu Feb 6 08:37:52 MST 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3364225},
  abstract        = {Detecting traces of positive selection in genomes
                     carries theoretical significance and has practical
                     applications from shedding light on the forces that
                     drive adaptive evolution to the design of more
                     effective drug treatments. The size of genomic
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {2},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Attia:2020:FFI,
  author          = {Sameh Attia and Vaughn Betz},
  title           = {Feel Free to Interrupt: Safe Task Stopping to Enable
                     {FPGA} Checkpointing and Context Switching},
  journal         = j-TRETS,
  volume          = {13},
  number          = {1},
  pages           = {3:1--3:27},
  month           = feb,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3372491},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Thu Feb 6 08:37:52 MST 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3372491},
  abstract        = {Saving and restoring an FPGA task state in an orderly
                     manner is essential to enable hardware checkpointing,
                     which is highly desirable to improve the ability to
                     debug cloud-scale hardware services, and context
                     switching, which allows multiple users to \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {3},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Jamal:2020:FTH,
  author          = {Al-Shahna Jamal and Eli Cahill and Jeffrey Goeders and
                     Steven J. E. Wilton},
  title           = {Fast Turnaround {HLS} Debugging Using Dependency
                     Analysis and Debug Overlays},
  journal         = j-TRETS,
  volume          = {13},
  number          = {1},
  pages           = {4:1--4:26},
  month           = feb,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3372490},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Thu Feb 6 08:37:52 MST 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3372490},
  abstract        = {High-level synthesis (HLS) has gained considerable
                     traction over recent years, as it allows for faster
                     development and verification of hardware accelerators
                     than traditional RTL design. While HLS allows for most
                     bugs to be caught during software \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {4},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Kourfali:2020:CDD,
  author          = {Alexandra Kourfali and Dirk Stroobandt},
  title           = {In-Circuit Debugging with Dynamic Reconfiguration of
                     {FPGA} Interconnects},
  journal         = j-TRETS,
  volume          = {13},
  number          = {1},
  pages           = {5:1--5:29},
  month           = feb,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3375459},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Thu Feb 6 08:37:52 MST 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3375459},
  abstract        = {In this work, a novel method for in-circuit debugging
                     on FPGAs is introduced that allows the insertion of
                     low-overhead debugging infrastructure by exploiting the
                     technique of parameterized configurations. This allows
                     the parameterization of the LUTs and \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {5},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Garg:2020:HNC,
author = "Tushar Garg and Saud Wasly and Rodolfo Pellizzoni and
Nachiket Kapre",
title = "{HopliteBuf}: Network Calculus-Based Design of {FPGA
NoCs} with Provably Stall-Free {FIFOs}",
journal = j-TRETS,
volume = "13",
number = "2",
pages = "6:1--6:35",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3375899",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Jun 11 15:19:14 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3375899",
abstract = "HopliteBuf is a deflection-free, low-cost, and
high-speed FPGA overlay Network-on-chip (NoC) with
stall-free buffers. It is an FPGA-friendly 2D
unidirectional torus topology built on top of HopliteRT
overlay NoC. The stall-free buffers in HopliteBuf are
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "6",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Fraser:2020:KNL,
  author          = {Nicholas J. Fraser and Philip H. W. Leong},
  title           = {Kernel Normalised Least Mean Squares with Delayed
                     Model Adaptation},
  journal         = j-TRETS,
  volume          = {13},
  number          = {2},
  pages           = {7:1--7:30},
  month           = jun,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3376924},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Thu Jun 11 15:19:14 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3376924},
  abstract        = {Kernel adaptive filters (KAFs) are non-linear filters
                     which can adapt temporally and have the additional
                     benefit of being computationally efficient through use
                     of the ``kernel trick''. In a number of real-world
                     applications, such as channel equalisation, \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {7},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Besta:2020:SCM,
author = "Maciej Besta and Marc Fischer and Tal Ben-Nun and
Dimitri Stanojevic and Johannes {De Fine Licht} and
Torsten Hoefler",
title = "Substream-Centric Maximum Matchings on {FPGA}",
journal = j-TRETS,
volume = "13",
number = "2",
pages = "8:1--8:33",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3377871",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Jun 11 15:19:14 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3377871",
abstract = "Developing high-performance and energy-efficient
algorithms for maximum matchings is becoming
increasingly important in social network analysis,
computational sciences, scheduling, and others. In this
work, we propose the first maximum matching \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "8",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Murray:2020:VHP,
  author          = {Kevin E. Murray and Oleg Petelin and Sheng Zhong and
                     Jia Min Wang and Mohamed Eldafrawy and Jean-Philippe
                     Legault and Eugene Sha and Aaron G. Graham and Jean Wu
                     and Matthew J. P. Walker and Hanqing Zeng and
                     Panagiotis Patros and Jason Luu and Kenneth B. Kent and
                     Vaughn Betz},
  title           = {{VTR 8}: High-performance {CAD} and Customizable
                     {FPGA} Architecture Modelling},
  journal         = j-TRETS,
  volume          = {13},
  number          = {2},
  pages           = {9:1--9:55},
  month           = jun,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3388617},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Thu Jun 11 15:19:14 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3388617},
  abstract        = {Developing Field-programmable Gate Array (FPGA)
                     architectures is challenging due to the competing
                     requirements of various application domains and
                     changing manufacturing process technology. This is
                     compounded by the difficulty of fairly evaluating FPGA
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {9},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Delomier:2020:MBD,
author = "Yann Delomier and Bertrand {Le Gal} and
J{\'e}r{\'e}mie Crenne and Christophe Jego",
title = "Model-based Design of Hardware {SC} Polar Decoders for
{FPGAs}",
journal = j-TRETS,
volume = "13",
number = "2",
pages = "10:1--10:27",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3391431",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Thu Jun 11 15:19:14 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3391431",
abstract = "Polar codes are a new error correction code family
that should be benchmarked and evaluated in comparison
to LDPC and turbo-codes. Indeed, recent advances in the
5G digital communication standard recommended the use
of polar codes in EMBB control \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "10",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Shao:2020:PGF,
  author          = {Zhiyuan Shao and Chenhao Liu and Ruoshi Li and Xiaofei
                     Liao and Hai Jin},
  title           = {Processing Grid-format Real-world Graphs on
                     {DRAM}-based {FPGA} Accelerators with
                     Application-specific Caching Mechanisms},
  journal         = j-TRETS,
  volume          = {13},
  number          = {3},
  pages           = {11:1--11:33},
  month           = sep,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3391920},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Sep 5 18:51:36 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3391920},
  abstract        = {Graph processing is one of the important research
                     topics in the big-data era. To build a general
                     framework for graph processing by using a DRAM-based
                     FPGA board with deep memory hierarchy, one of the
                     reasonable methods is to partition a given big graph
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {11},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Eldafrawy:2020:FLB,
  author          = {Mohamed Eldafrawy and Andrew Boutros and Sadegh
                     Yazdanshenas and Vaughn Betz},
  title           = {{FPGA} Logic Block Architectures for Efficient Deep
                     Learning Inference},
  journal         = j-TRETS,
  volume          = {13},
  number          = {3},
  pages           = {12:1--12:34},
  month           = sep,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3393668},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Sep 5 18:51:36 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3393668},
  abstract        = {Reducing the precision of deep neural network (DNN)
                     inference accelerators can yield large efficiency gains
                     with little or no accuracy degradation compared to half
                     or single precision floating-point by enabling more
                     multiplication operations per unit \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {12},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Mu:2020:OOB,
  author          = {Jiandong Mu and Wei Zhang and Hao Liang and Sharad
                     Sinha},
  title           = {Optimizing {OpenCL}-Based {CNN} Design on {FPGA} with
                     Comprehensive Design Space Exploration and
                     Collaborative Performance Modeling},
  journal         = j-TRETS,
  volume          = {13},
  number          = {3},
  pages           = {13:1--13:28},
  month           = sep,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3397514},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Sep 5 18:51:36 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3397514},
  abstract        = {Recent success in applying convolutional neural
                     networks (CNNs) to object detection and classification
                     has sparked great interest in accelerating CNNs using
                     hardware-like field-programmable gate arrays (FPGAs).
                     However, finding an efficient FPGA design \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {13},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Sabogal:2020:RFE,
author = "Sebastian Sabogal and Alan George and Christopher
Wilson",
title = "Reconfigurable Framework for Environmentally Adaptive
Resilience in Hybrid Space Systems",
journal = j-TRETS,
volume = "13",
number = "3",
pages = "14:1--14:32",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3398380",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Sep 5 18:51:36 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3398380",
abstract = "Due to ongoing innovations in both sensor technology
and spacecraft autonomy, onboard space processing
continues to be outpaced by the escalating
computational demands required for next-generation
missions. Commercial-off-the-shelf, hybrid
system-on-\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "14",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{La:2020:FMS,
author = "Tuan Minh La and Kaspar Matas and Nikola Grunchevski
and Khoa Dang Pham and Dirk Koch",
title = "{FPGADefender}: Malicious Self-oscillator Scanning for
{Xilinx UltraScale+} {FPGAs}",
journal = j-TRETS,
volume = "13",
number = "3",
pages = "15:1--15:31",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3402937",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Sep 5 18:51:36 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3402937",
abstract = "Sharing configuration bitstreams rather than netlists
is a very desirable feature to protect IP or to share
IP without longer CAD tool processing times.
Furthermore, an increasing number of systems could
hugely benefit from serving multiple users on the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "15",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Tang:2020:PSM,
  author          = {Qi Tang and Zhe Wang and Biao Guo and Li-Hua Zhu and
                     Ji-Bo Wei},
  title           = {Partitioning and Scheduling with Module Merging on
                     Dynamic Partial Reconfigurable {FPGAs}},
  journal         = j-TRETS,
  volume          = {13},
  number          = {3},
  pages           = {16:1--16:24},
  month           = sep,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3403702},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Sep 5 18:51:36 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3403702},
  abstract        = {Field programmable gate array (FPGA) is ubiquitous
                     nowadays and is applied to many areas. Dynamic partial
                     reconfiguration (DPR) is introduced to most modern
                     FPGAs, enabling changing the function of a part of the
                     FPGA by dynamically loading new \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {16},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Dehon:2020:ISS,
author = "Andr{\'e} DeHon",
title = "Introduction to Special Section on {FCCM 2019}",
journal = j-TRETS,
volume = "13",
number = "4",
pages = "17:1--17:2",
month = oct,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3410373",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Oct 2 07:58:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3410373",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "17",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Zhou:2020:AFR,
author = "Yun Zhou and Dries Vercruyce and Dirk Stroobandt",
title = "Accelerating {FPGA} Routing Through Algorithmic
Enhancements and Connection-aware Parallelization",
journal = j-TRETS,
volume = "13",
number = "4",
pages = "18:1--18:26",
month = oct,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3406959",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Oct 2 07:58:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3406959",
abstract = "Routing is a crucial step in Field Programmable Gate
Array (FPGA) physical design, as it determines the
routes of signals in the circuit, which impacts the
design implementation quality significantly. It can be
very time-consuming to successfully route \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "18",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Zhang:2020:MRB,
author = "Jialiang Zhang and Yue Zha and Nicholas Beckwith and
Bangya Liu and Jing Li",
title = "{MEG}: a {RISCV}-based System Emulation Infrastructure
for Near-data Processing Using {FPGAs} and
High-bandwidth Memory",
journal = j-TRETS,
volume = "13",
number = "4",
pages = "19:1--19:24",
month = oct,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3409114",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Oct 2 07:58:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3409114",
abstract = "Emerging three-dimensional (3D) memory technologies,
such as the Hybrid Memory Cube (HMC) and High Bandwidth
Memory (HBM), provide high-bandwidth and massive
memory-level parallelism. With the growing
heterogeneity and complexity of computer systems
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "19",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Vaishnav:2020:FMF,
author = "Anuj Vaishnav and Khoa Dang Pham and Joseph Powell and
Dirk Koch",
title = "{FOS}: a Modular {FPGA} Operating System for Dynamic
Workloads",
journal = j-TRETS,
volume = "13",
number = "4",
pages = "20:1--20:28",
month = oct,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3405794",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Oct 2 07:58:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3405794",
abstract = "With FPGAs now being deployed in the cloud and at the
edge, there is a need for scalable design methods that
can incorporate the heterogeneity present in the
hardware and software components of FPGA systems.
Moreover, these FPGA systems need to be \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "20",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Ioannou:2020:UNA,
author = "Aggelos D. Ioannou and Konstantinos Georgopoulos and
Pavlos Malakonakis and Dionisios N. Pnevmatikatos and
Vassilis D. Papaefstathiou and Ioannis Papaefstathiou
and Iakovos Mavroidis",
title = "{UNILOGIC}: a Novel Architecture for Highly Parallel
Reconfigurable Systems",
journal = j-TRETS,
volume = "13",
number = "4",
pages = "21:1--21:32",
month = oct,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3409115",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Oct 2 07:58:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3409115",
abstract = "One of the main characteristics of High-performance
Computing (HPC) applications is that they become
increasingly performance and power demanding, pushing
HPC systems to their limits. Existing HPC systems have
not yet reached exascale performance mainly \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "21",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Zhang:2021:CHP,
author = "Xuzhi Zhang and Xiaozhe Shao and George Provelengios
and Naveen Kumar Dumpala and Lixin Gao and Russell
Tessier",
title = "{CoNFV}: a Heterogeneous Platform for Scalable Network
Function Virtualization",
journal = j-TRETS,
volume = "14",
number = "1",
pages = "1:1--1:29",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3409113",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Jul 16 07:17:04 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
URL = "https://dl.acm.org/doi/10.1145/3409113",
abstract = "Network function virtualization (NFV) is a powerful
networking approach that leverages computing resources
to perform a time-varying set of network processing
functions. Although microprocessors can be used for
this purpose, their performance limitations and lack of
specialization present implementation challenges. In
this article, we describe a new heterogeneous
hardware-software NFV platform called CoNFV that
provides scalability and programmability while
supporting significant hardware-level parallelism and
reconfiguration. Our computing platform takes advantage
of both field-programmable gate arrays (FPGAs) and
microprocessors to implement numerous virtual network
functions (VNF) that can be dynamically customized to
specific network flow needs. The most distinctive
feature of our system is the use of global network
state to coordinate NFV operations. Traffic management
and hardware reconfiguration functions are performed by
a global coordinator that allows for the rapid sharing
of network function states and continuous evaluation of
network function needs. With the help of state sharing
mechanism offered by the coordinator, customer-defined
VNF instances can be easily migrated between
heterogeneous middleboxes as the network environment
changes. A resource allocation and scheduling algorithm
dynamically assesses resource deployments as network
flows and conditions are updated. We show that our
deployment algorithm can successfully reallocate FPGA
and microprocessor resources in a fraction of a second
in response to changes in network flow capacity and
network security threats including intrusion.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "1",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Beasley:2021:OCH,
author = "Alexander E. Beasley and C. T. Clarke and R. J.
Watson",
title = "An {OpenGL} Compliant Hardware Implementation of a
Graphic Processing Unit Using Field Programmable Gate
Array-System on Chip Technology",
journal = j-TRETS,
volume = "14",
number = "1",
pages = "2:1--2:24",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3410357",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Jul 16 07:17:04 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3410357",
abstract = "FPGA-SoC technology provides a heterogeneous platform
for advanced, high-performance systems. The System on
Chip (SoC) architecture combines traditional single and
multiple core processor topologies with flexible FPGA
fabric. Dynamic reconfiguration \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "2",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Kara:2021:PGC,
author = "Kaan Kara and Gustavo Alonso",
title = "{PipeArch}: Generic and Context-Switch Capable Data
Processing on {FPGAs}",
journal = j-TRETS,
volume = "14",
number = "1",
pages = "3:1--3:28",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3418465",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Jul 16 07:17:04 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3418465",
abstract = "Data processing systems based on FPGAs offer high
performance and energy efficiency for a variety of
applications. However, these advantages are achieved
through highly specialized designs. The high degree of
specialization leads to accelerators with \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "3",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Mohajer:2021:PUC,
author = "Soheil Mohajer and Zhiheng Wang and Kia Bazargan and
Yuyang Li",
title = "Parallel Unary Computing Based on Function
Derivatives",
journal = j-TRETS,
volume = "14",
number = "1",
pages = "4:1--4:25",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3418464",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Jul 16 07:17:04 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3418464",
abstract = "The binary number representation has dominated digital
logic for decades due to its compact storage
requirements. An alternative representation is the
unary number system: We use N bits, from which the
first M are 1 and the rest are 0 to represent the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "4",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Kyparissas:2021:LSC,
author = "Nikolaos Kyparissas and Apostolos Dollas",
title = "Large-scale Cellular Automata on {FPGAs}: a New
Generic Architecture and a Framework",
journal = j-TRETS,
volume = "14",
number = "1",
pages = "5:1--5:32",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3423185",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Jul 16 07:17:04 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3423185",
abstract = "Cellular automata (CA) are discrete mathematical
models discovered in the 1940s by John von Neumann and
Stanislaw Ulam and have been used extensively in many
scientific disciplines ever since. The present work
evolved from a Field Programmable Gate \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "5",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Peetermans:2021:DAC,
author = "Adriaan Peetermans and Vladimir Rozi{\'c} and Ingrid
Verbauwhede",
title = "Design and Analysis of Configurable Ring Oscillators
for True Random Number Generation Based on Coherent
Sampling",
journal = j-TRETS,
volume = "14",
number = "2",
pages = "7:1--7:20",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3433166",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Jul 16 07:17:05 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/prng.bib;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3433166",
abstract = "True Random Number Generators (TRNGs) are
indispensable in modern cryptosystems. Unfortunately,
to guarantee high entropy of the generated numbers,
many TRNG designs require a complex implementation
procedure, often involving manual placement and
routing. In this work, we introduce, analyse, and
compare three dynamic calibration mechanisms for the
COherent Sampling ring Oscillator based TRNG: GateVar,
WireVar, and LUTVar, enabling easy integration of the
entropy source into complex systems. The TRNG setup
procedure automatically selects a configuration that
guarantees the security requirements. In the
experiments, we show that two out of the three proposed
mechanisms are capable of assuring correct TRNG
operation even when an automatic placement is carried
out and when the design is ported to another
Field-Programmable Gate Array (FPGA) family. We
generated random bits on both a Xilinx Spartan 7 and a
Microsemi SmartFusion2 implementation that, without
post processing, passed the AIS-31 statistical tests at
a throughput of 4.65 Mbit/s and 1.47 Mbit/s,
respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "7",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Cho:2021:PMC,
author = "Shenghsun Cho and Mrunal Patel and Michael Ferdman and
Peter Milder",
title = "Practical Model Checking on {FPGAs}",
journal = j-TRETS,
volume = "14",
number = "2",
pages = "8:1--8:18",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3448272",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Jul 16 07:17:05 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3448272",
abstract = "Software verification is an important stage of the
software development process, particularly for
mission-critical systems. As the traditional
methodology of using unit tests falls short of
verifying complex software, developers are increasingly
relying \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "8",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Ma:2021:SFP,
author = "Rui Ma and Jia-Ching Hsu and Tian Tan and Eriko
Nurvitadhi and David Sheffield and Rob Pelt and Martin
Langhammer and Jaewoong Sim and Aravind Dasu and Derek
Chiou",
title = "Specializing {FGPU} for Persistent Deep Learning",
journal = j-TRETS,
volume = "14",
number = "2",
pages = "10:1--10:23",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3457886",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Jul 16 07:17:05 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3457886",
abstract = "Overlay architectures are a good way to enable fast
development and debug on FPGAs at the expense of
potentially limited performance compared to fully
customized FPGA designs. When used in concert with
hand-tuned FPGA solutions, performant overlay
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "10",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Zhou:2021:SHC,
author = "Zhen Zhou and Debiao He and Zhe Liu and Min Luo and
Kim-Kwang Raymond Choo",
title = "A Software\slash Hardware Co-Design of
{Crystals-Dilithium} Signature Scheme",
journal = j-TRETS,
volume = "14",
number = "2",
pages = "11:1--11:21",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3447812",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Jul 16 07:17:05 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3447812",
abstract = "As quantum computers become more affordable and
commonplace, existing security systems that are based
on classical cryptographic primitives, such as RSA and
Elliptic Curve Cryptography (ECC), will no longer be
secure. Hence, there has been interest in designing
post-quantum cryptographic (PQC) schemes, such as those
based on lattice-based cryptography (LBC). The
potential of LBC schemes is evidenced by the number of
such schemes passing the selection of NIST PQC
Standardization Process Round-3. One such scheme is the
Crystals-Dilithium signature scheme, which is based on
the hard module-lattice problem. However, there is no
efficient implementation of the Crystals-Dilithium
signature scheme. Hence, in this article, we present a
compact hardware architecture containing elaborate
modular multiplication units using the Karatsuba
algorithm along with smart generators of address
sequence and twiddle factors for NTT, which can
complete polynomial addition/multiplication with the
parameter setting of Dilithium in a short clock period.
Also, we propose a fast software/hardware co-design
implementation on Field Programmable Gate Array (FPGA)
for the Dilithium scheme with a tradeoff between speed
and resource utilization. Our co-design implementation
outperforms a pure C implementation on a Nios-II
processor of the platform Altera DE2-115, in the sense
that our implementation is 11.2 and 7.4 times faster
for signature and verification, respectively. In
addition, we also achieve approximately 51\% and 31\%
speed improvement for signature and verification, in
comparison to the pure C implementation on processor
ARM Cortex-A9 of ZYNQ-7020 platform.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "11",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Yasudo:2021:APE,
author = "Ryota Yasudo and Jos{\'e} G. F. Coutinho and Ana-Lucia
Varbanescu and Wayne Luk and Hideharu Amano and Tobias
Becker and Ce Guo",
title = "Analytical Performance Estimation for Large-Scale
Reconfigurable Dataflow Platforms",
journal = j-TRETS,
volume = "14",
number = "3",
pages = "12:1--12:21",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3452742",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Aug 21 07:50:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3452742",
abstract = "Next-generation high-performance computing platforms
will handle extreme data- and compute-intensive
problems that are intractable with today's technology.
A promising path in achieving the next leap in
high-performance computing is to embrace \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "12",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Taka:2021:PVA,
author = "Endri Taka and Konstantinos Maragos and George
Lentaris and Dimitrios Soudris",
title = "Process Variability Analysis in Interconnect, Logic,
and Arithmetic Blocks of 16-nm {FinFET FPGAs}",
journal = j-TRETS,
volume = "14",
number = "3",
pages = "13:1--13:30",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3458843",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Aug 21 07:50:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3458843",
abstract = "In the current work, we study the process variability
of logic, interconnect, and arithmetic/DSP resources in
commercial 16-nm FPGAs. We create multiple, soft-macro
sensors for each distinct resource under evaluation,
and we deploy them across the FPGA \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "13",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Sasongko:2021:HCS,
author = "Arif Sasongko and I. M. Narendra Kumara and Arief
Wicaksana and Fr{\'e}d{\'e}ric Rousseau and Olivier
Muller",
title = "Hardware Context Switch-based Cryptographic
Accelerator for Handling Multiple Streams",
journal = j-TRETS,
volume = "14",
number = "3",
pages = "14:1--14:25",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3460941",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Aug 21 07:50:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3460941",
abstract = "The confidentiality and integrity of a stream has
become one of the biggest issues in telecommunication.
The best available algorithm handling the
confidentiality of a data stream is the symmetric key
block cipher combined with a chaining mode of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "14",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Reggiani:2021:ESM,
author = "Enrico Reggiani and Emanuele {Del Sozzo} and Davide
Conficconi and Giuseppe Natale and Carlo Moroni and
Marco D. Santambrogio",
title = "Enhancing the Scalability of Multi-{FPGA} Stencil
Computations via Highly Optimized {HDL} Components",
journal = j-TRETS,
volume = "14",
number = "3",
pages = "15:1--15:33",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3461478",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Aug 21 07:50:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3461478",
abstract = "Stencil-based algorithms are a relevant class of
computational kernels in high-performance systems, as
they appear in a plethora of fields, from image
processing to seismic simulations, from numerical
methods to physical modeling. Among the various
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "15",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Al-Hyari:2021:DLF,
author = "Abeer Al-Hyari and Hannah Szentimrey and Ahmed Shamli
and Timothy Martin and Gary Gr{\'e}wal and Shawki
Areibi",
title = "A Deep Learning Framework to Predict Routability for
{FPGA} Circuit Placement",
journal = j-TRETS,
volume = "14",
number = "3",
pages = "16:1--16:28",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3465373",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Sat Aug 21 07:50:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3465373",
abstract = "The ability to accurately and efficiently estimate the
routability of a circuit based on its placement is one
of the most challenging and difficult tasks in the
Field Programmable Gate Array (FPGA) flow. In this
article, we present a novel, deep learning \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "16",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Lai:2021:PSS,
author = "Yi-Hsiang Lai and Ecenur Ustun and Shaojie Xiang and
Zhenman Fang and Hongbo Rong and Zhiru Zhang",
title = "Programming and Synthesis for Software-defined {FPGA}
Acceleration: Status and Future Prospects",
journal = j-TRETS,
volume = "14",
number = "4",
pages = "17:1--17:39",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3469660",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Sep 21 07:21:30 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3469660",
abstract = "FPGA-based accelerators are increasingly popular
across a broad range of applications, because they
offer massive parallelism, high energy efficiency, and
great flexibility for customizations. However,
difficulties in programming and integrating FPGAs
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "17",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Yang:2021:BWB,
author = "Tao Yang and Zhezhi He and Tengchuan Kou and Qingzheng
Li and Qi Han and Haibao Yu and Fangxin Liu and Yun
Liang and Li Jiang",
title = "{BISWSRBS}: a {Winograd}-based {CNN} Accelerator with
a Fine-grained Regular Sparsity Pattern and Mixed
Precision Quantization",
journal = j-TRETS,
volume = "14",
number = "4",
pages = "18:1--18:28",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3467476",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Sep 21 07:21:30 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3467476",
abstract = "Field-programmable Gate Array (FPGA) is a
high-performance computing platform for Convolution
Neural Networks (CNNs) inference. Winograd algorithm,
weight pruning, and quantization are widely adopted to
reduce the storage and arithmetic overhead of CNNs
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "18",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Wijtvliet:2021:CER,
author = "Mark Wijtvliet and Henk Corporaal and Akash Kumar",
title = "{CGRA-EAM}---Rapid Energy and Area Estimation for
Coarse-grained Reconfigurable Architectures",
journal = j-TRETS,
volume = "14",
number = "4",
pages = "19:1--19:28",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3468874",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Sep 21 07:21:30 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3468874",
abstract = "Reconfigurable architectures are quickly gaining in
popularity due to their flexibility and ability to
provide high energy efficiency. However, reconfigurable
systems allow for a huge design space. Iterative design
space exploration (DSE) is often \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "19",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Gu:2021:DGB,
author = "Zhenghua Gu and Wenqing Wan and Jundong Xie and Chang
Wu",
title = "Dependency Graph-based High-level Synthesis for
Maximum Instruction Parallelism",
journal = j-TRETS,
volume = "14",
number = "4",
pages = "20:1--20:15",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3468875",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Sep 21 07:21:30 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3468875",
abstract = "Performance optimization is an important goal for
High-level Synthesis (HLS). Existing HLS scheduling
algorithms are all based on Control and Data Flow Graph
(CDFG) and will schedule basic blocks in sequential
order. Our study shows that the sequential \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "20",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Hung:2021:AGF,
author = "Jos{\'e} Romero Hung and Chao Li and Pengyu Wang and
Chuanming Shao and Jinyang Guo and Jing Wang and
Guoyong Shi",
title = "{ACE-GCN}: a Fast Data-driven {FPGA} Accelerator for
{GCN} Embedding",
journal = j-TRETS,
volume = "14",
number = "4",
pages = "21:1--21:23",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3470536",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Sep 21 07:21:30 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3470536",
abstract = "ACE-GCN is a fast and resource/energy-efficient FPGA
accelerator for graph convolutional embedding under
data-driven and in-place processing conditions. Our
accelerator exploits the inherent power law
distribution and high sparsity commonly exhibited by
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "21",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Sabogal:2021:RFR,
author = "Sebastian Sabogal and Alan George and Gary Crum",
title = "Reconfigurable Framework for Resilient Semantic
Segmentation for Space Applications",
journal = j-TRETS,
volume = "14",
number = "4",
pages = "22:1--22:32",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3472770",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Sep 21 07:21:30 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3472770",
abstract = "Deep learning (DL) presents new opportunities for
enabling spacecraft autonomy, onboard analysis, and
intelligent applications for space missions. However,
DL applications are computationally intensive and often
infeasible to deploy on radiation-hardened \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "22",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Shannon:2022:ISS,
author = "Lesley Shannon",
title = "Introduction to Special Section on {FPGA 2020}",
journal = j-TRETS,
volume = "15",
number = "1",
pages = "1:1--1:2",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3485586",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Jan 28 07:03:50 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3485586",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "1",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Rybalkin:2022:WMG,
author = "Vladimir Rybalkin and Jonas Ney and Menbere Kina
Tekleyohannes and Norbert Wehn",
title = "When Massive {GPU} Parallelism Ain't Enough: a Novel
Hardware Architecture of {$2$D-LSTM} Neural Network",
journal = j-TRETS,
volume = "15",
number = "1",
pages = "2:1--2:35",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3469661",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Jan 28 07:03:50 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3469661",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "2",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@article{Papaphilippou:2022:HHP,
  author          = {Philippos Papaphilippou and Jiuxi Meng and
    Nadeen Gebara and Wayne Luk},
  title           = {{Hipernetch}: High-Performance {FPGA} Network Switch},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = mar,
  volume          = 15,
  number          = 1,
  articleno       = {3},
  pages           = {3:1--3:31},
  doi             = {https://doi.org/10.1145/3477054},
  url             = {https://dl.acm.org/doi/10.1145/3477054},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Fri Jan 28 07:03:50 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {We present Hipernetch, a novel FPGA-based design
    for performing high-bandwidth network switching. FPGAs have recently
    become more popular in data centers due to their promising
    capabilities for a wide range of applications. With the recent surge
    in \ldots{}},
}
@article{Josipovic:2022:BPS,
  author          = {Lana Josipovi{\'c} and Shabnam Sheikhha and
    Andrea Guerrieri and Paolo Ienne and Jordi Cortadella},
  title           = {Buffer Placement and Sizing for High-Performance
    Dataflow Circuits},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = mar,
  volume          = 15,
  number          = 1,
  articleno       = {4},
  pages           = {4:1--4:32},
  doi             = {https://doi.org/10.1145/3477053},
  url             = {https://dl.acm.org/doi/10.1145/3477053},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Fri Jan 28 07:03:50 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Commercial high-level synthesis tools typically
    produce statically scheduled circuits. Yet, effective C-to-circuit
    conversion of arbitrary software applications calls for dataflow
    circuits, as they can handle efficiently variable latencies
    (e.g., caches), \ldots{}},
}
@article{Gross:2022:ESF,
  author          = {Mathieu Gross and Konrad Hohentanner and
    Stefan Wiehler and Georg Sigl},
  title           = {Enhancing the Security of {FPGA-SoCs} via the
    Usage of {ARM TrustZone} and a Hybrid-{TPM}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = mar,
  volume          = 15,
  number          = 1,
  articleno       = {5},
  pages           = {5:1--5:26},
  doi             = {https://doi.org/10.1145/3472959},
  url             = {https://dl.acm.org/doi/10.1145/3472959},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Fri Jan 28 07:03:50 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Isolated execution is a concept commonly used for
    increasing the security of a computer system. In the embedded world,
    ARM TrustZone technology enables this goal and is currently used on
    mobile devices for applications such as secure payment or biometric
    \ldots{}},
}
@article{Wu:2022:LPF,
  author          = {Chen Wu and Mingyu Wang and Xinyuan Chu and
    Kun Wang and Lei He},
  title           = {Low-precision Floating-point Arithmetic for
    High-performance {FPGA}-based {CNN} Acceleration},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = mar,
  volume          = 15,
  number          = 1,
  articleno       = {6},
  pages           = {6:1--6:21},
  doi             = {https://doi.org/10.1145/3474597},
  url             = {https://dl.acm.org/doi/10.1145/3474597},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Fri Jan 28 07:03:50 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/fparith.bib;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}
@article{Chen:2022:NTE,
  author          = {Deming Chen},
  title           = {Note from the {TRETS EiC} about the new
    Journal-first track in {FPT'21}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = mar,
  volume          = 15,
  number          = 1,
  articleno       = {7e},
  pages           = {7e:1--7e:1},
  doi             = {https://doi.org/10.1145/3501280},
  url             = {https://dl.acm.org/doi/10.1145/3501280},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Fri Jan 28 07:03:50 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}
@article{Streit:2022:DET,
  author          = {Franz-Josef Streit and Paul Kr{\"u}ger and
    Andreas Becher and Stefan Wildermann and J{\"u}rgen Teich},
  title           = {Design and Evaluation of a Tunable {PUF}
    Architecture for {FPGAs}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = mar,
  volume          = 15,
  number          = 1,
  articleno       = {7},
  pages           = {7:1--7:27},
  doi             = {https://doi.org/10.1145/3491237},
  url             = {https://dl.acm.org/doi/10.1145/3491237},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Fri Jan 28 07:03:50 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {FPGA-based Physical Unclonable Functions (PUF)
    have emerged as a viable alternative to permanent key storage by
    turning effects of inaccuracies during the manufacturing process of
    a chip into a unique, FPGA-intrinsic secret. However, many fixed PUF
    \ldots{}},
}
@article{Zhou:2022:ROS,
  author          = {Yun Zhou and Pongstorn Maidee and Chris Lavin and
    Alireza Kaviani and Dirk Stroobandt},
  title           = {{RWRoute}: an Open-source Timing-driven Router
    for Commercial {FPGAs}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = mar,
  volume          = 15,
  number          = 1,
  articleno       = {8},
  pages           = {8:1--8:27},
  doi             = {https://doi.org/10.1145/3491236},
  url             = {https://dl.acm.org/doi/10.1145/3491236},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Fri Jan 28 07:03:50 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {One of the key obstacles to pervasive deployment
    of FPGA accelerators in data centers is their cumbersome programming
    model. Open source tooling is suggested as a way to develop
    alternative EDA tools to remedy this issue. Open source FPGA CAD
    tools have \ldots{}},
}
@article{Rasoulinezhad:2022:REB,
  author          = {Seyedramin Rasoulinezhad and Esther Roorda and
    Steve Wilton and Philip H. W. Leong and David Boland},
  title           = {Rethinking Embedded Blocks for Machine Learning
    Applications},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = mar,
  volume          = 15,
  number          = 1,
  articleno       = {9},
  pages           = {9:1--9:30},
  doi             = {https://doi.org/10.1145/3491234},
  url             = {https://dl.acm.org/doi/10.1145/3491234},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Fri Jan 28 07:03:50 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {The underlying goal of FPGA architecture research
    is to devise flexible substrates that implement a wide variety of
    circuits efficiently. Contemporary FPGA architectures have been
    optimized to support networking, signal processing, and image
    processing \ldots{}},
}
@article{Menzel:2022:SSA,
  author          = {Johannes Menzel and Christian Plessl and
    Tobias Kenter},
  title           = {The Strong Scaling Advantage of {FPGAs} in {HPC}
    for {$N$}-body Simulations},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = mar,
  volume          = 15,
  number          = 1,
  articleno       = {10},
  pages           = {10:1--10:30},
  doi             = {https://doi.org/10.1145/3491235},
  url             = {https://dl.acm.org/doi/10.1145/3491235},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Fri Jan 28 07:03:50 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {N-body methods are one of the essential
    algorithmic building blocks of high-performance and parallel
    computing. Previous research has shown promising performance for
    implementing n-body simulations with pairwise force calculations on
    FPGAs. However, to \ldots{}},
}
@article{Eguro:2022:ISIa,
  author          = {Ken Eguro and Stephen Neuendorffer and
    Viktor Prasanna and Hongbo Rong},
  title           = {Introduction to Special Issue on {FPGAs} in Data
    Centers},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = jun,
  volume          = 15,
  number          = 2,
  articleno       = {11},
  pages           = {11:1--11:2},
  doi             = {https://doi.org/10.1145/3493607},
  url             = {https://dl.acm.org/doi/10.1145/3493607},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Wed Mar 2 08:59:34 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}
@article{Keller:2022:ITR,
  author          = {Andrew M. Keller and Michael J. Wirthlin},
  title           = {The Impact of Terrestrial Radiation on {FPGAs} in
    Data Centers},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = jun,
  volume          = 15,
  number          = 2,
  articleno       = {12},
  pages           = {12:1--12:21},
  doi             = {https://doi.org/10.1145/3457198},
  url             = {https://dl.acm.org/doi/10.1145/3457198},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Wed Mar 2 08:59:34 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Field programmable gate arrays (FPGAs) are used
    in large numbers in data centers around the world. They are used for
    cloud computing and computer networking. The most common type of
    FPGA used in data centers are re-programmable SRAM-based FPGAs.
    These \ldots{}},
}
@article{Asiatici:2022:RCS,
  author          = {Mikhail Asiatici and Paolo Ienne},
  title           = {Request, Coalesce, Serve, and Forget:
    Miss-Optimized Memory Systems for Bandwidth-Bound Cache-Unfriendly
    Applications on {FPGAs}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = jun,
  volume          = 15,
  number          = 2,
  articleno       = {13},
  pages           = {13:1--13:33},
  doi             = {https://doi.org/10.1145/3466823},
  url             = {https://dl.acm.org/doi/10.1145/3466823},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Wed Mar 2 08:59:34 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Applications such as large-scale sparse linear
    algebra and graph analytics are challenging to accelerate on FPGAs
    due to the short irregular memory accesses, resulting in low cache
    hit rates. Nonblocking caches reduce the bandwidth required by
    misses by \ldots{}},
}
@article{Dogan:2022:CBB,
  author          = {Atakan Dogan and Kemal Ebcioglu},
  title           = {Cloud Building Block Chip for Creating {FPGA} and
    {ASIC} Clouds},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = jun,
  volume          = 15,
  number          = 2,
  articleno       = {14},
  pages           = {14:1--14:35},
  doi             = {https://doi.org/10.1145/3466822},
  url             = {https://dl.acm.org/doi/10.1145/3466822},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Wed Mar 2 08:59:34 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Hardware-accelerated cloud computing systems
    based on FPGA chips (FPGA cloud) or ASIC chips (ASIC cloud) have
    emerged as a new technology trend for power-efficient acceleration
    of various software applications. However, the operating systems and
    \ldots{}},
}
@article{Alonso:2022:EDS,
  author          = {Tobias Alonso and Lucian Petrica and Mario Ruiz
    and Jakoba Petri-Koenig and Yaman Umuroglu and Ioannis Stamelos and
    Elias Koromilas and Michaela Blott and Kees Vissers},
  title           = {{Elastic-DF}: Scaling Performance of {DNN}
    Inference in {FPGA} Clouds through Automatic Partitioning},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = jun,
  volume          = 15,
  number          = 2,
  articleno       = {15},
  pages           = {15:1--15:34},
  doi             = {https://doi.org/10.1145/3470567},
  url             = {https://dl.acm.org/doi/10.1145/3470567},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Wed Mar 2 08:59:34 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Customized compute acceleration in the datacenter
    is key to the wider roll-out of applications based on deep neural
    network (DNN) inference. In this article, we investigate how to
    maximize the performance and scalability of field-programmable gate
    array \ldots{}},
}
@article{Salamat:2022:NGN,
  author          = {Sahand Salamat and Hui Zhang and Yang Seok Ki and
    Tajana Rosing},
  title           = {\pkg{NASCENT2}: Generic Near-Storage Sort
    Accelerator for Data Analytics on {SmartSSD}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = jun,
  volume          = 15,
  number          = 2,
  articleno       = {16},
  pages           = {16:1--16:29},
  doi             = {https://doi.org/10.1145/3472769},
  url             = {https://dl.acm.org/doi/10.1145/3472769},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Wed Mar 2 08:59:34 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {As the size of data generated every day grows
    dramatically, the computational bottleneck of computer systems has
    shifted toward storage devices. The interface between the storage
    and the computational platforms has become the main limitation due
    to its \ldots{}},
}
@article{Damiani:2022:BFS,
  author          = {Andrea Damiani and Giorgia Fiscaletti and
    Marco Bacis and Rolando Brondolin and Marco D. Santambrogio},
  title           = {\pkg{BlastFunction}: a Full-stack Framework
    Bringing {FPGA} Hardware Acceleration to Cloud-native Applications},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = jun,
  volume          = 15,
  number          = 2,
  articleno       = {17},
  pages           = {17:1--17:27},
  doi             = {https://doi.org/10.1145/3472958},
  url             = {https://dl.acm.org/doi/10.1145/3472958},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Wed Mar 2 08:59:34 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {``Cloud-native'' is the umbrella adjective
    describing the standard approach for developing applications that
    exploit cloud infrastructures' scalability and elasticity at their
    best. As the application complexity and user-bases grow, designing
    for \ldots{}},
}
@article{DAlberto:2022:XID,
  author          = {Paolo D'Alberto and Victor Wu and Aaron Ng and
    Rahul Nimaiyar and Elliott Delaye and Ashish Sirasao},
  title           = {\pkg{xDNN}: Inference for Deep Convolutional
    Neural Networks},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = jun,
  volume          = 15,
  number          = 2,
  articleno       = {18},
  pages           = {18:1--18:29},
  doi             = {https://doi.org/10.1145/3473334},
  url             = {https://dl.acm.org/doi/10.1145/3473334},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Wed Mar 2 08:59:34 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {We present xDNN, an end-to-end system for
    deep-learning inference based on a family of specialized hardware
    processors synthesized on Field-Programmable Gate Array (FPGAs) and
    Convolution Neural Networks (CNN). We present a design optimized for
    low \ldots{}},
}
@article{Mbongue:2022:DMT,
  author          = {Joel Mandebi Mbongue and Danielle Tchuinkou
    Kwadjo and Alex Shuping and Christophe Bobda},
  title           = {Deploying Multi-tenant {FPGAs} within
    {Linux}-based Cloud Infrastructure},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = jun,
  volume          = 15,
  number          = 2,
  articleno       = {19},
  pages           = {19:1--19:31},
  doi             = {https://doi.org/10.1145/3474058},
  url             = {https://dl.acm.org/doi/10.1145/3474058},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Wed Mar 2 08:59:34 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/linux.bib;
    https://www.math.utah.edu/pub/tex/bib/trets.bib;
    https://www.math.utah.edu/pub/tex/bib/unix.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Cloud deployments now increasingly exploit
    Field-Programmable Gate Array (FPGA) accelerators as part of virtual
    instances. While cloud FPGAs are still essentially single-tenant,
    the growing demand for efficient hardware acceleration paves the way
    to FPGA \ldots{}},
}
@article{Hogervorst:2022:HAH,
  author          = {Tom Hogervorst and Razvan Nane and
    Giacomo Marchiori and Tong Dong Qiu and Markus Blatt and
    Alf Birger Rustad},
  title           = {Hardware Acceleration of High-Performance
    Computational Flow Dynamics Using High-Bandwidth Memory-Enabled
    Field-Programmable Gate Arrays},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = jun,
  volume          = 15,
  number          = 2,
  articleno       = {20},
  pages           = {20:1--20:35},
  doi             = {https://doi.org/10.1145/3476229},
  url             = {https://dl.acm.org/doi/10.1145/3476229},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Wed Mar 2 08:59:34 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Scientific computing is at the core of many
    High-Performance Computing applications, including computational
    flow dynamics. Because of the utmost importance to simulate
    increasingly larger computational models, hardware acceleration is
    receiving increased \ldots{}},
}
@article{Sun:2022:BEC,
  author          = {Gongjin Sun and Seongyoung Kang and Sang-Woo Jun},
  title           = {\pkg{BurstZ+}: Eliminating The Communication
    Bottleneck of Scientific Computing Accelerators via Accelerated
    Compression},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = jun,
  volume          = 15,
  number          = 2,
  articleno       = {21},
  pages           = {21:1--21:34},
  doi             = {https://doi.org/10.1145/3476831},
  url             = {https://dl.acm.org/doi/10.1145/3476831},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Wed Mar 2 08:59:34 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {We present BurstZ+, an accelerator platform that
    eliminates the communication bottleneck between PCIe-attached
    scientific computing accelerators and their host servers, via
    hardware-optimized compression. While accelerators such as GPUs and
    FPGAs provide \ldots{}},
}
@article{Eguro:2022:ISIb,
  author          = {Ken Eguro and Stephen Neuendorffer and
    Viktor Prasanna and Hongbo Rong},
  title           = {Introduction to Special Issue on {FPGAs} in Data
    Centers, {Part II}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = sep,
  volume          = 15,
  number          = 3,
  articleno       = {22},
  pages           = {22:1--22:2},
  doi             = {https://doi.org/10.1145/3495231},
  url             = {https://dl.acm.org/doi/10.1145/3495231},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Tue May 24 07:29:32 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}
@article{Tarafdar:2022:AOF,
  author          = {Naif Tarafdar and Giuseppe {Di Guglielmo} and
    Philip C. Harris and Jeffrey D. Krupa and Vladimir Loncar and
    Dylan S. Rankin and Nhan Tran and Zhenbin Wu and Qianfeng Shen and
    Paul Chow},
  title           = {{AIgean}: an Open Framework for Deploying Machine
    Learning on Heterogeneous Clusters},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = sep,
  volume          = 15,
  number          = 3,
  articleno       = {23},
  pages           = {23:1--23:32},
  doi             = {https://doi.org/10.1145/3482854},
  url             = {https://dl.acm.org/doi/10.1145/3482854},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Tue May 24 07:29:32 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {AIgean, pronounced like the sea, is an open
    framework to build and deploy machine learning (ML) algorithms on a
    heterogeneous cluster of devices (CPUs and FPGAs). We leverage two
    open source projects: Galapagos, for multi-FPGA deployment, and
    hls4ml, for \ldots{}},
}
@article{Zeng:2022:UFV,
  author          = {Shulin Zeng and Guohao Dai and Hanbo Sun and
    Jun Liu and Shiyao Li and Guangjun Ge and Kai Zhong and Kaiyuan Guo
    and Yu Wang and Huazhong Yang},
  title           = {A Unified {FPGA} Virtualization Framework for
    General-Purpose Deep Neural Networks in the Cloud},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = sep,
  volume          = 15,
  number          = 3,
  articleno       = {24},
  pages           = {24:1--24:31},
  doi             = {https://doi.org/10.1145/3480170},
  url             = {https://dl.acm.org/doi/10.1145/3480170},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Tue May 24 07:29:32 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib;
    https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib},
  acknowledgement = ack-nhfb,
  abstract        = {INFerence-as-a-Service (INFaaS) has become a
    primary workload in the cloud. However, existing FPGA-based Deep
    Neural Network (DNN) accelerators are mainly optimized for the
    fastest speed of a single task, while the multi-tenancy of INFaaS
    has not been explored yet. As the demand for INFaaS keeps growing,
    simply increasing the number of FPGA-based DNN accelerators is not
    cost-effective, while merely sharing these single-task optimized DNN
    accelerators in a time-division multiplexing way could lead to poor
    isolation and high-performance loss for INFaaS. On the other hand,
    current cloud-based DNN accelerators have excessive compilation
    overhead, especially when scaling out to multi-FPGA systems for
    multi-tenant sharing, leading to unacceptable compilation costs for
    both offline deployment and online reconfiguration. Therefore, it is
    far from providing efficient and flexible FPGA virtualization for
    public and private cloud scenarios.\par
    Aiming to solve these problems, we propose a unified virtualization
    framework for general-purpose deep neural networks in the cloud,
    enabling multi-tenant sharing for both the Convolution Neural
    Network (CNN), and the Recurrent Neural Network (RNN) accelerators
    on a single FPGA. The isolation is enabled by introducing a
    two-level instruction dispatch module and a multi-core based
    hardware resources pool. Such designs provide isolated and
    runtime-programmable hardware resources, which further leads to
    performance isolation for multi-tenant sharing. On the other hand,
    to overcome the heavy re-compilation overheads, a tiling-based
    instruction frame package design and a two-stage static-dynamic
    compilation, are proposed. Only the lightweight runtime information
    is re-compiled with $ \approx $1 ms overhead, thus guaranteeing the
    private cloud's performance. Finally, the extensive experimental
    results show that the proposed virtualized solutions achieve up to
    $ 3.12 \times $ and $ 6.18 \times $ higher throughput in the private
    cloud compared with the static CNN and RNN baseline designs,
    respectively.},
}
@article{Alachiotis:2022:SPR,
  author          = {Nikolaos Alachiotis and Panagiotis Skrimponis and
    Manolis Pissadakis and Dionisios Pnevmatikatos},
  title           = {Scalable Phylogeny Reconstruction with
    Disaggregated Near-memory Processing},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = sep,
  volume          = 15,
  number          = 3,
  articleno       = {25},
  pages           = {25:1--25:32},
  doi             = {https://doi.org/10.1145/3484983},
  url             = {https://dl.acm.org/doi/10.1145/3484983},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Tue May 24 07:29:32 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Disaggregated computer architectures eliminate
    resource fragmentation in next-generation datacenters by enabling
    virtual machines to employ resources such as CPUs, memory, and
    accelerators that are physically located on different servers. While
    this paves \ldots{}},
}
@article{Brennsteiner:2022:RTD,
  author          = {Stefan Brennsteiner and Tughrul Arslan and
    John Thompson and Andrew McCormick},
  title           = {A Real-Time Deep Learning {OFDM} Receiver},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = sep,
  volume          = 15,
  number          = 3,
  articleno       = {26},
  pages           = {26:1--26:25},
  doi             = {https://doi.org/10.1145/3494049},
  url             = {https://dl.acm.org/doi/10.1145/3494049},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Tue May 24 07:29:32 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Machine learning in the physical layer of
    communication systems holds the potential to improve performance and
    simplify design methodology. Many algorithms have been proposed;
    however, the model complexity is often unfeasible for real-time
    deployment. The \ldots{}},
}
@article{Lienen:2022:DDR,
  author          = {Christian Lienen and Marco Platzner},
  title           = {Design of Distributed Reconfigurable Robotics
    Systems with {ReconROS}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = sep,
  volume          = 15,
  number          = 3,
  articleno       = {27},
  pages           = {27:1--27:20},
  doi             = {https://doi.org/10.1145/3494571},
  url             = {https://dl.acm.org/doi/10.1145/3494571},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Tue May 24 07:29:32 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Robotics applications process large amounts of
    data in real time and require compute platforms that provide high
    performance and energy efficiency. FPGAs are well suited for many of
    these applications, but there is a reluctance in the robotics
    community \ldots{}},
}
@article{Cahill:2022:AFD,
  author          = {Eli Cahill and Brad Hutchings and Jeffrey Goeders},
  title           = {Approaches for {FPGA} Design Assurance},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = sep,
  volume          = 15,
  number          = 3,
  articleno       = {28},
  pages           = {28:1--28:29},
  doi             = {https://doi.org/10.1145/3491233},
  url             = {https://dl.acm.org/doi/10.1145/3491233},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Tue May 24 07:29:32 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Field-Programmable Gate Arrays (FPGAs) are widely
    used for custom hardware implementations, including in many
    security-sensitive industries, such as defense, communications,
    transportation, medical, and more. Compiling source hardware
    descriptions to FPGA \ldots{}},
}
@article{Faraji:2022:ACC,
  author          = {S. Rasoul Faraji and Pierre Abillama and
    Kia Bazargan},
  title           = {Approximate Constant-Coefficient Multiplication
    Using Hybrid Binary-Unary Computing for {FPGAs}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  year            = 2022,
  month           = sep,
  volume          = 15,
  number          = 3,
  articleno       = {29},
  pages           = {29:1--29:25},
  doi             = {https://doi.org/10.1145/3494570},
  url             = {https://dl.acm.org/doi/10.1145/3494570},
  journal-url     = {https://dl.acm.org/loi/trets},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  bibdate         = {Tue May 24 07:29:32 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Multipliers are used in virtually all Digital
    Signal Processing (DSP) applications such as image and video
    processing. Multiplier efficiency has a direct impact on the overall
    performance of such applications, especially when real-time
    processing is \ldots{}},
}
@Article{Du:2022:BAB,
  author          = {Gaoming Du and Bangyi Chen and Zhenmin Li and Zhenxing
                     Tu and Junjie Zhou and Shenya Wang and Qinghao Zhao and
                     Yongsheng Yin and Xiaolei Wang},
  title           = {A {BNN} Accelerator Based on Edge-skip-calculation
                     Strategy and Consolidation Compressed Tree},
  journal         = j-TRETS,
  volume          = {15},
  number          = {3},
  pages           = {30:1--30:20},
  month           = sep,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3494569},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue May 24 07:29:32 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3494569},
  abstract        = {Binarized neural networks (BNNs) and batch
                     normalization (BN) have already become typical
                     techniques in artificial intelligence today.
                     Unfortunately, the massive accumulation and
                     multiplication in BNN models bring challenges to
                     field-programmable gate \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {30},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Dewald:2022:ILP,
  author          = {Florian Dewald and Johanna Rohde and Christian
                     Hochberger and Heiko Mantel},
  title           = {Improving Loop Parallelization by a Combination of
                     Static and Dynamic Analyses in {HLS}},
  journal         = j-TRETS,
  volume          = {15},
  number          = {3},
  pages           = {31:1--31:31},
  month           = sep,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3501801},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue May 24 07:29:32 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3501801},
  abstract        = {High-level synthesis (HLS) can be used to create
                     hardware accelerators for compute-intense software
                     parts such as loop structures. Usually, this process
                     requires significant amount of user interaction to
                     steer kernel selection and optimizations. This can
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {31},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Matthews:2022:QDR,
  author          = {Eric Matthews and Alec Lu and Zhenman Fang and Lesley
                     Shannon},
  title           = {{Quick-Div}: Rethinking Integer Divider Design for
                     {FPGA}-based Soft-processors},
  journal         = j-TRETS,
  volume          = {15},
  number          = {3},
  pages           = {32:1--32:27},
  month           = sep,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3502492},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue May 24 07:29:32 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3502492},
  abstract        = {In today's FPGA-based soft-processors, one of the
                     slowest instructions is integer division. Compared to
                     the low single-digit latency of other arithmetic
                     operations, the fixed 32-cycle latency of radix-2
                     division is substantially longer. Given that today's
                     soft-processors typically only implement radix-2
                     division --- if they support hardware division at all
                     --- there is significant potential to improve the
                     performance of integer dividers.\par
                     In this work, we present a set of high-performance,
                     data-dependent, variable-latency integer dividers for
                     FPGA-based soft-processors that we call Quick-Div. We
                     compare them to various radix-N dividers and provide a
                     thorough analysis in terms of latency and resource
                     usage. In addition, we analyze the frequency scaling
                     for such divider designs when (1) treated as a
                     stand-alone unit and (2) integrated as part of a
                     high-performance soft-processor. Moreover, we provide
                     additional theoretical analysis of different dividers'
                     behaviour and develop a new better-performing Quick-Div
                     variant, called Quick-radix-4. Experimental results
                     show that our Quick-radix-4 design can achieve up to $
                     6.8 \times $ better performance and $ 6.1 \times $
                     better performance-per-LUT over the radix-2 divider for
                     applications such as random number generation. Even in
                     cases where division operations constitute as little as
                     1\% of all executed instructions, Quick-radix-4
                     provides a performance uplift of 16\% compared to the
                     radix-2 divider.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {32},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Roorda:2022:FAE,
  author          = {Esther Roorda and Seyedramin Rasoulinezhad and Philip
                     H. W. Leong and Steven J. E. Wilton},
  title           = {{FPGA} Architecture Exploration for {DNN}
                     Acceleration},
  journal         = j-TRETS,
  volume          = {15},
  number          = {3},
  pages           = {33:1--33:37},
  month           = sep,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3503465},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue May 24 07:29:32 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3503465},
  abstract        = {Recent years have seen an explosion of machine
                     learning applications implemented on Field-Programmable
                     Gate Arrays (FPGAs). FPGA vendors and researchers have
                     responded by updating their fabrics to more efficiently
                     implement machine learning accelerators, \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {33},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Bobda:2022:FFA,
  author          = {Christophe Bobda and Joel Mandebi Mbongue and Paul
                     Chow and Mohammad Ewais and Naif Tarafdar and Juan
                     Camilo Vega and Ken Eguro and Dirk Koch and Suranga
                     Handagala and Miriam Leeser and Martin Herbordt and
                     Hafsah Shahzad and Peter Hofste and Burkhard Ringlein
                     and Jakub Szefer and Ahmed Sanaullah and Russell
                     Tessier},
  title           = {The Future of {FPGA} Acceleration in Datacenters and
                     the Cloud},
  journal         = j-TRETS,
  volume          = {15},
  number          = {3},
  pages           = {34:1--34:42},
  month           = sep,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3506713},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue May 24 07:29:32 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3506713},
  abstract        = {In this article, we survey existing academic and
                     commercial efforts to provide Field-Programmable Gate
                     Array (FPGA) acceleration in datacenters and the cloud.
                     The goal is a critical review of existing systems and a
                     discussion of their evolution from \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {34},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Mentens:2022:ISS,
  author          = {Nele Mentens and Lionel Sousa and Pedro Trancoso},
  title           = {Introduction to the Special Section on {FPL 2020}},
  journal         = j-TRETS,
  volume          = {15},
  number          = {4},
  pages           = {35:1--35:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3536336},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:16 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3536336},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {35},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Shi:2022:EHF,
  author          = {Runbin Shi and Kaan Kara and Christoph Hagleitner and
                     Dionysios Diamantopoulos and Dimitris Syrivelis and
                     Gustavo Alonso},
  title           = {Exploiting {HBM} on {FPGAs} for Data Processing},
  journal         = j-TRETS,
  volume          = {15},
  number          = {4},
  pages           = {36:1--36:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3491238},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:16 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3491238},
  abstract        = {Field Programmable Gate Arrays (FPGAs) are
                     increasingly being used in data centers and the cloud
                     due to their potential to accelerate certain workloads
                     as well as for their architectural flexibility, since
                     they can be used as accelerators, smart-NICs, or
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {36},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Nikolic:2022:DPD,
  author          = {Stefan Nikoli{\'c} and Grace Zgheib and Paolo Ienne},
  title           = {Detailed Placement for Dedicated {LUT}-Level {FPGA}
                     Interconnect},
  journal         = j-TRETS,
  volume          = {15},
  number          = {4},
  pages           = {37:1--37:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3501802},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:16 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3501802},
  abstract        = {In this work, we develop timing-driven CAD support for
                     FPGA architectures with direct connections between
                     LUTs. We do so by proposing an efficient ILP-based
                     detailed placer, which moves a carefully selected
                     subset of LUTs from their original positions, so
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {37},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Zhang:2022:RFH,
  author          = {Niansong Zhang and Xiang Chen and Nachiket Kapre},
  title           = {{RapidLayout}: Fast Hard Block Placement of
                     {FPGA}-optimized Systolic Arrays Using Evolutionary
                     Algorithm},
  journal         = j-TRETS,
  volume          = {15},
  number          = {4},
  pages           = {38:1--38:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3501803},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:16 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3501803},
  abstract        = {Evolutionary algorithms can outperform conventional
                     placement algorithms such as simulated annealing,
                     analytical placement, and manual placement on runtime,
                     wirelength, pipelining cost, and clock frequency when
                     mapping hard block intensive designs such as \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {38},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Singh:2022:AWP,
  author          = {Gagandeep Singh and Dionysios Diamantopoulos and Juan
                     G{\'o}mez-Luna and Christoph Hagleitner and Sander
                     Stuijk and Henk Corporaal and Onur Mutlu},
  title           = {Accelerating Weather Prediction Using Near-Memory
                     Reconfigurable Fabric},
  journal         = j-TRETS,
  volume          = {15},
  number          = {4},
  pages           = {39:1--39:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3501804},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:16 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3501804},
  abstract        = {Ongoing climate change calls for fast and accurate
                     weather and climate modeling. However, when solving
                     large-scale weather prediction simulations,
                     state-of-the-art CPU and GPU implementations suffer
                     from limited performance and high energy consumption.
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {39},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Malik:2022:HEA,
  author          = {Gurshaant Malik and Ian Elmore Lang and Rodolfo
                     Pellizzoni and Nachiket Kapre},
  title           = {{HopliteML}: Evolving Application Customized {FPGA
                     NoCs} with Adaptable Routers and Regulators},
  journal         = j-TRETS,
  volume          = {15},
  number          = {4},
  pages           = {40:1--40:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3507699},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:16 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3507699},
  abstract        = {We can overcome the pessimism in worst-case routing
                     latency analysis of timing-predictable Network-on-Chip
                     (NoC) workloads by single-digit factors through the use
                     of a hybrid field-programmable gate array
                     (FPGA)-optimized NoC and workload-adapted \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {40},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Cook:2022:INU,
  author          = {Hayden Cook and Jacob Arscott and Brent George and
                     Tanner Gaskin and Jeffrey Goeders and Brad Hutchings},
  title           = {Inducing Non-uniform {FPGA} Aging Using
                     Configuration-based Short Circuits},
  journal         = j-TRETS,
  volume          = {15},
  number          = {4},
  pages           = {41:1--41:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3517042},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:16 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3517042},
  abstract        = {This work demonstrates a novel method of accelerating
                     FPGA aging by configuring FPGAs to implement thousands
                     of short circuits, resulting in high on-chip currents
                     and temperatures. Patterns of ring oscillators are
                     placed across the chip and are used to \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {41},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Leong:2022:ISS,
  author          = {Philip H. W. Leong},
  title           = {Introduction to Special Section on {FPGA} 2021},
  journal         = j-TRETS,
  volume          = {15},
  number          = {4},
  pages           = {42:1--42:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3536335},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:16 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3536335},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {42},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Lu:2022:DSH,
  author          = {Alec Lu and Zhenman Fang and Lesley Shannon},
  title           = {Demystifying the Soft and Hardened Memory Systems of
                     Modern {FPGAs} for Software Programmers through
                     Microbenchmarking},
  journal         = j-TRETS,
  volume          = {15},
  number          = {4},
  pages           = {43:1--43:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3517131},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:16 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3517131},
  abstract        = {Both modern datacenter and embedded Field Programmable
                     Gate Arrays (FPGAs) provide great opportunities for
                     high-performance and high-energy-efficiency computing.
                     With the growing public availability of FPGAs from
                     major cloud service providers such as AWS, \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {43},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Chen:2022:TRE,
  author          = {Xinyu Chen and Feng Cheng and Hongshi Tan and Yao Chen
                     and Bingsheng He and Weng-Fai Wong and Deming Chen},
  title           = {{ThunderGP}: Resource-Efficient Graph Processing
                     Framework on {FPGAs} with {HLS}},
  journal         = j-TRETS,
  volume          = {15},
  number          = {4},
  pages           = {44:1--44:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3517141},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:16 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3517141},
  abstract        = {FPGA has been an emerging computing infrastructure in
                     datacenters benefiting from fine-grained parallelism,
                     energy efficiency, and reconfigurability. Meanwhile,
                     graph processing has attracted tremendous interest in
                     data analytics, and its performance is \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {44},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Langhammer:2022:SNA,
  author          = {Martin Langhammer and Eriko Nurvitadhi and Sergey
                     Gribok and Bogdan Pasca},
  title           = {{Stratix 10 NX} Architecture},
  journal         = j-TRETS,
  volume          = {15},
  number          = {4},
  pages           = {45:1--45:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3520197},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:16 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3520197},
  abstract        = {The advent of AI has driven the exploration of
                     high-density low-precision arithmetic on FPGAs. This
                     has resulted in new methods in mapping both arithmetic
                     functions as well as dataflows onto the fabric, as well
                     as some changes to the embedded DSP Blocks. \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {45},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Arora:2022:TSF,
  author          = {Aman Arora and Moinak Ghosh and Samidh Mehta and
                     Vaughn Betz and Lizy K. John},
  title           = {Tensor Slices: {FPGA} Building Blocks For the {Deep
                     Learning} Era},
  journal         = j-TRETS,
  volume          = {15},
  number          = {4},
  pages           = {46:1--46:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3529650},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:16 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3529650},
  abstract        = {FPGAs are well-suited for accelerating deep learning
                     (DL) applications owing to the rapidly changing
                     algorithms, network architectures and computation
                     requirements in this field. However, the generic
                     building blocks available on traditional FPGAs limit
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {46},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Ebcioglu:2022:HPM,
  author          = {Kemal Ebcioglu and Ismail San},
  title           = {Highly Parallel Multi-{FPGA} System Compilation from
                     Sequential {C\slash C++} Code in the {AWS} Cloud},
  journal         = j-TRETS,
  volume          = {15},
  number          = {4},
  pages           = {47:1--47:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3507698},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:16 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3507698},
  abstract        = {We present a High Level Synthesis compiler that
                     automatically obtains a multi-chip accelerator system
                     from a single-threaded sequential C/C++ application.
                     Invoking the multi-chip accelerator is functionally
                     identical to invoking the single-threaded \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {47},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Panchapakesan:2022:SEA,
  author          = {Sathish Panchapakesan and Zhenman Fang and Jian Li},
  title           = {{SyncNN}: Evaluating and Accelerating Spiking Neural
                     Networks on {FPGAs}},
  journal         = j-TRETS,
  volume          = {15},
  number          = {4},
  pages           = {48:1--48:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3514253},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:16 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3514253},
  abstract        = {Compared to conventional artificial neural networks,
                     spiking neural networks (SNNs) are more biologically
                     plausible and require less computation due to their
                     event-driven nature of spiking neurons. However, the
                     default asynchronous execution of SNNs also \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {48},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Gibson:2022:ACM,
  author          = {Kahlan Gibson and Esther Roorda and Daniel Holanda
                     Noronha and Steven J. E. Wilton},
  title           = {Adaptive Clock Management of {HLS}-generated Circuits
                     on {FPGAs}},
  journal         = j-TRETS,
  volume          = {15},
  number          = {4},
  pages           = {49:1--49:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3520140},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:16 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3520140},
  abstract        = {In this article, we present Syncopation, a
                     performance-boosting fine-grained timing analysis and
                     adaptive clock management technique for High-Level
                     Synthesis-generated circuits implemented on
                     Field-Programmable Gate Arrays. The key idea is to use
                     the HLS \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {49},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Sherwin:2022:MFF,
  author          = {Krystine Dawn Sherwin and Kevin I-Kai Wang and Prabu
                     Thiagaraj and Ben Stappers and Oliver Sinnen},
  title           = {Median Filters on {FPGAs} for Infinite Data and Large,
                     Rectangular Windows},
  journal         = j-TRETS,
  volume          = {15},
  number          = {4},
  pages           = {50:1--50:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3530273},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:16 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3530273},
  abstract        = {Efficient architectures and implementations of median
                     filters have been well investigated in the past. In
                     this article, we focus on median filters for very big
                     scientific applications with very large windows and an
                     infinite stream of data, inspired by big \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {50},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Cong:2022:FHT,
  author          = {Jason Cong and Jason Lau and Gai Liu and Stephen
                     Neuendorffer and Peichen Pan and Kees Vissers and Zhiru
                     Zhang},
  title           = {{FPGA HLS} Today: Successes, Challenges, and
                     Opportunities},
  journal         = j-TRETS,
  volume          = {15},
  number          = {4},
  pages           = {51:1--51:??},
  month           = dec,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3530775},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:16 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3530775},
  abstract        = {The year 2011 marked an important transition for FPGA
                     high-level synthesis (HLS), as it went from prototyping
                     to deployment. A decade later, in this article, we
                     assess the progress of the deployment of HLS technology
                     and highlight the successes in several \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {51},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Sinnen:2023:ISS,
  author          = {Oliver Sinnen and Qiang Liu and Azadeh Davoodi},
  title           = {Introduction to Special Section on {FPT'20}},
  journal         = j-TRETS,
  volume          = {16},
  number          = {1},
  pages           = {1:1--1:??},
  month           = mar,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3579850},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:18 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3579850},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {1},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
%%% NOTE(review): the abstract acronym below was corrected from "FGPAs" to
%%% "FPGAs" so that it agrees with its expansion and with the two later uses
%%% of "FPGA" in the same abstract; confirm against the publisher record.
@Article{Shi:2023:OGR,
  author          = {Kaichuang Shi and Xuegong Zhou and Hao Zhou and Lingli
                     Wang},
  title           = {An Optimized {GIB} Routing Architecture with Bent
                     Wires for {FPGA}},
  journal         = j-TRETS,
  volume          = {16},
  number          = {1},
  pages           = {2:1--2:??},
  month           = mar,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3519599},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:18 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3519599},
  abstract        = {Field-programmable gate arrays (FPGAs) are widely used
                     because of the superiority in flexibility and lower
                     non-recurring engineering cost. How to optimize the
                     routing architecture is a key problem for FPGA
                     architects because it has a large impact on FPGA
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {2},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Li:2023:JBA,
  author          = {Xiang Li and Peter Stanwicks and George Provelengios
                     and Russell Tessier and Daniel Holcomb},
  title           = {Jitter-based Adaptive True Random Number Generation
                     Circuits for {FPGAs} in the Cloud},
  journal         = j-TRETS,
  volume          = {16},
  number          = {1},
  pages           = {3:1--3:??},
  month           = mar,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3487554},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:18 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/prng.bib;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3487554},
  abstract        = {In this article, we present and evaluate a true random
                     number generator (TRNG) design that is compatible with
                     the restrictions imposed by cloud-based Field
                     Programmable Gate Array (FPGA) providers such as Amazon
                     Web Services (AWS) EC2 F1. Because cloud \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {3},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Que:2023:RRM,
  author          = {Zhiqiang Que and Hiroki Nakahara and Hongxiang Fan and
                     He Li and Jiuxi Meng and Kuen Hung Tsoi and Xinyu Niu
                     and Eriko Nurvitadhi and Wayne Luk},
  title           = {{Remarn}: a Reconfigurable Multi-threaded Multi-core
                     Accelerator for Recurrent Neural Networks},
  journal         = j-TRETS,
  volume          = {16},
  number          = {1},
  pages           = {4:1--4:??},
  month           = mar,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3534969},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:18 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3534969},
  abstract        = {This work introduces Remarn, a reconfigurable
                     multi-threaded multi-core accelerator supporting both
                     spatial and temporal co-execution of Recurrent Neural
                     Network (RNN) inferences. It increases processing
                     capabilities and quality of service of cloud-based
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {4},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Schelten:2023:HTR,
  author          = {Niklas Schelten and Fritjof Steinert and Justin
                     Knapheide and Anton Schulte and Benno Stabernack},
  title           = {A High-Throughput, Resource-Efficient Implementation
                     of the {RoCEv2} Remote {DMA} Protocol and its
                     Application},
  journal         = j-TRETS,
  volume          = {16},
  number          = {1},
  pages           = {5:1--5:??},
  month           = mar,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3543176},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:18 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3543176},
  abstract        = {The use of application-specific accelerators in data
                     centers has been the state of the art for at least a
                     decade, starting with the availability of General
                     Purpose GPUs achieving higher performance either
                     overall or per watt. In most cases, these \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {5},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Giechaskiel:2023:CVC,
  author          = {Ilias Giechaskiel and Shanquan Tian and Jakub Szefer},
  title           = {{Cross-VM} Covert- and Side-Channel Attacks in Cloud
                     {FPGAs}},
  journal         = j-TRETS,
  volume          = {16},
  number          = {1},
  pages           = {6:1--6:??},
  month           = mar,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3534972},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:18 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3534972},
  abstract        = {The availability of FPGAs in cloud data centers offers
                     rapid, on-demand access to reconfigurable hardware
                     compute resources that users can adapt to their own
                     needs. However, the low-level access to the FPGA
                     hardware and associated resources such as the
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {6},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Wolf:2023:ASE,
  author          = {Dennis Leander Wolf and Christoph Spang and Daniel
                     Diener and Christian Hochberger},
  title           = {Advantages of a Statistical Estimation Approach for
                     Clock Frequency Estimation of Heterogeneous and
                     Irregular {CGRAs}},
  journal         = j-TRETS,
  volume          = {16},
  number          = {1},
  pages           = {7:1--7:??},
  month           = mar,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3531062},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Mar 11 08:27:18 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3531062},
  abstract        = {Estimating the maximum clock frequency of homogeneous
                     Coarse Grained Reconfigurable Arrays/Architectures
                     (CGRAs) with an arbitrary number of Processing Elements
                     (PE) is difficult. Clock frequency estimation of highly
                     heterogeneous CGRAs takes additional \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {7},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}
@Article{Ioannou:2023:SOA,
  author = {Lenos Ioannou and Suhaib A. Fahmy},
  title = {Streaming Overlay Architecture for Lightweight {LSTM}
    Computation on {FPGA SoCs}},
  journal = j-TRETS,
  volume = {16},
  number = {1},
  pages = {8:1--8:??},
  month = mar,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3543069},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Sat Mar 11 08:27:18 MST 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3543069},
  abstract = {Long-Short Term Memory (LSTM) networks, and Recurrent
    Neural Networks (RNNs) in general, have demonstrated
    their suitability in many time series data
    applications, especially in Natural Language Processing
    (NLP). Computationally, LSTMs introduce \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {8},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Li:2023:SSA,
  author = {Xiangwei Li and Douglas L. Maskell and Carol Jingyi Li
    and Philip H. W. Leong and David Boland},
  title = {A Scalable Systolic Accelerator for Estimation of the
    Spectral Correlation Density Function and Its {FPGA}
    Implementation},
  journal = j-TRETS,
  volume = {16},
  number = {1},
  pages = {9:1--9:??},
  month = mar,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3546181},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Sat Mar 11 08:27:18 MST 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3546181},
  abstract = {The spectral correlation density (SCD) function is the
    time-averaged correlation of two spectral components
    used for analyzing periodic signals with time-varying
    spectral content. Although the analysis is extremely
    powerful, it has not been widely adopted \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {9},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Tao:2023:LGL,
  author = {Zhuofu Tao and Chen Wu and Yuan Liang and Kun Wang and
    Lei He},
  title = {{LW-GCN}: a Lightweight {FPGA}-based Graph
    Convolutional Network Accelerator},
  journal = j-TRETS,
  volume = {16},
  number = {1},
  pages = {10:1--10:??},
  month = mar,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3550075},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Sat Mar 11 08:27:18 MST 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3550075},
  abstract = {Graph convolutional networks (GCNs) have been
    introduced to effectively process non-Euclidean graph
    data. However, GCNs incur large amounts of irregularity
    in computation and memory access, which prevents
    efficient use of traditional neural network \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {10},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Moini:2023:VSI,
  author = {Shayan Moini and Aleksa Deric and Xiang Li and George
    Provelengios and Wayne Burleson and Russell Tessier and
    Daniel Holcomb},
  title = {Voltage Sensor Implementations for Remote Power
    Attacks on {FPGAs}},
  journal = j-TRETS,
  volume = {16},
  number = {1},
  pages = {11:1--11:??},
  month = mar,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3555048},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Sat Mar 11 08:27:18 MST 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3555048},
  abstract = {This article presents a study of two types of on-chip
    FPGA voltage sensors based on ring oscillators (ROs)
    and time-to-digital converter (TDCs), respectively. It
    has previously been shown that these sensors are often
    used to extract side-channel \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {11},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Kalantar:2023:FBA,
  author = {Amin Kalantar and Zachary Zimmerman and Philip Brisk},
  title = {{FPGA}-based Acceleration of Time Series Similarity
    Prediction: From Cloud to Edge},
  journal = j-TRETS,
  volume = {16},
  number = {1},
  pages = {12:1--12:??},
  month = mar,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3555810},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Sat Mar 11 08:27:18 MST 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3555810},
  abstract = {With the proliferation of low-cost sensors and the
    Internet of Things, the rate of producing data far
    exceeds the compute and storage capabilities of today's
    infrastructure. Much of this data takes the form of
    time series, and in response, there has been \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {12},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Vestias:2023:EDL,
  author = {M{\'a}rio V{\'e}stias and Rui P. Duarte and Jos{\'e}
    T. de Sousa and Hor{\'a}cio Neto},
  title = {Efficient Design of Low Bitwidth Convolutional Neural
    Networks on {FPGA} with Optimized Dot Product Units},
  journal = j-TRETS,
  volume = {16},
  number = {1},
  pages = {13:1--13:??},
  month = mar,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3546182},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Sat Mar 11 08:27:18 MST 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3546182},
  abstract = {Designing hardware accelerators to run the inference
    of convolutional neural networks (CNN) is under
    intensive research. Several different architectures
    have been proposed along with hardware-oriented
    optimizations of the neural network models. One of the
    \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {13},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{deMoura:2023:DCR,
  author = {Rafael F{\~a}o de Moura and Joao Paulo Cardoso de Lima
    and Luigi Carro},
  title = {Data and Computation Reuse in {CNNs} Using Memristor
    {TCAMs}},
  journal = j-TRETS,
  volume = {16},
  number = {1},
  pages = {14:1--14:??},
  month = mar,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3549536},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Sat Mar 11 08:27:18 MST 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3549536},
  abstract = {Exploiting computational and data reuse in CNNs is
    crucial for the successful design of
    resource-constrained platforms. In image recognition
    applications, high levels of input locality and
    redundancy present in CNNs have become the golden goose
    for \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {14},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Abdelhamid:2023:SMC,
  author = {Riadh {Ben Abdelhamid} and Yoshiki Yamaguchi and
    Taisuke Boku},
  title = {A Scalable Many-core Overlay Architecture on an
    {HBM2}-enabled Multi-Die {FPGA}},
  journal = j-TRETS,
  volume = {16},
  number = {1},
  pages = {15:1--15:??},
  month = mar,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3547657},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Sat Mar 11 08:27:18 MST 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3547657},
  abstract = {The overlay architecture enables to raise the
    abstraction level of hardware design and enhances
    hardware-accelerated applications' portability. In
    FPGAs, there is a growing awareness of the overlay
    structure as typified by many-core architecture. It
    works \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {15},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Iskandar:2023:NMC,
  author = {Veronia Iskandar and Mohamed A. {Abd El Ghany} and
    Diana G{\"o}hringer},
  title = {Near-memory Computing on {FPGAs} with {$3$D}-stacked
    Memories: Applications, Architectures, and
    Optimizations},
  journal = j-TRETS,
  volume = {16},
  number = {1},
  pages = {16:1--16:??},
  month = mar,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3547658},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Sat Mar 11 08:27:18 MST 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3547658},
  abstract = {The near-memory computing (NMC) paradigm has
    transpired as a promising method for overcoming the
    memory wall challenges of future computing
    architectures. Modern systems integrating 3D-stacked
    DRAM memory can be leveraged to prevent unnecessary
    data \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {16},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Shahsavani:2023:ECM,
  author = {Soheil Nazar Shahsavani and Arash Fayyazi and Mahdi
    Nazemi and Massoud Pedram},
  title = {Efficient Compilation and Mapping of Fixed Function
    Combinational Logic onto Digital Signal Processors
    Targeting Neural Network Inference and Utilizing
    High-level Synthesis},
  journal = j-TRETS,
  volume = {16},
  number = {2},
  pages = {17:1--17:??},
  month = jun,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3559543},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Jul 3 07:48:36 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3559543},
  abstract = {Recent efforts for improving the performance of neural
    network (NN) accelerators that meet today's application
    requirements have given rise to a new trend of
    logic-based NN inference relying on fixed function
    combinational logic. Mapping such large \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {17},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Choi:2023:FAP,
  author = {Young-Kyu Choi and Carlos Santillana and Yujia Shen
    and Adnan Darwiche and Jason Cong},
  title = {{FPGA} Acceleration of Probabilistic Sentential
    Decision Diagrams with High-level Synthesis},
  journal = j-TRETS,
  volume = {16},
  number = {2},
  pages = {18:1--18:??},
  month = jun,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3561514},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Jul 3 07:48:36 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3561514},
  abstract = {Probabilistic Sentential Decision Diagrams (PSDDs)
    provide efficient methods for modeling and reasoning
    with probability distributions in the presence of
    massive logical constraints. PSDDs can also be
    synthesized from graphical models such as Bayesian
    \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {18},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Ganewattha:2023:HAR,
  author = {Chanaka Ganewattha and Zaheer Khan and Janne
    Lehtom{\"a}ki and Matti Latva-Aho},
  title = {Hardware-accelerated Real-time Drift-awareness for
    Robust Deep Learning on Wireless {RF} Data},
  journal = j-TRETS,
  volume = {16},
  number = {2},
  pages = {19:1--19:??},
  month = jun,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3563394},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Jul 3 07:48:36 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3563394},
  abstract = {Proactive and intelligent management of network
    resource utilization (RU) using deep learning (DL) can
    significantly improve the efficiency and performance of
    the next generation of wireless networks. However,
    variations in wireless RU are often affected \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {19},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Proulx:2023:SFC,
  author = {Alexandre Proulx and Jean-Yves Chouinard and Paul
    Fortier and Amine Miled},
  title = {A Survey on {FPGA} Cybersecurity Design Strategies},
  journal = j-TRETS,
  volume = {16},
  number = {2},
  pages = {20:1--20:??},
  month = jun,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3561515},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Jul 3 07:48:36 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3561515},
  abstract = {This article presents a critical literature review on
    the security aspects of field-programmable gate array
    (FPGA) devices. FPGA devices present unique challenges
    to cybersecurity through their reconfigurable nature.
    The article also pays special \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {20},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Soldavini:2023:ACH,
  author = {Stephanie Soldavini and Karl Friebel and Mattia
    Tibaldi and Gerald Hempel and Jeronimo Castrillon and
    Christian Pilato},
  title = {Automatic Creation of High-bandwidth Memory
    Architectures from Domain-specific Languages: The Case
    of Computational Fluid Dynamics},
  journal = j-TRETS,
  volume = {16},
  number = {2},
  pages = {21:1--21:??},
  month = jun,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3563553},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Jul 3 07:48:36 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3563553},
  abstract = {Numerical simulations can help solve complex problems.
    Most of these algorithms are massively parallel and
    thus good candidates for FPGA acceleration thanks to
    spatial parallelism. Modern FPGA devices can leverage
    high-bandwidth memory technologies, but \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {21},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Yang:2023:HOF,
  author = {Gangqiang Yang and Zhengyuan Shi and Cheng Chen and
    Hailiang Xiong and Fudong Li and Honggang Hu and Zhiguo
    Wan},
  title = {Hardware Optimizations of {Fruit-80} Stream Cipher:
    Smaller than Grain},
  journal = j-TRETS,
  volume = {16},
  number = {2},
  pages = {22:1--22:??},
  month = jun,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3569455},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Jul 3 07:48:36 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3569455},
  abstract = {Fruit-80, which emerged as an ultra-lightweight stream
    cipher with 80-bit secret key, is oriented toward
    resource-constrained devices in the Internet of Things.
    In this article, we propose area and speed optimization
    architectures of Fruit-80 on FPGAs. \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {22},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Basalama:2023:FEE,
  author = {Suhail Basalama and Atefeh Sohrabizadeh and Jie Wang
    and Licheng Guo and Jason Cong},
  title = {{FlexCNN}: an End-to-end Framework for Composing {CNN}
    Accelerators on {FPGA}},
  journal = j-TRETS,
  volume = {16},
  number = {2},
  pages = {23:1--23:??},
  month = jun,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3570928},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Jul 3 07:48:36 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3570928},
  abstract = {With reduced data reuse and parallelism, recent
    convolutional neural networks (CNNs) create new
    challenges for FPGA acceleration. Systolic arrays (SAs)
    are efficient, scalable architectures for convolutional
    layers, but without proper optimizations, their
    \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {23},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Meyer:2023:MFD,
  author = {Marius Meyer and Tobias Kenter and Christian Plessl},
  title = {{Multi-FPGA} Designs and Scaling of {HPC} Challenge
    Benchmarks via {MPI} and Circuit-switched Inter-{FPGA}
    Networks},
  journal = j-TRETS,
  volume = {16},
  number = {2},
  pages = {24:1--24:??},
  month = jun,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3576200},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Jul 3 07:48:36 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3576200},
  abstract = {While FPGA accelerator boards and their respective
    high-level design tools are maturing, there is still a
    lack of multi-FPGA applications, libraries, and not
    least, benchmarks and reference implementations towards
    sustained HPC usage of these devices. As \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {24},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Ueno:2023:VVC,
  author = {Tomohiro Ueno and Kentaro Sano},
  title = {{VCSN}: Virtual Circuit-Switching Network for Flexible
    and Simple-to-Operate Communication in {HPC FPGA}
    Cluster},
  journal = j-TRETS,
  volume = {16},
  number = {2},
  pages = {25:1--25:??},
  month = jun,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3579848},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Jul 3 07:48:36 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3579848},
  abstract = {FPGA clusters promise to play a critical role in
    high-performance computing (HPC) systems in the near
    future due to their flexibility and high power
    efficiency. The operation of large-scale
    general-purpose FPGA clusters on which multiple users
    run diverse \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {25},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Nayak:2023:IEE,
  author = {Ankita Nayak and Keyi Zhang and Rajsekhar Setaluri and
    Alex Carsello and Makai Mann and Christopher Torng and
    Stephen Richardson and Rick Bahr and Pat Hanrahan and
    Mark Horowitz and Priyanka Raina},
  title = {Improving Energy Efficiency of {CGRAs} with
    Low-Overhead Fine-Grained Power Domains},
  journal = j-TRETS,
  volume = {16},
  number = {2},
  pages = {26:1--26:??},
  month = jun,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3558394},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Jul 3 07:48:36 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3558394},
  abstract = {To effectively minimize static power for a wide range
    of applications, power domains for coarse-grained
    reconfigurable array (CGRA) architectures need to be
    more fine-grained than those found in a typical
    application-specific integrated circuit. However,
    \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {26},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Zhao:2023:ASC,
  author = {Kang Zhao and Yuchun Ma and Ruining He and Jixing
    Zhang and Ning Xu and Jinian Bian},
  title = {Adaptive Selection and Clustering of Partial
    Reconfiguration Modules for Modern {FPGA} Design Flow},
  journal = j-TRETS,
  volume = {16},
  number = {2},
  pages = {27:1--27:??},
  month = jun,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3567427},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Jul 3 07:48:36 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3567427},
  abstract = {Dynamic Partially Reconfiguration (DPR) on FPGA has
    attracted significant research interest in recent years
    since it provides benefits such as reduced area and
    flexible functionality. However, due to the lack of
    supporting synthesis tools in the current \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {27},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Tian:2023:SSA,
  author = {Xingyu Tian and Zhifan Ye and Alec Lu and Licheng Guo
    and Yuze Chi and Zhenman Fang},
  title = {{SASA}: a Scalable and Automatic Stencil Acceleration
    Framework for Optimized Hybrid Spatial and Temporal
    Parallelism on {HBM}-based {FPGAs}},
  journal = j-TRETS,
  volume = {16},
  number = {2},
  pages = {28:1--28:??},
  month = jun,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3572547},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Jul 3 07:48:36 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3572547},
  abstract = {Stencil computation is one of the fundamental
    computing patterns in many application domains such as
    scientific computing and image processing. While there
    are promising studies that accelerate stencils on
    FPGAs, there lacks an automated acceleration \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {28},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{M:2023:DAR,
  author = {Dhayalakumar M. and Noor Mahammad Sk},
  title = {Deterministic Approach for Range-enhanced
    Reconfigurable Packet Classification Engine},
  journal = j-TRETS,
  volume = {16},
  number = {2},
  pages = {29:1--29:??},
  month = jun,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3586577},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Jul 3 07:48:36 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3586577},
  abstract = {Reconfigurable hardware is a promising technology for
    implementing firewalls, routing mechanisms, and new
    protocols for evolving high-performance network
    systems. This work presents a novel deterministic
    approach for a Range-enhanced Reconfigurable Packet
    \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {29},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Koch:2023:ISI,
  author = {Andreas Koch and Wei Zhang},
  title = {Introduction to the Special Issue on {FPT 2021}},
  journal = j-TRETS,
  volume = {16},
  number = {2},
  pages = {30:1--30:??},
  month = jun,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3603701},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Jul 3 07:48:36 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3603701},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {30},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Attia:2023:TSL,
author = "Sameh Attia and Vaughn Betz",
title = "Toward Software-like Debugging for {FPGAs} via
Checkpointing and Transaction-based Co-Simulation",
journal = j-TRETS,
volume = "16",
number = "2",
pages = "31:1--31:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3552521",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Mon Jul 3 07:48:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3552521",
abstract = "Checkpoint-based debugging flows have recently been
developed that allow the user to move the design state
back and forth between an FPGA and a simulator. They
provide a software-like debugging experience by
combining the speed of hardware execution and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "31",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Gebauer:2023:QMR,
  author = {Richard Gebauer and Nick Karcher and Mehmed G{\"u}ler
    and Oliver Sander},
  title = {{QiCells}: a Modular {RFSoC}-based Approach to
    Interface Superconducting Quantum Bits},
  journal = j-TRETS,
  volume = {16},
  number = {2},
  pages = {32:1--32:??},
  month = jun,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3571820},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Jul 3 07:48:36 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3571820},
  abstract = {Quantum computers will be a revolutionary extension of
    the heterogeneous computing world. They consist of many
    quantum bits (qubits) and require a careful design of
    the interface between the classical computer
    architecture and the quantum processor. For \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {32},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Suh:2023:AHC,
  author = {Han-Sok Suh and Jian Meng and Ty Nguyen and Vijay
    Kumar and Yu Cao and Jae-Sun Seo},
  title = {Algorithm--hardware Co-optimization for
    Energy-efficient Drone Detection on
    Resource-constrained {FPGA}},
  journal = j-TRETS,
  volume = {16},
  number = {2},
  pages = {33:1--33:??},
  month = jun,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3583074},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Jul 3 07:48:36 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3583074},
  abstract = {Convolutional neural network (CNN)-based object
    detection has achieved very high accuracy; e.g.,
    single-shot multi-box detectors (SSDs) can efficiently
    detect and localize various objects in an input image.
    However, they require a high amount of \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {33},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Bucknall:2023:ZEE,
  author = {Alex R. Bucknall and Suhaib A. Fahmy},
  title = {{ZyPR}: End-to-end Build Tool and Runtime Manager for
    Partial Reconfiguration of {FPGA SoCs} at the Edge},
  journal = j-TRETS,
  volume = {16},
  number = {3},
  pages = {34:1--34:??},
  month = sep,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3585521},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Sat Aug 19 07:37:30 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3585521},
  abstract = {Partial reconfiguration (PR) is a key enabler to the
    design and development of adaptive systems on modern
    Field Programmable Gate Array (FPGA) Systems-on-Chip
    (SoCs), allowing hardware to be adapted dynamically at
    runtime. Vendor-supported PR \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {34},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Corts:2023:SPS,
  author = {Reinout Corts and Nikolaos Alachiotis},
  title = {A Survey of Processing Systems for Phylogenetics and
    Population Genetics},
  journal = j-TRETS,
  volume = {16},
  number = {3},
  pages = {35:1--35:??},
  month = sep,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3588033},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Sat Aug 19 07:37:30 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3588033},
  abstract = {The COVID-19 pandemic brought Bioinformatics into the
    spotlight, revealing that several existing methods,
    algorithms, and tools were not well prepared to handle
    large amounts of genomic data efficiently. This led to
    prohibitively long execution times and \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {35},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Machado:2023:NNH,
  author = {Pedro Machado and Jo{\~a}o Filipe Ferreira and Andreas
    Oikonomou and T. M. McGinnity},
  title = {{NeuroHSMD}: Neuromorphic Hybrid Spiking Motion
    Detector},
  journal = j-TRETS,
  volume = {16},
  number = {3},
  pages = {36:1--36:??},
  month = sep,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3588318},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Sat Aug 19 07:37:30 MDT 2023},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL = {https://dl.acm.org/doi/10.1145/3588318},
  abstract = {Vertebrate retinas are highly-efficient in processing
    trivial visual tasks such as detecting moving objects,
    which still represent complex challenges for modern
    computers. In vertebrates, the detection of object
    motion is performed by specialised retinal \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {36},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}
@Article{Geethakumari:2023:SAC,
  author =       "Prajith Ramakrishnan Geethakumari and Ioannis
                 Sourdis",
  title =        "Stream Aggregation with Compressed Sliding Windows",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "37:1--37:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3590774",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3590774",
  abstract =     "High performance stream aggregation is critical for
                 many emerging applications that analyze massive volumes
                 of data. Incoming data needs to be stored in a sliding
                 window during processing, in case the aggregation
                 functions cannot be computed \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Karakchi:2023:NND,
  author =       "Rasha Karakchi and Jason D. Bakos",
  title =        "{NAPOLY}: a Non-deterministic Automata Processor
                 {OverLaY}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "38:1--38:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3593586",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3593586",
  abstract =     "Deterministic and Non-deterministic Finite Automata
                 (DFA and NFA) comprise the core of many big data
                 applications. Recent efforts to develop Domain-Specific
                 Architectures (DSAs) for DFA/NFA have taken divergent
                 approaches, but achieving consistent \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "38",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Raut:2023:EAE,
  author =       "Gopal Raut and Saurabh Karkun and Santosh Kumar
                 Vishvakarma",
  title =        "An Empirical Approach to Enhance Performance for
                 Scalable {CORDIC}-Based Deep Neural Networks",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "39:1--39:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3596220",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3596220",
  abstract =     "Practical implementation of deep neural networks
                 (DNNs) demands significant hardware resources,
                 necessitating high computational power and memory
                 bandwidth. While existing field-programmable gate array
                 (FPGA)-based DNN accelerators are primarily optimized
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Leeser:2023:AEA,
  author =       "Miriam Leeser",
  title =        "Artifact Evaluation for {ACM TRETS} Papers Submitted
                 from the {FPT} Journal Track",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "40:1--40:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3596513",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3596513",
  abstract =     "Authors of papers that were accepted to ACM TRETS via
                 the FPT 2022 journal track had the option of
                 participating in Artifact Evaluation (AE). Four papers
                 from this track volunteered to participate in the AE
                 process. All of these papers have been awarded
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Li:2023:FPF,
  author =       "Carol Jingyi Li and Xiangwei Li and Binglei Lou and
                 Craig T. Jin and David Boland and Philip H. W. Leong",
  title =        "Fixed-point {FPGA} Implementation of the {FFT}
                 Accumulation Method for Real-time Cyclostationary
                 Analysis",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "41:1--41:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3567429",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3567429",
  abstract =     "The spectral correlation density (SCD) is an important
                 tool in cyclostationary signal detection and
                 classification. Even using efficient techniques based
                 on the fast Fourier transform (FFT), real-time
                 implementations are challenging because of the high
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Lou:2023:FCF,
  author =       "Binglei Lou and David Boland and Philip Leong",
  title =        "{fSEAD}: a Composable {FPGA}-based Streaming Ensemble
                 Anomaly Detection Library",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "42:1--42:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3568992",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3568992",
  abstract =     "Machine learning ensembles combine multiple base
                 models to produce a more accurate output. They can be
                 applied to a range of machine learning problems,
                 including anomaly detection. In this article, we
                 investigate how to maximize the composability and
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "42",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Shi:2023:DSE,
  author =       "Zhengyuan Shi and Cheng Chen and Gangqiang Yang and
                 Hailiang Xiong and Fudong Li and Honggang Hu and Zhiguo
                 Wan",
  title =        "Design Space Exploration of {Galois} and {Fibonacci}
                 Configuration Based on {Espresso} Stream Cipher",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "43:1--43:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3567428",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fibquart.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3567428",
  abstract =     "Fibonacci and Galois are two different kinds of
                 configurations in stream ciphers. Although many
                 transformations between two configurations have been
                 proposed, there is no sufficient analysis of their FPGA
                 performance. Espresso stream cipher provides an
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Mao:2023:HPC,
  author =       "Gaoyu Mao and Donglong Chen and Guangyan Li and
                 Wangchen Dai and Abdurrashid Ibrahim Sanka and
                 {\c{C}}etin Kaya Ko{\c{c}} and Ray C. C. Cheung",
  title =        "High-performance and Configurable {SW\slash HW}
                 Co-design of Post-quantum Signature
                 {CRYSTALS-Dilithium}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "44:1--44:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3569456",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3569456",
  abstract =     "CRYSTALS-Dilithium is a lattice-based post-quantum
                 digital signature scheme that is resistant to attacks
                 by quantum computers and has been selected to be
                 standardized in the NIST post-quantum cryptography
                 (PQC) standardization process. However, the speed
                 performance and design flexibility of the Dilithium
                 still need to be evaluated. This article presents a
                 high-performance software\slash hardware co-design of
                 CRYSTALS-Dilithium based on the NIST PQC round-3
                 parameters. High-speed pipelined hardware modules for
                 NTT\slash INTT, point-wise multiplication\slash
                 addition, and for SHAKE are included in the design to
                 accelerate the time-consuming operations in Dilithium.
                 All hardware modules are parameterized, thus allowing
                 full support of runtime configuration to increase
                 versatility. Moreover, the proposed software\slash
                 hardware architecture and tight operating workflows
                 reduce the data transmission overhead between the
                 processor and other hardware modules. The hardware
                 accelerator is implemented with a reconfigurable logic
                 on FPGA and is integrated with the high-performance ARM
                 Cortex-A9 processor in the Xilinx Zynq Architecture. We
                 measure the performance of the software\slash hardware
                 system for Dilithium in NIST security levels 2, 3, and
                 5. Compared to pure software implementations, we
                 achieve 8.7--12.5 times speedup in Key generation,
                 6.3--7.3 times speedup in Sign, and 9.1--12.2 times
                 speedup in Verify operations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "44",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{He:2023:FIC,
  author =       "Pengzhou He and Tianyou Bao and Jiafeng Xie and
                 Moeness Amin",
  title =        "{FPGA} Implementation of Compact Hardware Accelerators
                 for Ring-Binary-{LWE}-based Post-quantum Cryptography",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "45:1--45:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3569457",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3569457",
  abstract =     "Post-quantum cryptography (PQC) has recently drawn
                 substantial attention from various communities owing to
                 the proven vulnerability of existing public-key
                 cryptosystems against the attacks launched from
                 well-established quantum computers. The
                 Ring-Binary-Learning-with-Errors (RBLWE), a variant of
                 Ring-LWE, has been proposed to build PQC for
                 lightweight applications. As more Field-Programmable
                 Gate Array (FPGA) devices are being deployed in
                 lightweight applications like Internet-of-Things (IoT)
                 devices, it would be interesting if the RBLWE-based PQC
                 can be implemented on the FPGA with ultra-low
                 complexity and flexible processing. However, thus far,
                 limited information is available for such
                 implementations. In this article, we propose novel
                 RBLWE-based PQC accelerators on the FPGA with ultra-low
                 implementation complexity and flexible timing. We first
                 present the process of deriving the key operation of
                 the RBLWE-based scheme into the proposed algorithmic
                 operation. The corresponding hardware accelerator is
                 then efficiently mapped from the proposed algorithm
                 with the help of algorithm-to-architecture
                 implementation techniques and extended to obtain
                 higher-throughput designs. The final complexity
                 analysis and implementation results (on a variety of
                 FPGAs) show that the proposed accelerators have
                 significantly smaller area-time complexities than the
                 state-of-the-art designs. Overall, the proposed
                 accelerators feature low implementation complexity and
                 flexible processing, making them desirable for emerging
                 FPGA-based lightweight applications.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "45",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Jun:2023:ASD,
  author =       "Hyegang Jun and Hanchen Ye and Hyunmin Jeong and
                 Deming Chen",
  title =        "{AutoScaleDSE}: a Scalable Design Space Exploration
                 Engine for High-Level Synthesis",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "46:1--46:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3572959",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3572959",
  abstract =     "High-Level Synthesis (HLS) has enabled users to
                 rapidly develop designs targeted for FPGAs from the
                 behavioral description of the design. However, to
                 synthesize an optimal design capable of taking better
                 advantage of the target FPGA, a considerable amount
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "46",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Chang:2023:AHC,
  author =       "Liang Chang and Xin Zhao and Jun Zhou",
  title =        "{ADAS}: a High Computational Utilization Dynamic
                 Reconfigurable Hardware Accelerator for Super
                 Resolution",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "47:1--47:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3570927",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3570927",
  abstract =     "Super-resolution (SR) based on deep learning has
                 obtained superior performance in image reconstruction.
                 Recently, various algorithm efforts have been committed
                 to improving image reconstruction quality and speed.
                 However, the inference of SR contains huge \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Skubich:2023:IRT,
  author =       "Christian Skubich and Peter Reichel and Marc
                 Reichenbach",
  title =        "Increasing the Robustness of {TERO-TRNGs} Against
                 Process Variation",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "48:1--48:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3597418",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3597418",
  abstract =     "The transition effect ring oscillator is a popular
                 design for building entropy sources because it is
                 compact, built from digital elements only, and is very
                 well suited for FPGAs. However, it is known to be quite
                 sensitive to process variation. Although \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Fiege:2023:BBS,
  author =       "Nicolai Fiege and Peter Zipf",
  title =        "{BLOOP}: {Boolean} Satisfiability-based Optimized Loop
                 Pipelining",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "49:1--49:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3599972",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3599972",
  abstract =     "Modulo scheduling is the premier technique for
                 throughput maximization of loops in high-level
                 synthesis by interleaving consecutive loop iterations.
                 The number of clock cycles between data insertions is
                 called the initiation interval (II). For throughput
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Arora:2023:CDC,
  author =       "Aman Arora and Atharva Bhamburkar and Aatman Borda and
                 Tanmay Anand and Rishabh Sehgal and Bagus Hanindhito
                 and Pierre-Emmanuel Gaillardon and Jaydeep Kulkarni and
                 Lizy K. John",
  title =        "{CoMeFa}: Deploying Compute-in-Memory on {FPGAs} for
                 Deep Learning Acceleration",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "50:1--50:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3603504",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3603504",
  abstract =     "Block random access memories (BRAMs) are the storage
                 houses of FPGAs, providing extensive on-chip memory
                 bandwidth to the compute units implemented using logic
                 blocks and digital signal processing slices. We propose
                 modifying BRAMs to convert them to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Li:2023:ISS,
  author =       "Jing Li and Martin Herbordt",
  title =        "Introduction to the Special Section on {FCCM 2022}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "4",
  pages =        "51:1--51:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3632092",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 22 06:11:49 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3632092",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Wu:2023:TEA,
  author =       "Guiming Wu and Qianwen He and Jiali Jiang and
                 Zhenxiang Zhang and Yuan Zhao and Yinchao Zou and Jie
                 Zhang and Changzheng Wei and Ying Yan and Hui Zhang",
  title =        "{Topgun}: an {ECC} Accelerator for Private Set
                 Intersection",
  journal =      j-TRETS,
  volume =       "16",
  number =       "4",
  pages =        "52:1--52:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3603114",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 22 06:11:49 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3603114",
  abstract =     "Elliptic Curve Cryptography (ECC), one of the most
                 widely used asymmetric cryptographic algorithms, has
                 been deployed in Transport Layer Security (TLS)
                 protocol, blockchain, secure multiparty computation,
                 and so on. As one of the most secure ECC curves,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Xu:2023:FAG,
  author =       "Tiancheng Xu and Scott Rixner and Alan L. Cox",
  title =        "An {FPGA} Accelerator for Genome Variant Calling",
  journal =      j-TRETS,
  volume =       "16",
  number =       "4",
  pages =        "53:1--53:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3595297",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 22 06:11:49 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3595297",
  abstract =     "In genome analysis, it is often important to identify
                 variants from a reference genome. However, identifying
                 variants that occur with low frequency can be
                 challenging, as it is computationally intensive to do
                 so accurately. LoFreq is a widely used program
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Josipovic:2023:RSD,
  author =       "Lana Josipovi{\'c} and Axel Marmet and Andrea
                 Guerrieri and Paolo Ienne",
  title =        "Resource Sharing in Dataflow Circuits",
  journal =      j-TRETS,
  volume =       "16",
  number =       "4",
  pages =        "54:1--54:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3597614",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 22 06:11:49 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3597614",
  abstract =     "To achieve resource-efficient hardware designs,
                 high-level synthesis (HLS) tools share (i.e.,
                 time-multiplex) functional units among operations of
                 the same type. This optimization is typically performed
                 in conjunction with operation scheduling to ensure
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "54",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Cheng:2023:PCF,
  author =       "Jianyi Cheng and Lana Josipovi{\'c} and John Wickerson
                 and George A. Constantinides",
  title =        "Parallelising Control Flow in Dynamic-scheduling
                 High-level Synthesis",
  journal =      j-TRETS,
  volume =       "16",
  number =       "4",
  pages =        "55:1--55:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3599973",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 22 06:11:49 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3599973",
  abstract =     "Recently, there is a trend to use high-level synthesis
                 (HLS) tools to generate dynamically scheduled hardware.
                 The generated hardware is made up of components
                 connected using handshake signals. These handshake
                 signals schedule the components at runtime \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "55",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Ienne:2023:ISS,
  author =       "Paolo Ienne",
  title =        "Introduction to the Special Section on {FPGA 2022}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "4",
  pages =        "56:1--56:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3618114",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 22 06:11:49 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3618114",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "56",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Wang:2023:LSL,
  author =       "Erwei Wang and Marie Auffret and Georgios-Ilias
                 Stavrou and Peter Y. K. Cheung and George A.
                 Constantinides and Mohamed S. Abdelfattah and James J.
                 Davis",
  title =        "Logic Shrinkage: Learned Connectivity Sparsification
                 for {LUT}-Based Neural Networks",
  journal =      j-TRETS,
  volume =       "16",
  number =       "4",
  pages =        "57:1--57:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3583075",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 22 06:11:49 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3583075",
  abstract =     "Field-programmable gate array (FPGA)-specific deep
                 neural network (DNN) architectures using native lookup
                 tables (LUTs) as independently trainable inference
                 operators have been shown to achieve favorable
                 area-accuracy and energy-accuracy trade-offs. The
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "57",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Gao:2023:RAR,
  author =       "Yizhao Gao and Song Wang and Hayden Kwok-Hay So",
  title =        "A Reconfigurable Architecture for Real-time
                 Event-based Multi-Object Tracking",
  journal =      j-TRETS,
  volume =       "16",
  number =       "4",
  pages =        "58:1--58:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3593587",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 22 06:11:49 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3593587",
  abstract =     "Although advances in event-based machine vision
                 algorithms have demonstrated unparalleled capabilities
                 in performing some of the most demanding tasks, their
                 implementations under stringent real-time and power
                 constraints in edge systems remain a major \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "58",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Guo:2023:RAP,
  author =       "Licheng Guo and Pongstorn Maidee and Yun Zhou and
                 Chris Lavin and Eddie Hung and Wuxi Li and Jason Lau
                 and Weikang Qiao and Yuze Chi and Linghao Song and
                 Yuanlong Xiao and Alireza Kaviani and Zhiru Zhang and
                 Jason Cong",
  title =        "{RapidStream 2.0}: Automated Parallel Implementation
                 of Latency-Insensitive {FPGA} Designs Through Partial
                 Reconfiguration",
  journal =      j-TRETS,
  volume =       "16",
  number =       "4",
  pages =        "59:1--59:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3593025",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 22 06:11:49 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3593025",
  abstract =     "Field-programmable gate arrays (FPGAs) require a much
                 longer compilation cycle than conventional computing
                 platforms such as CPUs. In this article, we shorten the
                 overall compilation time by co-optimizing the HLS
                 compilation (C-to-RTL) and the back-end \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "59",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Nechi:2023:FBD,
  author =       "Anouar Nechi and Lukas Groth and Saleh Mulhem and
                 Farhad Merchant and Rainer Buchty and Mladen
                 Berekovic",
  title =        "{FPGA}-based Deep Learning Inference Accelerators:
                 Where Are We Standing?",
  journal =      j-TRETS,
  volume =       "16",
  number =       "4",
  pages =        "60:1--60:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3613963",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 22 06:11:49 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3613963",
  abstract =     "Recently, artificial intelligence applications have
                 become part of almost all emerging technologies around
                 us. Neural networks, in particular, have shown
                 significant advantages and have been widely adopted
                 over other approaches in machine learning. In
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "60",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Leipnitz:2023:CAM,
  author =       "Marcos T. Leipnitz and Gabriel L. Nazar",
  title =        "Constraint-Aware Multi-Technique Approximate
                 High-Level Synthesis for {FPGAs}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "4",
  pages =        "61:1--61:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3624481",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 22 06:11:49 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3624481",
  abstract =     "Numerous approximate computing (AC) techniques have
                 been developed to reduce the design costs in
                 error-resilient application domains, such as signal and
                 multimedia processing, data mining, machine learning,
                 and computer vision, to trade-off computation
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "61",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Liu:2023:CKC,
  author =       "Kenneth Liu and Alec Lu and Kartik Samtani and Zhenman
                 Fang and Licheng Guo",
  title =        "{CHIP-KNNv2}: a Configurable and High-Performance
                 {$K$}-Nearest Neighbors Accelerator on {HBM}-based
                 {FPGAs}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "4",
  pages =        "62:1--62:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3616873",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 22 06:11:49 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3616873",
  abstract =     "The k-nearest neighbors (KNN) algorithm is an
                 essential algorithm in many applications, such as
                 similarity search, image classification, and database
                 query. With the rapid growth in the dataset size and
                 the feature dimension of each data point, processing
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "62",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Guo:2023:TST,
  author =       "Licheng Guo and Yuze Chi and Jason Lau and Linghao
                 Song and Xingyu Tian and Moazin Khatti and Weikang Qiao
                 and Jie Wang and Ecenur Ustun and Zhenman Fang and
                 Zhiru Zhang and Jason Cong",
  title =        "{TAPA}: a Scalable Task-parallel Dataflow Programming
                 Framework for Modern {FPGAs} with Co-optimization of
                 {HLS} and Physical Design",
  journal =      j-TRETS,
  volume =       "16",
  number =       "4",
  pages =        "63:1--63:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3609335",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 22 06:11:49 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3609335",
  abstract =     "In this article, we propose TAPA, an end-to-end
                 framework that compiles a C++ task-parallel dataflow
                 program into a high-frequency FPGA accelerator.
                 Compared to existing solutions, TAPA has two major
                 advantages. First, TAPA provides a set of convenient
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "63",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}
@Article{Lu:2023:HET,
author = "Yingchun Lu and Yun Yang and Rong Hu and Huaguo Liang
and Maoxiang Yi and Zhengfeng Huang and Yuanming Ma and
Tian Chen and Liang Yao",
title = "High-efficiency {TRNG} Design Based on Multi-bit
Dual-ring Oscillator",
journal = j-TRETS,
volume = "16",
number = "4",
pages = "64:1--64:??",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3624991",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Fri Dec 22 06:11:49 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/prng.bib;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3624991",
abstract = "Unpredictable true random numbers are required in
security technology fields such as information
encryption, key generation, mask generation for
anti-side-channel analysis, algorithm initialization,
and so on. At present, the true random number generator
(TRNG) is not enough to provide fast random bits by
low-speed bits generation. Therefore, it is necessary
to design a faster TRNG. This work presents an
ultra-compact TRNG with high throughput based on a
novel extendable dual-ring oscillator (DRO). Owing to
multiple bits output per cycle in DRO can be used to
obtain the original random sequence, the proposed DRO
achieves a maximum resource utilization to build a more
efficient TRNG, compared with the conventional TRNG
system based on ring oscillator (RO), which only has a
single output and needs to build multiple groups of
ring oscillators. TRNG based on the 2-bit DRO and its
8-bit derivative structure has been verified on Xilinx
Artix-7 and Kintex-7 FPGA under the automatic layout
and routing and has achieved a throughput of 550 Mbps
and 1,100 Mbps, respectively. Moreover, in terms of
throughput performance over operating frequency,
hardware consumption, and entropy, the proposed scheme
has obvious advantages. Finally, the generated
sequences show good randomness in the test of NIST
SP800-22 and Dieharder test suite and pass the entropy
estimation test kit NIST SP800-90B and AIS-31.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "64",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Anupreetham:2024:HTF,
author = "Anupreetham Anupreetham and Mohamed Ibrahim and Mathew
Hall and Andrew Boutros and Ajay Kuzhively and Abinash
Mohanty and Eriko Nurvitadhi and Vaughn Betz and Yu Cao
and Jae-Sun Seo",
title = "High Throughput {FPGA}-Based Object Detection via
Algorithm-Hardware Co-Design",
journal = j-TRETS,
volume = "17",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3634919",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Wed Mar 20 07:25:09 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3634919",
abstract = "Object detection and classification is a key task in
many computer vision applications such as smart
surveillance and autonomous vehicles. Recent advances
in deep learning have significantly improved the
quality of results achieved by these systems,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "1",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Fan:2024:HDF,
author = "Zimeng Fan and Wei Hu and Fang Liu and Dian Xu and
Hong Guo and Yanxiang He and Min Peng",
title = "A Hardware Design Framework for Computer Vision Models
Based on Reconfigurable Devices",
journal = j-TRETS,
volume = "17",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3635157",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Wed Mar 20 07:25:09 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3635157",
abstract = "In computer vision, the joint development of the
algorithm and computing dimensions cannot be separated.
Models and algorithms are constantly evolving, while
hardware designs must adapt to new or updated
algorithms. Reconfigurable devices are recognized
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "2",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Maschi:2024:SHS,
author = "Fabio Maschi and Gustavo Alonso",
title = "{Strega}: an {HTTP} Server for {FPGAs}",
journal = j-TRETS,
volume = "17",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3611312",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Wed Mar 20 07:25:09 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3611312",
abstract = "The computer architecture landscape is being reshaped
by the new opportunities, challenges, and constraints
brought by the cloud. On the one hand, high-level
applications profit from specialised hardware to boost
their performance and reduce deployment \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "3",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Qiu:2024:FFD,
author = "Yunhui Qiu and Yiqing Mao and Xuchen Gao and Sichao
Chen and Jiangnan Li and Wenbo Yin and Lingli Wang",
title = "{FDRA}: a Framework for a Dynamically Reconfigurable
Accelerator Supporting Multi-Level Parallelism",
journal = j-TRETS,
volume = "17",
number = "1",
pages = "4:1--4:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3614224",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Wed Mar 20 07:25:09 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3614224",
abstract = "Coarse-grained reconfigurable architectures (CGRAs)
have emerged as promising accelerators due to their
high flexibility and energy efficiency. However,
existing open source works often lack integration of
CGRAs with CPU systems and corresponding \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "4",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Kalomiros:2024:HAS,
author = "John Kalomiros and John Vourvoulakis and Stavros
Vologiannidis",
title = "A Hardware Accelerator for the Semi-Global Matching
Stereo Algorithm: an Efficient Implementation for the
{Stratix V} and {Zynq UltraScale+} {FPGA} Technology",
journal = j-TRETS,
volume = "17",
number = "1",
pages = "5:1--5:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3615869",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Wed Mar 20 07:25:09 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3615869",
abstract = "The semi-global matching stereo algorithm is a top
performing algorithm in stereo vision. The recursive
nature of the computations involved in this algorithm
introduces an inherent data dependency problem,
hindering the progressive computations of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "5",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Reis:2024:DDL,
author = "Miguel Reis and M{\'a}rio V{\'e}stias and Hor{\'a}cio
Neto",
title = "Designing Deep Learning Models on {FPGA} with Multiple
Heterogeneous Engines",
journal = j-TRETS,
volume = "17",
number = "1",
pages = "6:1--6:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3615870",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Wed Mar 20 07:25:09 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3615870",
abstract = "Deep learning models are becoming more complex and
heterogeneous with new layer types to improve their
accuracy. This brings a considerable challenge to the
designers of accelerators of deep neural networks.
There have been several architectures and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "6",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{FaoDeMoura:2024:RNL,
author = "Rafael {F{\~a}o De Moura} and Luigi Carro",
title = "Reprogrammable Non-Linear Circuits Using {ReRAM} for
{NN} Accelerators",
journal = j-TRETS,
volume = "17",
number = "1",
pages = "7:1--7:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3617894",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Wed Mar 20 07:25:09 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3617894",
abstract = "As the massive usage of artificial intelligence
techniques spreads in the economy, researchers are
exploring new techniques to reduce the energy
consumption of Neural Network (NN) applications,
especially as the complexity of NNs continues to
increase. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "7",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Honorat:2024:ABS,
author = "Alexandre Honorat and Micka{\"e}l Dardaillon and Hugo
Miomandre and Jean-Fran{\c{c}}ois Nezan",
title = "Automated Buffer Sizing of Dataflow Applications in a
High-level Synthesis Workflow",
journal = j-TRETS,
volume = "17",
number = "1",
pages = "8:1--8:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3626103",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Wed Mar 20 07:25:09 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3626103",
abstract = "High-Level Synthesis (HLS) tools are mature enough to
provide efficient code generation for computation
kernels on FPGA hardware. For more complex
applications, multiple kernels may be connected by a
dataflow graph. Although some tools, such as Xilinx
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "8",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Noyez:2024:MMS,
author = "Louis Noyez and Nadia {El Mrabet} and Olivier Potin
and Pascal V{\'e}ron",
title = "{Montgomery} Multiplication Scalable Systolic Designs
Optimized for {DSP48E2}",
journal = j-TRETS,
volume = "17",
number = "1",
pages = "9:1--9:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3624571",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Wed Mar 20 07:25:09 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3624571",
abstract = "This article describes an extensive study of the use
of DSP48E2 Slices in Ultrascale FPGAs to design
hardware versions of the Montgomery Multiplication
algorithm for the hardware acceleration of modular
multiplications. Our fully scalable systolic \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "9",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Soleimani:2024:PCA,
author = "Parastoo Soleimani and David W. Capson and Kin Fun
Li",
title = "A Partitioned {CAM} Architecture with {FPGA}
Acceleration for Binary Descriptor Matching",
journal = j-TRETS,
volume = "17",
number = "1",
pages = "10:1--10:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3624749",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Wed Mar 20 07:25:09 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3624749",
abstract = "An efficient architecture for image descriptor
matching that uses a partitioned content-addressable
memory (CAM)-based approach is proposed. CAM is
frequently used in high-speed content-matching
applications. However, due to its lack of functionality
to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "10",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Weng:2024:TAS,
author = "Olivia Weng and Gabriel Marcano and Vladimir Loncar
and Alireza Khodamoradi and Abarajithan G. and Nojan
Sheybani and Andres Meza and Farinaz Koushanfar and
Kristof Denolf and Javier Mauricio Duarte and Ryan
Kastner",
title = "{Tailor}: Altering Skip Connections for
Resource-Efficient Inference",
journal = j-TRETS,
volume = "17",
number = "1",
pages = "11:1--11:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3624990",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Wed Mar 20 07:25:09 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3624990",
abstract = "Deep neural networks use skip connections to improve
training convergence. However, these skip connections
are costly in hardware, requiring extra buffers and
increasing on- and off-chip memory utilization and
bandwidth requirements. In this article, we \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "11",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Hasler:2024:PAS,
author = "Jennifer Hasler and Cong Hao",
title = "Programmable Analog System Benchmarks Leading to
Efficient Analog Computation Synthesis",
journal = j-TRETS,
volume = "17",
number = "1",
pages = "12:1--12:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3625298",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Wed Mar 20 07:25:09 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3625298",
abstract = "This effort develops the first rich suite of analog
and mixed-signal benchmark of various sizes and
domains, intended for use with contemporary analog and
mixed-signal designs and synthesis tools. Benchmarking
enables analog-digital co-design exploration \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "12",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Gohringer:2024:IFS,
author = "Diana G{\"o}hringer and Georgios Keramidas and Akash
Kumar",
title = "Introduction to the {FPL 2021} Special Section",
journal = j-TRETS,
volume = "17",
number = "1",
pages = "13:1--13:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3635115",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Wed Mar 20 07:25:09 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3635115",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "13",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Nikolic:2024:EFS,
author = "Stefan Nikoli{\'c} and Paolo Ienne",
title = "Exploring {FPGA} Switch-Blocks without Explicitly
Listing Connectivity Patterns",
journal = j-TRETS,
volume = "17",
number = "1",
pages = "14:1--14:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3597417",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Wed Mar 20 07:25:09 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3597417",
abstract = "Increased lower metal resistance makes physical
aspects of Field-Programmable Gate Array (FPGA)
switch-blocks more relevant than before. The need to
navigate a design space where each individual switch
can have significant impact on the FPGA's performance
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "14",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Liu:2024:EFB,
author = "Zhengyan Liu and Qiang Liu and Shun Yan and Ray C. C.
Cheung",
title = "An Efficient {FPGA}-based Depthwise Separable
Convolutional Neural Network Accelerator with Hardware
Pruning",
journal = j-TRETS,
volume = "17",
number = "1",
pages = "15:1--15:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3615661",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Wed Mar 20 07:25:09 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3615661",
abstract = "Convolutional neural networks (CNNs) have been widely
deployed in computer vision tasks. However, the
computation and resource intensive characteristics of
CNN bring obstacles to its application on embedded
systems. This article proposes an efficient \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "15",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Chen:2024:EVL,
author = "Jeffrey Chen and Sang-Woo Jun and Sehwan Hong and
Warrick He and Jinyeong Moon",
title = "{Eciton}: Very Low-power Recurrent Neural Network
Accelerator for Real-time Inference at the Edge",
journal = j-TRETS,
volume = "17",
number = "1",
pages = "16:1--16:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3629979",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Wed Mar 20 07:25:09 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3629979",
abstract = "This article presents Eciton, a very low-power
recurrent neural network accelerator for time series
data within low-power edge sensor nodes, achieving
real-time inference with a power consumption of 17 mW
under load. Eciton reduces memory and chip \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "16",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Sani:2024:EIU,
author = "Sajjad Rostami Sani and Andy Ye",
title = "Evaluating the Impact of Using Multiple-Metal Layers
on the Layout Area of Switch Blocks for Tile-Based
{FPGAs} in {FinFET} 7nm",
journal = j-TRETS,
volume = "17",
number = "1",
pages = "17:1--17:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3639055",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Wed Mar 20 07:25:09 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3639055",
abstract = "A new area model for estimating the layout area of
switch blocks is introduced in this work. The model is
based on a realistic layout strategy. As a result, it
not only takes into consideration the active area that
is needed to construct a switch block \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "17",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Li:2024:ADC,
author = "Yonggen Li and Xin Li and Haibin Shen and Jicong Fan
and Yanfeng Xu and Kejie Huang",
title = "An All-digital Compute-in-memory {FPGA} Architecture
for Deep Learning Acceleration",
journal = j-TRETS,
volume = "17",
number = "1",
pages = "18:1--18:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3640469",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Wed Mar 20 07:25:09 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3640469",
abstract = "Field Programmable Gate Array (FPGA) is a versatile
and programmable hardware platform, which makes it a
promising candidate for accelerating Deep Neural
Networks (DNNs). However, FPGA's computing energy
efficiency is low due to the domination of energy
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "18",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Koch:2024:ISI,
author = "Andreas Koch and Kentaro Sano",
title = "Introduction to the Special Issue on {FPL 2022}",
journal = j-TRETS,
volume = "17",
number = "2",
pages = "19:1--19:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3643474",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 4 06:09:07 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3643474",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "19",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Jia:2024:XHP,
author = "Xijie Jia and Yu Zhang and Guangdong Liu and Xinlin
Yang and Tianyu Zhang and Jia Zheng and Dongdong Xu and
Zhuohuan Liu and Mengke Liu and Xiaoyang Yan and Hong
Wang and Rongzhang Zheng and Li Wang and Dong Li and
Satyaprakash Pareek and Jian Weng and Lu Tian and
Dongliang Xie and Hong Luo and Yi Shan",
title = "{XVDPU}: a High-Performance {CNN} Accelerator on the
Versal Platform Powered by the {AI} Engine",
journal = j-TRETS,
volume = "17",
number = "2",
pages = "20:1--20:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3617836",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 4 06:09:07 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3617836",
abstract = "Today, convolutional neural networks (CNNs) are widely
used in computer vision applications. However, the
trends of higher accuracy and higher resolution
generate larger networks. The requirements of
computation or I/O are the key bottlenecks. In this
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "20",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Xiao:2024:EEH,
author = "Yuanlong Xiao and Dongjoon Park and Zeyu Jason Niu and
Aditya Hota and Andr{\'e} DeHon",
title = "{ExHiPR}: Extended High-Level Partial Reconfiguration
for Fast Incremental {FPGA} Compilation",
journal = j-TRETS,
volume = "17",
number = "2",
pages = "21:1--21:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3617837",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 4 06:09:07 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3617837",
abstract = "Partial Reconfiguration (PR) is a key technique in the
application design on modern FPGAs. However, current PR
tools heavily rely on the developer to manually conduct
PR module definition, floorplanning, and flow control
at a low level. The existing PR \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "21",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Dann:2024:GSP,
author = "Jonas Dann and Daniel Ritter and Holger Fr{\"o}ning",
title = "{GraphScale}: Scalable Processing on {FPGAs} for {HBM}
and Large Graphs",
journal = j-TRETS,
volume = "17",
number = "2",
pages = "22:1--22:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3616497",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 4 06:09:07 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3616497",
abstract = "Recent advances in graph processing on FPGAs promise
to alleviate performance bottlenecks with irregular
memory access patterns. Such bottlenecks challenge
performance for a growing number of important
application areas like machine learning and data
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "22",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Khan:2024:OSD,
author = "Babar Khan and Carsten Heinz and Andreas Koch",
title = "The Open-source {DeLiBA2} Hardware\slash Software
Framework for Distributed Storage Accelerators",
journal = j-TRETS,
volume = "17",
number = "2",
pages = "23:1--23:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3624482",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 4 06:09:07 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3624482",
abstract = "With the trend towards ever larger ``big data''
applications, many of the gains achievable by using
specialized compute accelerators become diminished due
to the growing I/O overheads. While there have been
several research efforts into computational \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "23",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Trautmann:2024:DCE,
author = "Jens Trautmann and Paul Kr{\"u}ger and Andreas Becher
and Stefan Wildermann and J{\"u}rgen Teich",
title = "Design, Calibration, and Evaluation of Real-time
Waveform Matching on an {FPGA}-based Digitizer at {10
GS/s}",
journal = j-TRETS,
volume = "17",
number = "2",
pages = "24:1--24:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3635719",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 4 06:09:07 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3635719",
abstract = "Digitizing side-channel signals at high sampling rates
produces huge amounts of data, while side-channel
analysis techniques only need those specific trace
segments containing Cryptographic Operations (COs). For
detecting these segments, waveform-matching \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "24",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Yang:2024:HQO,
author = "Geng Yang and Jie Lei and Zhenman Fang and Yunsong Li
and Jiaqing Zhang and Weiying Xie",
title = "{HyBNN}: Quantifying and Optimizing Hardware
Efficiency of Binary Neural Networks",
journal = j-TRETS,
volume = "17",
number = "2",
pages = "25:1--25:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3631610",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 4 06:09:07 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3631610",
abstract = "Binary neural network (BNN), where both the weight and
the activation values are represented with one bit,
provides an attractive alternative to deploy highly
efficient deep learning inference on
resource-constrained edge devices. However, our
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "25",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Albartus:2024:MPX,
author = "Nils Albartus and Maik Ender and Jan-Niklas M{\"o}ller
and Marc Fyrbiak and Christof Paar and Russell
Tessier",
title = "On the Malicious Potential of {Xilinx}'s Internal
Configuration Access Port {(ICAP)}",
journal = j-TRETS,
volume = "17",
number = "2",
pages = "26:1--26:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3633204",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 4 06:09:07 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3633204",
abstract = "Field Programmable Gate Arrays (FPGAs) have become
increasingly popular in computing platforms. With
recent advances in bitstream format reverse
engineering, the scientific community has widely
explored static FPGA security threats. For example, it
is now \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "26",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Trochatos:2024:CCF,
author = "Theodoros Trochatos and Anthony Etim and Jakub
Szefer",
title = "Covert-channels in {FPGA}-enabled {SmartSSDs}",
journal = j-TRETS,
volume = "17",
number = "2",
pages = "27:1--27:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3635312",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 4 06:09:07 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3635312",
abstract = "Cloud computing providers today offer access to a
variety of devices, which users can rent and access
remotely in a shared setting. Among these devices are
SmartSSDs, which are solid-state disks (SSD) augmented
with an FPGA, enabling users to instantiate \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "27",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{DelSozzo:2024:ATS,
author = "Emanuele {Del Sozzo} and Davide Conficconi and Kentaro
Sano",
title = "Across Time and Space: {Senju}'s Approach for Scaling
Iterative Stencil Loop Accelerators on Single and
Multiple {FPGAs}",
journal = j-TRETS,
volume = "17",
number = "2",
pages = "28:1--28:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3634920",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 4 06:09:07 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3634920",
abstract = "Stencil-based applications play an essential role in
high-performance systems as they occur in numerous
computational areas, such as partial differential
equation solving. In this context, Iterative Stencil
Loops (ISLs) represent a prominent and
well-\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "28",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Bao:2024:AFI,
author = "Tianyou Bao and Pengzhou He and Jiafeng Xie and H. S.
Jacinto",
title = "{AEKA}: {FPGA} Implementation of Area-Efficient
{Karatsuba} Accelerator for Ring-Binary-{LWE}-Based
Lightweight {PQC}",
journal = j-TRETS,
volume = "17",
number = "2",
pages = "29:1--29:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3637215",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 4 06:09:07 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3637215",
abstract = "Lightweight PQC-related research and development have
gradually gained attention from the research community
recently. Ring-Binary-Learning-with-Errors
(RBLWE)-based encryption scheme (RBLWE-ENC), a
promising lightweight PQC based on small parameter sets
to fit related applications (but not in favor of
deploying popular fast algorithms like number theoretic
transform). To solve this problem, in this article, we
present a novel implementation of hardware acceleration
for RBLWE-ENC based on Karatsuba algorithm,
particularly on the field-programmable gate array
(FPGA) platform. In detail, we have proposed an
area-efficient Karatsuba Accelerator (AEKA) for
RBLWE-ENC, based on three layers of innovative efforts.
First of all, we reformulate the signal processing
sequence within the major arithmetic component of the
KA-based polynomial multiplication for RBLWE-ENC to
obtain a new algorithm. Then, we have designed the
proposed algorithm into a new hardware accelerator with
several novel algorithm-to-architecture mapping
techniques. Finally, we have conducted thorough
complexity analysis and comparison to demonstrate the
efficiency of the proposed accelerator, e.g., it
involves 62.5\% higher throughput and 60.2\% less
area-delay product (ADP) than the state-of-the-art
design for $ n = 512 $ (Virtex-7 device, similar
setup). The proposed AEKA design strategy is highly
efficient on the FPGA devices, i.e., small resource
usage with superior timing, which can be integrated
with other necessary systems for lightweight-oriented
high-performance applications (e.g., servers). The
outcome of this work is also expected to generate
impacts for lightweight PQC advancement.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "29",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Hossfeld:2024:HEC,
author = "Konstantin Ho{\ss}feld and Hans Jakob Damsgaard and
Jari Nurmi and Michaela Blott and Thomas B.
Preu{\ss}er",
title = "High-efficiency Compressor Trees for Latest {AMD
FPGAs}",
journal = j-TRETS,
volume = "17",
number = "2",
pages = "30:1--30:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3645097",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 4 06:09:07 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3645097",
abstract = "High-fan-in dot product computations are ubiquitous in
highly relevant application domains, such as signal
processing and machine learning. Particularly, the
diverse set of data formats used in machine learning
poses a challenge for flexible efficient \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "30",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Sahoo:2024:ADF,
author = "Siva Satyendra Sahoo and Salim Ullah and Akash Kumar",
title = "{AxOMaP}: Designing {FPGA}-based Approximate
Arithmetic Operators using Mathematical Programming",
journal = j-TRETS,
volume = "17",
number = "2",
pages = "31:1--31:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3648694",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 4 06:09:07 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3648694",
abstract = "With the increasing application of machine learning
(ML) algorithms in embedded systems, there is a rising
necessity to design low-cost computer arithmetic for
these resource-constrained systems. As a result,
emerging models of computation, such as approximate and
stochastic computing, that leverage the inherent
error-resilience of such algorithms are being actively
explored for implementing ML inference on
resource-constrained systems. Approximate computing
(AxC) aims to provide disproportionate gains in the
power, performance, and area (PPA) of an application by
allowing some level of reduction in its behavioral
accuracy (BEHAV). Using approximate operators (AxOs)
for computer arithmetic forms one of the more prevalent
methods of implementing AxC. AxOs provide the
additional scope for finer granularity of optimization,
compared to only precision scaling of computer
arithmetic. To this end, the design of
platform-specific and cost-efficient approximate
operators forms an important research goal. Recently,
multiple works have reported the use of AI\slash
ML-based approaches for synthesizing novel FPGA-based
AxOs. However, most of such works limit the use of
AI\slash ML to designing ML-based surrogate functions that
are used during iterative optimization processes. To
this end, we propose a novel data analysis-driven
mathematical programming-based approach to synthesizing
approximate operators for FPGAs. Specifically, we
formulate mixed integer quadratically constrained
programs based on the results of correlation analysis
of the characterization data and use the solutions to
enable a more directed search approach for evolutionary
optimization algorithms. Compared to traditional
evolutionary algorithms-based optimization, we report
up to 21\% improvement in the hypervolume, for joint
optimization of PPA and BEHAV, in the design of signed
8-bit multipliers. Further, we report up to 27\% better
hypervolume than other state-of-the-art approaches to
DSE for FPGA-based application-specific AxOs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "31",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Li:2024:SHP,
author = "Kexin Li and Shaoxian Xu and Zhiyuan Shao and Ran
Zheng and Xiaofei Liao and Hai Jin",
title = "{ScalaBFS2}: a High-performance {BFS} Accelerator on
an {HBM}-enhanced {FPGA} Chip",
journal = j-TRETS,
volume = "17",
number = "2",
pages = "32:1--32:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3650037",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 4 06:09:07 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3650037",
abstract = "The introduction of High Bandwidth Memory (HBM) to the
FPGA chip makes it possible for an FPGA-based
accelerator to leverage the huge memory bandwidth of
HBM to improve its performance when implementing a
specific algorithm, which is especially true for
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "32",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Keilbart:2024:DIC,
author = "Chris Keilbart and Yuhui Gao and Martin Chua and Eric
Matthews and Steven J. E. Wilton and Lesley Shannon",
title = "Designing an {IEEE}-Compliant {FPU} that Supports
Configurable Precision for Soft Processors",
journal = j-TRETS,
volume = "17",
number = "2",
pages = "33:1--33:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3650036",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 4 06:09:07 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/risc-v.bib;
https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3650036",
abstract = "Field Programmable Gate Arrays (FPGAs) are commonly
used to accelerate floating-point (FP) applications.
Although researchers have extensively studied FPGA FP
implementations, existing work has largely focused on
standalone operators and frequency-optimized designs.
These works are not suitable for FPGA soft processors
which are more sensitive to latency, impose a lower
frequency ceiling, and require IEEE FP standard
compliance. We present an open-source floating-point
unit (FPU) for FPGA RISC-V soft processors that is
fully IEEE compliant with configurable levels of FP
precision. Our design emphasizes runtime performance
with 25\% lower latency in the most common instructions
compared to previous works while maintaining efficient
resource utilization.\par
Our FPU also allows users to explore various mantissa
widths without having to rewrite or recompile their
algorithms. We use this to investigate the scalability
of our reduced-precision FPU across numerous
microbenchmark functions as well as more complex case
studies. Our experiments show that applications like
the discrete cosine transformation and the
Black--Scholes model can realize a speedup of more than
1.35x in conjunction with a 43\% and 35\% reduction in
lookup table and flip-flop resources while experiencing
less than a 0.025\% average loss in numerical accuracy
with a 16-bit mantissa width.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "33",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{deBruin:2024:RBE,
author = "Barry de Bruin and Kanishkan Vadivel and Mark
Wijtvliet and Pekka J{\"a}{\"a}skel{\"a}inen and Henk
Corporaal",
title = "{R-Blocks}: an Energy-Efficient, Flexible, and
Programmable {CGRA}",
journal = j-TRETS,
volume = "17",
number = "2",
pages = "34:1--34:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3656642",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 4 06:09:07 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3656642",
abstract = "Emerging data-driven applications in the embedded,
e-Health, and internet of things (IoT) domain require
complex on-device signal analysis and data reduction to
maximize energy efficiency on these energy-constrained
devices. Coarse-grained reconfigurable \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "34",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Chen:2024:HNF,
author = "Sichao Chen and Chang Cai and Su Zheng and Jiangnan Li
and Guowei Zhu and Jingyuan Li and Yazhou Yan and Yuan
Dai and Wenbo Yin and Lingli Wang",
title = "{HierCGRA}: a Novel Framework for Large-scale {CGRA}
with Hierarchical Modeling and Automated Design Space
Exploration",
journal = j-TRETS,
volume = "17",
number = "2",
pages = "35:1--35:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3656176",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Jun 4 06:09:07 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3656176",
abstract = "Coarse-grained reconfigurable arrays (CGRAs) are
promising design choices in computation-intensive
domains, since they can strike a balance between energy
efficiency and flexibility. A typical CGRA comprises
processing elements (PEs) that can execute \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "35",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Campos:2024:EEC,
author = "Javier Campos and Jovan Mitrevski and Nhan Tran and
Zhen Dong and Amir Gholaminejad and Michael W. Mahoney
and Javier Duarte",
title = "End-to-end Codesign of {Hessian}-aware Quantized
Neural Networks for {FPGAs}",
journal = j-TRETS,
volume = "17",
number = "3",
pages = "36:1--36:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3662000",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Oct 1 11:41:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3662000",
abstract = "We develop an end-to-end workflow for the training and
implementation of co-designed neural networks (NNs) for
efficient field-programmable gate array (FPGA)
hardware. Our approach leverages Hessian-aware
quantization of NNs, the Quantized Open Neural
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "36",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Haase:2024:NLE,
author = "Julian Haase and Najdet Charaf and Alexander Gro{\ss}
and Diana G{\"o}hringer",
title = "{NC-Library}: Expanding {SystemC} Capabilities for
Nested {reConfigurable} Hardware Modelling",
journal = j-TRETS,
volume = "17",
number = "3",
pages = "37:1--37:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3662001",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Oct 1 11:41:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3662001",
abstract = "As runtime reconfiguration is used in an increasing
number of hardware architectures, new simulation and
modeling tools are needed to support the developer
during the design phases. In this article, a language
extension for SystemC is presented, together \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "37",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Xu:2024:ESA,
author = "Shiyao Xu and Jingfei Jiang and Jinwei Xu and Xifu
Qian",
title = "Efficient {SpMM} Accelerator for Deep Learning:
Sparkle and Its Automated Generator",
journal = j-TRETS,
volume = "17",
number = "3",
pages = "38:1--38:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3665896",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Oct 1 11:41:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3665896",
abstract = "Deep learning (DL) technology has made breakthroughs
in a wide range of intelligent tasks, such as vision,
language, recommendation systems, and so on. Sparse
matrix multiplication (SpMM) is the key computation
kernel of most sparse models. Conventional \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "38",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Lu:2024:SAA,
author = "Alec Lu and Jahanvi Narendra Agrawal and Zhenman
Fang",
title = "{SQL2FPGA}: Automated Acceleration of {SQL} Query
Processing on Modern {CPU-FPGA} Platforms",
journal = j-TRETS,
volume = "17",
number = "3",
pages = "39:1--39:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3674843",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Oct 1 11:41:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3674843",
abstract = "Today's big data query engines are constantly under
pressure to keep up with the rapidly increasing demand
for faster processing of more complex workloads. In the
past few years, FPGA-based database acceleration
efforts have demonstrated promising \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "39",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Hirtum:2024:CND,
author = "Lennart {Van Hirtum} and Patrick {De Causmaecker} and
Jens Goemaere and Tobias Kenter and Heinrich Riebler
and Michael Lass and Christian Plessl",
title = "A Computation of the Ninth {Dedekind} Number Using
{FPGA} Supercomputing",
journal = j-TRETS,
volume = "17",
number = "3",
pages = "40:1--40:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3674147",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Oct 1 11:41:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3674147",
abstract = "This manuscript makes the claim of having computed the
\(9\)th Dedekind number, D(9). This was done by
accelerating the core operation of the process with an
efficient FPGA design that outperforms an optimized
64-core CPU reference by 95 \(\times\). \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "40",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Carril:2024:HAH,
author = "Xavier Carril and Charalampos Kardaris and Jordi
Ribes-Gonz{\'a}lez and Oriol Farr{\`a}s and Carles
Hernandez and Vatistas Kostalabros and Joel Ulises
Gonz{\'a}lez-Jim{\'e}nez and Miquel Moret{\'o}",
title = "Hardware Acceleration for High-Volume Operations of
{CRYSTALS-Kyber} and {CRYSTALS-Dilithium}",
journal = j-TRETS,
volume = "17",
number = "3",
pages = "41:1--41:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3675172",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Oct 1 11:41:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3675172",
abstract = "Many high-demand digital services need to perform
several cryptographic operations, such as key exchange
or security credentialing, in a concise amount of time.
In turn, the security of some of these cryptographic
schemes is threatened by advances in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "41",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Khatti:2024:PPA,
author = "Moazin Khatti and Xingyu Tian and Ahmad Sedigh
Baroughi and Akhil Raj Baranwal and Yuze Chi and
Licheng Guo and Jason Cong and Zhenman Fang",
title = "{PASTA}: Programming and Automation Support for
Scalable Task-Parallel {HLS} Programs on Modern
Multi-Die {FPGAs}",
journal = j-TRETS,
volume = "17",
number = "3",
pages = "42:1--42:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3676849",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Oct 1 11:41:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3676849",
abstract = "In recent years, the adoption of FPGAs in datacenters
has increased, with a growing number of users choosing
High-Level Synthesis (HLS) as their preferred
programming method. While HLS simplifies FPGA
programming, one notable challenge arises when scaling
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "42",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Fahmy:2024:ISS,
author = "Suhaib A. Fahmy and Jason D. Bakos",
title = "Introduction to the Special Section on {FPGA 2023}",
journal = j-TRETS,
volume = "17",
number = "3",
pages = "43:1--43:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3695841",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Oct 1 11:41:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3695841",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "43",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Gribok:2024:CCP,
author = "Sergey Gribok and Bogdan Pasca and Martin Langhammer",
title = "{CSAIL2019} Crypto-Puzzle Solver Architecture",
journal = j-TRETS,
volume = "17",
number = "3",
pages = "44:1--44:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3639056",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Oct 1 11:41:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3639056",
abstract = "The CSAIL2019 time-lock puzzle is an unsolved
cryptographic challenge introduced by Ron Rivest in
2019, replacing the solved LCS35 puzzle. Solving these
types of puzzles requires large amounts of
intrinsically sequential computations, with each
iteration \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "44",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Wong:2024:DDF,
author = "Linus Y. Wong and Jialiang Zhang and Jing Li",
title = "{DONGLE 2.0}: Direct {FPGA}-Orchestrated {NVMe}
Storage for {HLS}",
journal = j-TRETS,
volume = "17",
number = "3",
pages = "45:1--45:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3650038",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Oct 1 11:41:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3650038",
abstract = "Rapid growth in data size poses significant
computational and memory challenges to data processing.
FPGA accelerators and near-storage processing have
emerged as compelling solutions for tackling the
growing computational and memory requirements. Many
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "45",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Liu:2024:FAL,
author = "Chaoqiang Liu and Xiaofei Liao and Long Zheng and Yu
Huang and Haifeng Liu and Yi Zhang and Haiheng He and
Haoyan Huang and Jingyi Zhou and Hai Jin",
title = "{L-FNNG}: Accelerating Large-Scale {KNN} Graph
Construction on {CPU--FPGA} Heterogeneous Platform",
journal = j-TRETS,
volume = "17",
number = "3",
pages = "46:1--46:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3652609",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Oct 1 11:41:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3652609",
abstract = "Due to the high complexity of constructing exact
$k$-nearest neighbor graphs, approximate construction has
become a popular research topic. The NN-Descent
algorithm is one of the representative in-memory
algorithms. To effectively handle large datasets,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "46",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Du:2024:FFA,
author = "Linfeng Du and Tingyuan Liang and Xiaofeng Zhou and
Jinming Ge and Shangkun Li and Sharad Sinha and Jieru
Zhao and Zhiyao Xie and Wei Zhang",
title = "{FADO}: Floorplan-Aware Directive Optimization Based
on Synthesis and Analytical Models for High-Level
Synthesis Designs on Multi-Die {FPGAs}",
journal = j-TRETS,
volume = "17",
number = "3",
pages = "47:1--47:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3653458",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Oct 1 11:41:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3653458",
abstract = "Multi-die FPGAs are widely adopted for large-scale
accelerators, but optimizing high-level synthesis
designs on these FPGAs faces two challenges. First, the
delay caused by die-crossing nets creates an NP-hard
floorplanning problem. Second, traditional \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "47",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Jaiyeoba:2024:DAD,
author = "Oluwole Jaiyeoba and Kevin Skadron",
title = "{Dynamic-ACTS} --- A Dynamic Graph Analytics
Accelerator For {HBM}-Enabled {FPGAs}",
journal = j-TRETS,
volume = "17",
number = "3",
pages = "48:1--48:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3662002",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Oct 1 11:41:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3662002",
abstract = "Graph processing frameworks suffer performance
degradation from under-utilization of available memory
bandwidth, because graph traversal often exhibits poor
locality. A prior work, ACTS [24], accelerates graph
processing with FPGAs and High Bandwidth \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "48",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Drewes:2024:TTL,
author = "Colin Drewes and Tyler Sheaves and Olivia Weng and
Keegan Ryan and Bill Hunter and Christopher McCarty and
Ryan Kastner and Dustin Richmond",
title = "Turn on, Tune in, and Listen up: Maximizing
Side-Channel Recovery in Cross-Platform Time-to-Digital
Converters",
journal = j-TRETS,
volume = "17",
number = "3",
pages = "49:1--49:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3666092",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Oct 1 11:41:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3666092",
abstract = "Voltage fluctuation sensors measure minute changes in
an FPGA power distribution network, allowing attackers
to extract information from concurrently executing
computations. Previous voltage fluctuation sensors make
assumptions about the co-tenant \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "49",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Wilson:2024:IFT,
author = "Andrew Elbert Wilson and Nathan Baker and Ethan
Campbell and Michael Wirthlin",
title = "Improving Fault Tolerance for {FPGA SoCs} through
Post-Radiation Design Analysis",
journal = j-TRETS,
volume = "17",
number = "3",
pages = "50:1--50:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3674841",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Oct 1 11:41:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3674841",
abstract = "FPGAs have been shown to operate reliably within harsh
radiation environments by employing single-event upset
(SEU) mitigation techniques, such as configuration
scrubbing, triple-modular redundancy, error correction
coding, and radiation aware \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "50",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}
@Article{Zhuang:2024:CCH,
author = "Jinming Zhuang and Jason Lau and Hanchen Ye and
Zhuoping Yang and Shixin Ji and Jack Lo and Kristof
Denolf and Stephen Neuendorffer and Alex Jones and
Jingtong Hu and Yiyu Shi and Deming Chen and Jason Cong
and Peipei Zhou",
title = "{CHARM 2.0}: Composing Heterogeneous Accelerators for
Deep Learning on Versal {ACAP} Architecture",
journal = j-TRETS,
volume = "17",
number = "3",
pages = "51:1--51:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3686163",
ISSN = "1936-7406 (print), 1936-7414 (electronic)",
ISSN-L = "1936-7406",
bibdate = "Tue Oct 1 11:41:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/trets.bib",
URL = "https://dl.acm.org/doi/10.1145/3686163",
abstract = "Dense matrix multiply (MM) serves as one of the most
heavily used kernels in deep learning applications. To
cope with the high computation demands of these
applications, heterogeneous architectures featuring
both FPGA and dedicated ASIC accelerators have
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Reconfigurable Technol. Syst.",
articleno = "51",
fjournal = "ACM Transactions on Reconfigurable Technology and
Systems (TRETS)",
journal-URL = "https://dl.acm.org/loi/trets",
}