@Preamble{"\input bibnames.sty" #
"\def \TM {${}^{\sc TM}$}"
}
@String{ack-nhfb = "Nelson H. F. Beebe,
University of Utah,
Department of Mathematics, 110 LCB,
155 S 1400 E RM 233,
Salt Lake City, UT 84112-0090, USA,
Tel: +1 801 581 5254,
FAX: +1 801 581 4148,
e-mail: \path|beebe@math.utah.edu|,
\path|beebe@acm.org|,
\path|beebe@computer.org| (Internet),
URL: \path|https://www.math.utah.edu/~beebe/|"}
@String{j-TACO = "ACM Transactions on Architecture and
Code Optimization"}
@Article{Calder:2004:I,
author = "Brad Calder and Dean Tullsen",
title = "Introduction",
journal = j-TACO,
volume = "1",
number = "1",
pages = "1--2",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 5 07:08:09 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2004:RIC,
author = "W. Zhang and J. S. Hu and V. Degalahal and M. Kandemir
and N. Vijaykrishnan and M. J. Irwin",
title = "Reducing instruction cache energy consumption using a
compiler-based strategy",
journal = j-TACO,
volume = "1",
number = "1",
pages = "3--33",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 5 07:08:09 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Isailovic:2004:DCQ,
author = "Nemanja Isailovic and Mark Whitney and Yatish Patel
and John Kubiatowicz and Dean Copsey and Frederic T.
Chong and Isaac L. Chuang and Mark Oskin",
title = "Datapath and control for quantum wires",
journal = j-TACO,
volume = "1",
number = "1",
pages = "34--61",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 5 07:08:09 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sankaralingam:2004:TPA,
author = "Karthikeyan Sankaralingam and Ramadass Nagarajan and
Haiming Liu and Changkyu Kim and Jaehyuk Huh and Nitya
Ranganathan and Doug Burger and Stephen W. Keckler and
Robert G. McDonald and Charles R. Moore",
title = "{TRIPS}: a polymorphous architecture for exploiting
{ILP}, {TLP}, and {DLP}",
journal = j-TACO,
volume = "1",
number = "1",
pages = "62--93",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 5 07:08:09 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Skadron:2004:TAM,
author = "Kevin Skadron and Mircea R. Stan and Karthik
Sankaranarayanan and Wei Huang and Sivakumar Velusamy
and David Tarjan",
title = "Temperature-aware microarchitecture: {Modeling} and
implementation",
journal = j-TACO,
volume = "1",
number = "1",
pages = "94--125",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 5 07:08:09 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Aleta:2004:RCC,
author = "Alex Alet{\`a} and Josep M. Codina and Antonio
Gonz{\'a}lez and David Kaeli",
title = "Removing communications in clustered
microarchitectures through instruction replication",
journal = j-TACO,
volume = "1",
number = "2",
pages = "127--151",
month = jun,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 5 07:08:10 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bai:2004:LPO,
author = "Yu Bai and R. Iris Bahar",
title = "A low-power in-order\slash out-of-order issue queue",
journal = j-TACO,
volume = "1",
number = "2",
pages = "152--179",
month = jun,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 5 07:08:10 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Juang:2004:IBP,
author = "Philo Juang and Kevin Skadron and Margaret Martonosi
and Zhigang Hu and Douglas W. Clark and Philip W.
Diodato and Stefanos Kaxiras",
title = "Implementing branch-predictor decay using quasi-static
memory cells",
journal = j-TACO,
volume = "1",
number = "2",
pages = "180--219",
month = jun,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 5 07:08:10 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Santana:2004:LCF,
author = "Oliverio J. Santana and Alex Ramirez and Josep L.
Larriba-Pey and Mateo Valero",
title = "A low-complexity fetch architecture for
high-performance superscalar processors",
journal = j-TACO,
volume = "1",
number = "2",
pages = "220--245",
month = jun,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 5 07:08:10 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lin:2004:CFS,
author = "Jin Lin and Tong Chen and Wei-Chung Hsu and Pen-Chung
Yew and Roy Dz-Ching Ju and Tin-Fook Ngai and Sun
Chan",
title = "A compiler framework for speculative optimizations",
journal = j-TACO,
volume = "1",
number = "3",
pages = "247--271",
month = sep,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Oct 29 06:39:45 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fields:2004:ICS,
author = "Brian A. Fields and Rastislav Bodik and Mark D. Hill
and Chris J. Newburn",
title = "Interaction cost and shotgun profiling",
journal = j-TACO,
volume = "1",
number = "3",
pages = "272--304",
month = sep,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Oct 29 06:39:45 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sankaranarayanan:2004:PBA,
author = "Karthik Sankaranarayanan and Kevin Skadron",
title = "Profile-based adaptation for cache decay",
journal = j-TACO,
volume = "1",
number = "3",
pages = "305--322",
month = sep,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Oct 29 06:39:45 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xie:2004:IDV,
author = "Fen Xie and Margaret Martonosi and Sharad Malik",
title = "Intraprogram dynamic voltage scaling: {Bounding}
opportunities with analytic modeling",
journal = j-TACO,
volume = "1",
number = "3",
pages = "323--367",
month = sep,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Oct 29 06:39:45 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hartstein:2004:OPD,
author = "A. Hartstein and Thomas R. Puzak",
title = "The optimum pipeline depth considering both power and
performance",
journal = j-TACO,
volume = "1",
number = "4",
pages = "369--388",
month = dec,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Apr 14 12:17:47 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cristal:2004:TKI,
author = "Adri{\'a}n Cristal and Oliverio J. Santana and Mateo
Valero and Jos{\'e} F. Mart{\'\i}nez",
title = "Toward kilo-instruction processors",
journal = j-TACO,
volume = "1",
number = "4",
pages = "389--417",
month = dec,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Apr 14 12:17:47 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Akkary:2004:ARE,
author = "Haitham Akkary and Ravi Rajwar and Srikanth T.
Srinivasan",
title = "An analysis of a resource efficient checkpoint
architecture",
journal = j-TACO,
volume = "1",
number = "4",
pages = "418--444",
month = dec,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Apr 14 12:17:47 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yang:2004:TML,
author = "Chia-Lin Yang and Alvin R. Lebeck and Hung-Wei Tseng
and Chien-Hao Lee",
title = "Tolerating memory latency through push prefetching for
pointer-intensive applications",
journal = j-TACO,
volume = "1",
number = "4",
pages = "445--475",
month = dec,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Apr 14 12:17:47 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Calder:2005:I,
author = "Brad Calder and Dean Tullsen",
title = "Introduction",
journal = j-TACO,
volume = "2",
number = "1",
pages = "1--2",
month = mar,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 2 11:13:58 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhou:2005:EFA,
author = "Yuanyuan Zhou and Pin Zhou and Feng Qin and Wei Liu
and Josep Torrellas",
title = "Efficient and flexible architectural support for
dynamic monitoring",
journal = j-TACO,
volume = "2",
number = "1",
pages = "3--33",
month = mar,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 2 11:13:58 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2005:WHC,
author = "Chuanjun Zhang and Frank Vahid and Jun Yang and Walid
Najjar",
title = "A way-halting cache for low-energy high-performance
systems",
journal = j-TACO,
volume = "2",
number = "1",
pages = "34--54",
month = mar,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 2 11:13:58 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Abella:2005:ISP,
author = "Jaume Abella and Antonio Gonz{\'a}lez and Xavier Vera
and Michael F. P. O'Boyle",
title = "{IATAC}: a smart predictor to turn-off {L2} cache
lines",
journal = j-TACO,
volume = "2",
number = "1",
pages = "55--77",
month = mar,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 2 11:13:58 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Haskins:2005:AWS,
author = "John W. {Haskins, Jr.} and Kevin Skadron",
title = "Accelerated warmup for sampled microarchitecture
simulation",
journal = j-TACO,
volume = "2",
number = "1",
pages = "78--108",
month = mar,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 2 11:13:58 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2005:ABT,
author = "Tao Li and Ravi Bhargava and Lizy Kurian John",
title = "Adapting branch-target buffer to improve the target
predictability of {Java} code",
journal = j-TACO,
volume = "2",
number = "2",
pages = "109--130",
month = jun,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jul 7 14:09:53 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2005:DIE,
author = "Lingli Zhang and Chandra Krintz",
title = "The design, implementation, and evaluation of adaptive
code unloading for resource-constrained devices",
journal = j-TACO,
volume = "2",
number = "2",
pages = "131--164",
month = jun,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jul 7 14:09:53 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kulkarni:2005:FES,
author = "Prasad A. Kulkarni and Stephen R. Hines and David B.
Whalley and Jason D. Hiser and Jack W. Davidson and
Douglas L. Jones",
title = "Fast and efficient searches for effective
optimization-phase sequences",
journal = j-TACO,
volume = "2",
number = "2",
pages = "165--198",
month = jun,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jul 7 14:09:53 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Salami:2005:DMI,
author = "Esther Salam{\'\i} and Mateo Valero",
title = "Dynamic memory interval test vs. interprocedural
pointer analysis in multimedia applications",
journal = j-TACO,
volume = "2",
number = "2",
pages = "199--219",
month = jun,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jul 7 14:09:53 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Meng:2005:ELL,
author = "Yan Meng and Timothy Sherwood and Ryan Kastner",
title = "Exploring the limits of leakage power reduction in
caches",
journal = j-TACO,
volume = "2",
number = "3",
pages = "221--246",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Oct 5 07:42:22 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Garzaran:2005:TBS,
author = "Mar{\'\i}a Jes{\'u}s Garzar{\'a}n and Milos Prvulovic
and Jos{\'e} Mar{\'\i}a Llaber{\'\i}a and V{\'\i}ctor
Vi{\~n}als and Lawrence Rauchwerger and Josep
Torrellas",
title = "Tradeoffs in buffering speculative memory state for
thread-level speculation in multiprocessors",
journal = j-TACO,
volume = "2",
number = "3",
pages = "247--279",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Oct 5 07:42:22 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tarjan:2005:MPG,
author = "David Tarjan and Kevin Skadron",
title = "Merging path and gshare indexing in perceptron branch
prediction",
journal = j-TACO,
volume = "2",
number = "3",
pages = "280--300",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Oct 5 07:42:22 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2005:WET,
author = "Xiangyu Zhang and Rajiv Gupta",
title = "Whole execution traces and their applications",
journal = j-TACO,
volume = "2",
number = "3",
pages = "301--334",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Oct 5 07:42:22 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhao:2005:IWA,
author = "Wankang Zhao and David Whalley and Christopher Healy
and Frank Mueller",
title = "Improving {WCET} by applying a {WC} code-positioning
optimization",
journal = j-TACO,
volume = "2",
number = "4",
pages = "335--365",
month = dec,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Feb 16 11:03:13 MST 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "WC (worst case); WCET (worst case execution time)",
}
@Article{Reis:2005:SCF,
author = "George A. Reis and Jonathan Chang and Neil
Vachharajani and Ram Rangan and David I. August and
Shubhendu S. Mukherjee",
title = "Software-controlled fault tolerance",
journal = j-TACO,
volume = "2",
number = "4",
pages = "366--396",
month = dec,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Feb 16 11:03:13 MST 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2005:PPC,
author = "Jian Li and Jos{\'e} F. Mart{\'\i}nez",
title = "Power-performance considerations of parallel computing
on chip multiprocessors",
journal = j-TACO,
volume = "2",
number = "4",
pages = "397--422",
month = dec,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Feb 16 11:03:13 MST 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sharma:2005:SPE,
author = "Saurabh Sharma and Jesse G. Beu and Thomas M. Conte",
title = "Spectral prefetcher: {An} effective mechanism for {L2}
cache prefetching",
journal = j-TACO,
volume = "2",
number = "4",
pages = "423--450",
month = dec,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Feb 16 11:03:13 MST 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Calder:2006:I,
author = "Brad Calder and Dean Tullsen",
title = "Introduction",
journal = j-TACO,
volume = "3",
number = "1",
pages = "1--2",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 18 08:38:26 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tan:2006:BSS,
author = "Lin Tan and Brett Brotherton and Timothy Sherwood",
title = "Bit-split string-matching engines for intrusion
detection and prevention",
journal = j-TACO,
volume = "3",
number = "1",
pages = "3--34",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 18 08:38:26 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Nagpurkar:2006:ERP,
author = "Priya Nagpurkar and Hussam Mousa and Chandra Krintz
and Timothy Sherwood",
title = "Efficient remote profiling for resource-constrained
devices",
journal = j-TACO,
volume = "3",
number = "1",
pages = "35--66",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 18 08:38:26 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lin:2006:RCG,
author = "Jin Lin and Wei-Chung Hsu and Pen-Chung Yew and Roy
Dz-Ching Ju and Tin-Fook Ngai",
title = "Recovery code generation for general speculative
optimizations",
journal = j-TACO,
volume = "3",
number = "1",
pages = "67--89",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 18 08:38:26 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Choi:2006:ORR,
author = "Yoonseo Choi and Hwansoo Han",
title = "Optimal register reassignment for register stack
overflow minimization",
journal = j-TACO,
volume = "3",
number = "1",
pages = "90--114",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 18 08:38:26 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xue:2006:LOA,
author = "Jingling Xue and Qiong Cai",
title = "A lifetime optimal algorithm for speculative {PRE}",
journal = j-TACO,
volume = "3",
number = "2",
pages = "115--155",
month = jun,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jun 9 06:47:22 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sharkey:2006:IPT,
author = "Joseph J. Sharkey and Dmitry V. Ponomarev and Kanad
Ghose and Oguz Ergin",
title = "Instruction packing: {Toward} fast and
energy-efficient instruction scheduling",
journal = j-TACO,
volume = "3",
number = "2",
pages = "156--181",
month = jun,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jun 9 06:47:22 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ceze:2006:CUC,
author = "Luis Ceze and Karin Strauss and James Tuck and Josep
Torrellas and Jose Renau",
title = "{CAVA}: {Using} checkpoint-assisted value prediction
to hide {L2} misses",
journal = j-TACO,
volume = "3",
number = "2",
pages = "182--208",
month = jun,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jun 9 06:47:22 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2006:EAR,
author = "Lixin Zhang and Mike Parker and John Carter",
title = "Efficient address remapping in distributed
shared-memory systems",
journal = j-TACO,
volume = "3",
number = "2",
pages = "209--229",
month = jun,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jun 9 06:47:22 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhao:2006:ATP,
author = "Min Zhao and Bruce R. Childers and Mary Lou Soffa",
title = "An approach toward profit-driven optimization",
journal = j-TACO,
volume = "3",
number = "3",
pages = "231--262",
month = sep,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1145/1162690.1162691",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 23 07:54:36 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Although optimizations have been applied for a number
of years to improve the performance of software,
problems with respect to the application of
optimizations have not been adequately addressed. For
example, in certain circumstances, optimizations may
degrade performance. However, there is no efficient way
to know when a degradation will occur. In this
research, we investigate the profitability of
optimizations, which is useful for determining the
benefit of applying optimizations. We develop a
framework that enables us to predict profitability
using analytic models. The profitability of an
optimization depends on code context, the particular
optimization, and machine resources. Thus, our
framework has analytic models for each of these
components. As part of the framework, there is also a
profitability engine that uses models to predict the
profit. In this paper, we target scalar optimizations
and, in particular, describe the models for partial
redundancy elimination (PRE), loop invariant code
motion (LICM), and value numbering (VN). We implemented
the framework for predicting the profitability of these
optimizations. Based on the predictions, we can
selectively apply profitable optimizations. We compared
the profit-driven approach with an approach that uses a
heuristic in deciding when optimizations should be
applied. Our experiments demonstrate that the
profitability of scalar optimizations can be accurately
predicted by using models. That is, without actually
applying a scalar optimization, we can determine if an
optimization is beneficial and should be applied.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hazelwood:2006:MBC,
author = "Kim Hazelwood and Michael D. Smith",
title = "Managing bounded code caches in dynamic binary
optimization systems",
journal = j-TACO,
volume = "3",
number = "3",
pages = "263--294",
month = sep,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1145/1162690.1162692",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 23 07:54:36 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Dynamic binary optimizers store altered copies of
original program instructions in software-managed code
caches in order to maximize reuse of transformed code.
Code caches store code blocks that may vary in size,
reference other code blocks, and carry a high
replacement overhead. These unique constraints reduce
the effectiveness of conventional cache management
policies. Our work directly addresses these unique
constraints and presents several contributions to the
code-cache management problem. First, we show that
evicting more than the minimum number of code blocks
from the code cache results in less run-time overhead
than the existing alternatives. Such granular evictions
reduce overall execution time, as the fixed costs of
invoking the eviction mechanism are amortized across
multiple cache insertions. Second, a study of the ideal
lifetimes of dynamically generated code blocks
illustrates the benefit of a replacement algorithm
based on a generational heuristic. We describe and
evaluate a generational approach to code cache
management that makes it easy to identify long-lived
code blocks and simultaneously avoid any fragmentation
because of the eviction of short-lived blocks. Finally,
we present results from an implementation of our
generational approach in the DynamoRIO framework and
illustrate that, as dynamic optimization systems become
more prevalent, effective code cache-management
policies will be essential for reliable, scalable
performance of modern applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Rochecouste:2006:CCE,
author = "Olivier Rochecouste and Gilles Pokam and Andr{\'e}
Seznec",
title = "A case for a complexity-effective, width-partitioned
microarchitecture",
journal = j-TACO,
volume = "3",
number = "3",
pages = "295--326",
month = sep,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1145/1162690.1162693",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 23 07:54:36 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The analysis of program executions reveals that most
integer and multimedia applications make heavy use of
narrow-width operations, i.e., instructions exclusively
using narrow-width operands and producing a
narrow-width result. Moreover, this usage is relatively
well distributed over the application. We observed this
program property on the MediaBench and SPEC2000
benchmarks with about 40\% of the instructions being
narrow-width operations. Current superscalar processors
use 64-bit datapaths to execute all the instructions of
the applications. In this paper, we suggest the use of
a width-partitioned microarchitecture (WPM) to master
the hardware complexity of a superscalar processor. For
a four-way issue machine, we split the processor in two
two-way clusters: the main cluster executing 64-bit
operations, load/store, and complex operations and a
narrow cluster executing the 16-bit operations. We
resort to partitioning to decouple the treatment of the
narrow-width operations from that of the other program
instructions. This provides the benefit of greatly
simplifying the design of the critical processor
components in each cluster (e.g., the register file and
the bypass network). The dynamic interleaving of the
two instruction types allows maintaining the workload
balanced among clusters. WPM also helps to reduce the
complexity of the interconnection fabric and of the
issue logic. In fact, since the 16-bit cluster can only
communicate narrow-width data, the datapath-width of
the interconnect fabric can be significantly reduced,
yielding a corresponding saving of the interconnect
power and area. We explore different possible
configurations of WPM, discussing the various
implementation tradeoffs. We also examine a speculative
steering heuristic to distribute the narrow-width
operations among clusters. A detailed analysis of the
complexity factors shows using WPM instead of a
classical 64-bit two-cluster microarchitecture can save
power and silicon area with a minimal impact on the
overall performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zmily:2006:BAI,
author = "Ahmad Zmily and Christos Kozyrakis",
title = "Block-aware instruction set architecture",
journal = j-TACO,
volume = "3",
number = "3",
pages = "327--357",
month = sep,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1145/1162690.1162694",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 23 07:54:36 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Instruction delivery is a critical component for
wide-issue, high-frequency processors since its
bandwidth and accuracy place an upper limit on
performance. The processor front-end accuracy and
bandwidth are limited by instruction-cache misses,
multicycle instruction-cache accesses, and target or
direction mispredictions for control-flow operations.
This paper presents a block-aware instruction set
(BLISS) that allows software to assist with front-end
challenges. BLISS defines basic block descriptors that
are stored separately from the actual instructions in a
program. We show that BLISS allows for a decoupled
front-end that tolerates instruction-cache latency,
facilitates instruction prefetching, and leads to
higher prediction accuracy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Crandall:2006:MAS,
author = "Jedidiah R. Crandall and S. Felix Wu and Frederic T.
Chong",
title = "{Minos}: {Architectural} support for protecting
control data",
journal = j-TACO,
volume = "3",
number = "4",
pages = "359--389",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Marathe:2006:ACC,
author = "Jaydeep Marathe and Frank Mueller and Bronis R. de
Supinski",
title = "Analysis of cache-coherence bottlenecks with hybrid
hardware\slash software techniques",
journal = j-TACO,
volume = "3",
number = "4",
pages = "390--423",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ganusov:2006:FEP,
author = "Ilya Ganusov and Martin Burtscher",
title = "Future execution: a prefetching mechanism that uses
multiple cores to speed up single threads",
journal = j-TACO,
volume = "3",
number = "4",
pages = "424--449",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Co:2006:ETC,
author = "Michele Co and Dee A. B. Weikle and Kevin Skadron",
title = "Evaluating trace cache energy efficiency",
journal = j-TACO,
volume = "3",
number = "4",
pages = "450--476",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hu:2006:EMM,
author = "Shiwen Hu and Madhavi Valluri and Lizy Kurian John",
title = "Effective management of multiple configurable units
using dynamic optimization",
journal = j-TACO,
volume = "3",
number = "4",
pages = "477--501",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bentley:2006:IAB,
author = "Chris Bentley and Scott A. Watterson and David K.
Lowenthal and Barry Rountree",
title = "Implicit array bounds checking on 64-bit
architectures",
journal = j-TACO,
volume = "3",
number = "4",
pages = "502--527",
month = dec,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1145/1187976.1187982",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Several programming languages guarantee that array
subscripts are checked to ensure they are within the
bounds of the array. While this guarantee improves the
correctness and security of array-based code, it adds
overhead to array references. This has been an obstacle
to using higher-level languages, such as Java, for
high-performance parallel computing, where the language
specification requires that all array accesses must be
checked to ensure they are within bounds. This is
because, in practice, array-bounds checking in
scientific applications may increase execution time by
more than a factor of 2. Previous research has explored
optimizations to statically eliminate bounds checks,
but the dynamic nature of many scientific codes makes
this difficult or impossible. Our approach is, instead,
to create a compiler and operating system
infrastructure that does not generate explicit bounds
checks. It instead places arrays inside of Index
Confinement Regions (ICRs), which are large, isolated,
mostly unmapped virtual memory regions. Any array
reference outside of its bounds will cause a protection
violation; this provides implicit bounds checking. Our
results show that when applying this infrastructure to
high-performance computing programs written in Java,
the overhead of bounds checking relative to a program
with no bounds checks is reduced from an average of
63\% to an average of 9\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Calder:2007:I,
author = "Brad Calder and Dean Tullsen",
title = "Introduction",
journal = j-TACO,
volume = "4",
number = "1",
pages = "1:1--1:1",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Constantinides:2007:ARC,
author = "Kypros Constantinides and Stephen Plaza and Jason
Blome and Valeria Bertacco and Scott Mahlke and Todd
Austin and Bin Zhang and Michael Orshansky",
title = "Architecting a reliable {CMP} switch architecture",
journal = j-TACO,
volume = "4",
number = "1",
pages = "2:1--2:37",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sasanka:2007:AES,
author = "Ruchira Sasanka and Man-Lap Li and Sarita V. Adve and
Yen-Kuang Chen and Eric Debes",
title = "{ALP}: {Efficient} support for all levels of
parallelism for complex media applications",
journal = j-TACO,
volume = "4",
number = "1",
pages = "3:1--3:30",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Luo:2007:CNP,
author = "Yan Luo and Jia Yu and Jun Yang and Laxmi N. Bhuyan",
title = "Conserving network processor power consumption by
exploiting traffic variability",
journal = j-TACO,
volume = "4",
number = "1",
pages = "4:1--4:26",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Soteriou:2007:SDP,
author = "Vassos Soteriou and Noel Eisley and Li-Shiuan Peh",
title = "Software-directed power-aware interconnection
networks",
journal = j-TACO,
volume = "4",
number = "1",
pages = "5:1--5:40",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hwang:2007:SSA,
author = "Yuan-Shin Hwang and Jia-Jhe Li",
title = "Snug set-associative caches: Reducing leakage power of
instruction and data caches with no performance
penalties",
journal = j-TACO,
volume = "4",
number = "1",
pages = "6:1--6:28",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Rong:2007:SDS,
author = "Hongbo Rong and Zhizhong Tang and R. Govindarajan and
Alban Douillet and Guang R. Gao",
title = "Single-dimension software pipelining for
multidimensional loops",
journal = j-TACO,
volume = "4",
number = "1",
pages = "7:1--7:44",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bower:2007:ODH,
author = "Fred A. Bower and Daniel J. Sorin and Sule Ozev",
title = "Online diagnosis of hard faults in microprocessors",
journal = j-TACO,
volume = "4",
number = "2",
pages = "8:1--8:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1250727.1250728",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We develop a microprocessor design that tolerates hard
faults, including fabrication defects and in-field
faults, by leveraging existing microprocessor
redundancy. To do this, we must: detect and correct
errors, diagnose hard faults at the field
deconfigurable unit (FDU) granularity, and deconfigure
FDUs with hard faults. In our reliable microprocessor
design, we use DIVA dynamic verification to detect and
correct errors. Our new scheme for diagnosing hard
faults tracks instructions' core structure occupancy
from decode until commit. If a DIVA checker detects an
error in an instruction, it increments a small
saturating error counter for every FDU used by that
instruction, including that DIVA checker. A hard fault
in an FDU quickly leads to an above-threshold error
counter for that FDU and thus diagnoses the fault. For
deconfiguration, we use previously developed schemes
for functional units and buffers and present a scheme
for deconfiguring DIVA checkers. Experimental results
show that our reliable microprocessor quickly and
accurately diagnoses each hard fault that is injected
and continues to function, albeit with somewhat
degraded performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "fine-grained diagnosis; hard fault tolerance;
processor microarchitecture",
}
@Article{Michaud:2007:STM,
author = "Pierre Michaud and Andr{\'e} Seznec and Damien Fetis
and Yiannakis Sazeides and Theofanis Constantinou",
title = "A study of thread migration in temperature-constrained
multicores",
journal = j-TACO,
volume = "4",
number = "2",
pages = "9:1--9:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1250727.1250729",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Temperature has become an important constraint in
high-performance processors, especially multicores.
Thread migration will be essential to exploit the full
potential of future thermally constrained multicores.
We propose and study a thread migration method that
maximizes performance under a temperature constraint,
while minimizing the number of migrations and ensuring
fairness between threads. We show that thread migration
brings important performance gains and that it is most
effective during the first tens of seconds following a
decrease of the number of running threads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "multicore processor; power density; temperature;
thermal management; thread migration",
}
@Article{Chen:2007:CRL,
author = "Yu Chen and Fuxin Zhang",
title = "Code reordering on limited branch offset",
journal = j-TACO,
volume = "4",
number = "2",
pages = "10:1--10:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1250727.1250730",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Since the 1980's code reordering has gained popularity
as an important way to improve the spatial locality of
programs. While the effect of the processor's
microarchitecture and memory hierarchy on this
optimization technique has been investigated, little
research has focused on the impact of the instruction
set. In this paper, we analyze the effect of limited
branch offset of the MIPS-like instruction set [Hwu et
al. 2004, 2005] on code reordering, explore two simple
methods to handle the exceeded branches, and propose
the bidirectional code layout (BCL) algorithm to reduce
the number of branches exceeding the offset limit. The
BCL algorithm sorts the chains according to the
position of related chains, avoids cache conflict
misses deliberately and lays out the code
bidirectionally. It strikes a balance among the
distance of related blocks, the instruction cache miss
rate, the memory size required, and the control flow
transfer. Experimental results show that BCL can
effectively reduce exceeded branches by 50.1\%, on
average, with up to 100\% for some programs. Except for
some programs with little spatial locality, the BCL
algorithm can achieve the performance, as the case with
no branch offset limitation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "code reordering; Godson Processor; link-time
optimization",
}
@Article{Terechko:2007:ICC,
author = "A. S. Terechko and H. Corporaal",
title = "Inter-cluster communication in {VLIW} architectures",
journal = j-TACO,
volume = "4",
number = "2",
pages = "11:1--11:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1250727.1250731",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The traditional VLIW (very long instruction word)
architecture with a single register file does not scale
up well to address growing performance demands on
embedded media processors. However, splitting a VLIW
processor in smaller clusters, which are comprised of
function units fully connected to local register files,
can significantly improve VLSI implementation
characteristics of the processor, such as speed, energy
consumption, and area. In our paper we reveal that
achieving the best characteristics of a clustered VLIW
requires a thorough selection of an Inter-cluster
Communication (ICC) model, which is the way clustering
is exposed in the Instruction Set Architecture. For our
study we, first, define a taxonomy of ICC models
including copy operations, dedicated issue slots,
extended operands, extended results, and multicast.
Evaluation of the execution time of the models requires
both the dynamic cycle count and clock period. We
developed an advanced instruction scheduler for all the
five ICC models in order to quantify the dynamic cycle
counts of our multimedia C benchmarks. To assess the
clock period of the ICC models we designed and laid out
VLIW datapaths using the RTL hardware descriptions
derived from a deeply pipelined commercial TriMedia
processor. In contrast to prior art, our research shows
that fully distributed register file architectures
(with eight clusters in our study) often underperform
compared to moderately clustered machines with two or
four clusters because of explosion of the cycle count
overhead in the former. Among the evaluated ICC models,
performance of the copy operation model, popular both
in academia and industry, is severely limited by the
copy operations hampering scheduling of regular
operations in high ILP (instruction-level parallelism)
code. The dedicated issue slots model combats this
limitation by dedicating extra VLIW issue slots purely
for ICC, reaching the highest 1.74 execution time
speedup relative to the unicluster. Furthermore, our
VLSI experiments show that the lowest area and energy
consumption of 42 and 57\% relative to the unicluster,
respectively, are achieved by the extended operands
model, which, nevertheless, provides higher performance
than the copy operation model.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "clock frequency; cluster assignment; instruction
scheduler; instruction-level parallelism; intercluster
communication; optimizing compiler; pipelining;
register allocation; VLIW",
}
@Article{Dou:2007:CCM,
author = "Jialin Dou and Marcelo Cintra",
title = "A compiler cost model for speculative
parallelization",
journal = j-TACO,
volume = "4",
number = "2",
pages = "12:1--12:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1250727.1250732",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Speculative parallelization is a technique that allows
code sections that cannot be fully analyzed by the
compiler to be aggressively executed in parallel.
However, while speculative parallelization can
potentially deliver significant speedups, several
overheads associated with this technique can limit
these speedups in practice. This paper proposes a novel
compiler static cost model of speculative multithreaded
execution that can be used to predict the resulting
performance. This model attempts to predict the
expected speedups, or slowdowns, of the candidate
speculative sections based on the estimation of the
combined runtime effects of various overheads, and
taking into account the scheduling restrictions of most
speculative execution environments. The model is based
on estimating the likely execution duration of threads
and considers all the possible permutations of these
threads. This model also produces a quantitative
estimate of the speedup, which is different from prior
heuristics that only qualitatively estimate the
benefits of speculative multithreaded execution. In
previous work, a limited version of the framework was
evaluated on a number of loops from a collection of
SPEC benchmarks that suffer mainly from load imbalance
and thread dispatch and commit overheads. In this work,
an extended framework is also evaluated on loops that
may suffer from data-dependence violations.
Experimental results show that prediction accuracy is
lower when loops with violations are included.
Nevertheless, accuracy is still very high for a static
model: the framework can identify, on average, 45\% of
the loops that cause slowdowns and, on average, 96\% of
the loops that lead to speedups; it predicts the
speedups or slowdowns with an error of less than 20\%
for an average of 28\% of the loops across the
benchmarks and with an error of less than 50\% for an
average of 80\% of the loops. Overall, the framework
often outperforms, by as much as 25\%, a naive approach
that attempts to speculatively parallelize all the
loops considered, and is able to curb the large
slowdowns caused in many cases by this naive
approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "speculative multithreading; speculative
parallelization; thread-level speculation",
}
@Article{Amme:2007:SBM,
author = "Wolfram Amme and Jeffery von Ronne and Michael Franz",
title = "{SSA}-based mobile code: {Implementation} and
empirical evaluation",
journal = j-TACO,
volume = "4",
number = "2",
pages = "13:1--13:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1250727.1250733",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Although one might expect transportation formats based
on static single-assignment form (SSA) to yield faster
just-in-time compilation times than those based on
stack-based virtual machines, this claim has not
previously been validated, in practice. We attempt to
quantify the effect of using an SSA-based mobile code
representation by integrating support for a verifiable
SSA-based IR into Jikes RVM. Performance results,
measured with various optimizations and on both the
IA32 and PowerPC, show improvements in both compilation
time and code quality.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "SafeTSA; static single-assignment form; virtual
machines",
}
@Article{Li:2007:CCE,
author = "Xiaodong Li and Ritu Gupta and Sarita V. Adve and
Yuanyuan Zhou",
title = "Cross-component energy management: {Joint} adaptation
of processor and memory",
journal = j-TACO,
volume = "4",
number = "3",
pages = "14:1--14:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1275937.1275938",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Researchers have proposed the use of adaptation to
reduce the energy consumption of different hardware
components, such as the processor, memory, disk, and
display for general-purpose applications. Previous
algorithms to control these adaptations, however, have
focused on a single component. This work takes the
first step toward developing algorithms that can
jointly control adaptations in multiple interacting
components for general-purpose applications, with the
goal of minimizing the total energy consumed within a
specified performance loss. Specifically, we develop a
joint-adaptation algorithm for processor and memory
adaptations. We identify two properties that enable
per-component algorithms to be easily used in a
cross-component context---the algorithms' performance
impact must be guaranteed and composable. We then
modify a current processor and a memory algorithm to
obey these properties. This allows the cross-component
problem to be reduced to determine an appropriate
(energy-optimal) allocation of the target performance
loss (slack) between the two components. We develop
such an optimal slack allocation algorithm that
exploits the above properties. The result is an
efficient cross-component adaptation framework that
minimizes the total energy of the processor and memory
without exceeding the target performance loss, while
substantially leveraging current per-component
algorithms. Our experiments show that joint processor
and memory adaptation provides significantly more
energy savings than adapting either component alone;
intelligent slack distribution is specifically
effective for highly compute- or memory-intensive
applications; and the performance slowdown never
exceeds the specification.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "adaptive systems; control algorithms; energy
management; low-power design; memory; performance
guarantee; processor",
}
@Article{Gabor:2007:FES,
author = "Ron Gabor and Shlomo Weiss and Avi Mendelson",
title = "Fairness enforcement in switch on event
multithreading",
journal = j-TACO,
volume = "4",
number = "3",
pages = "15:1--15:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1275937.1275939",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The need to reduce power and complexity will increase
the interest in Switch On Event multithreading
(coarse-grained multithreading). Switch On Event
multithreading is a low-power and low-complexity
mechanism to improve processor throughput by switching
threads on execution stalls. Fairness may, however,
become a problem in a multithreaded processor. Unless
fairness is properly handled, some threads may starve
while others consume all of the processor cycles.
Heuristics that were devised in order to improve
fairness in simultaneous multithreading are not
applicable to Switch On Event multithreading. This
paper defines the fairness metric using the ratio of
the individual threads' speedups and shows how it can
be enforced in Switch On Event multithreading. Fairness
is controlled by forcing additional thread switch
points. These switch points are determined dynamically
by runtime estimation of the single threaded
performance of each of the individual threads. We
analyze the impact of the fairness enforcement
mechanism on aggregate IPC and weighted speedup. We
present simulation results of the performance of Switch
On Event multithreading. Switch On Event multithreading
achieves an average aggregate IPC increase of 26\% over
single thread and 12\% weighted speedup when no
fairness is enforced. In this case, a sixth of our runs
resulted in poor fairness in which one thread ran
extremely slowly (10 to 100 times slower than its
single-thread performance), while the other thread's
performance was hardly affected. By using the proposed
mechanism, we can guarantee fairness at different
levels of strictness and, in most cases, even improve
the weighted speedup.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "coarse-grained multithreading; fairness;
multithreading; performance; SOE; Switch on Event
multithreading; throughput; weighted speedup",
}
@Article{Andrade:2007:PAA,
author = "Diego Andrade and Basilio B. Fraguela and Ram{\'o}n
Doallo",
title = "Precise automatable analytical modeling of the cache
behavior of codes with indirections",
journal = j-TACO,
volume = "4",
number = "3",
pages = "16:1--16:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1275937.1275940",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The performance of memory hierarchies, in which caches
play an essential role, is critical in nowadays
general-purpose and embedded computing systems because
of the growing memory bottleneck problem.
Unfortunately, cache behavior is very unstable and
difficult to predict. This is particularly true in the
presence of irregular access patterns, which exhibit
little locality. Such patterns are very common, for
example, in applications in which pointers or
compressed sparse matrices give place to indirections.
Nevertheless, cache behavior in the presence of
irregular access patterns has not been widely studied.
In this paper we present an extension of a systematic
analytical modeling technique based on PMEs
(probabilistic miss equations), previously developed by
the authors, that allows the automated analysis of the
cache behavior for codes with irregular access patterns
resulting from indirections. The model generates very
accurate predictions despite the irregularities and has
very low computing requirements, being the first model
that gathers these desirable characteristics that can
automatically analyze this kind of codes. These
properties enable this model to help drive compiler
optimizations, as we show with an example.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "analytical modeling; irregular access patterns; memory
hierarchy; performance prediction",
}
@Article{Venstermans:2007:JOH,
author = "Kris Venstermans and Lieven Eeckhout and Koen {De
Bosschere}",
title = "{Java} object header elimination for reduced memory
consumption in 64-bit virtual machines",
journal = j-TACO,
volume = "4",
number = "3",
pages = "17:1--17:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1275937.1275941",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Memory performance is an important design issue for
contemporary computer systems given the huge
processor/memory speed gap. This paper proposes a
space-efficient Java object model for reducing the
memory consumption of 64-bit Java virtual machines. We
completely eliminate the object header through typed
virtual addressing (TVA) or implicit typing. TVA
encodes the object type in the object's virtual address
by allocating all objects of a given type in a
contiguous memory segment. This allows for removing the
type information as well as the status field from the
object header. Whenever type and status information is
needed, masking is applied to the object's virtual
address for obtaining an offset into type and status
information structures. Unlike previous work on
implicit typing, we apply TVA to a selected number of
frequently allocated object types, hence, the name
selective TVA (STVA); this limits the amount of memory
fragmentation. In addition to applying STVA, we also
compress the type information block (TIB) pointers for
all objects that do not fall under TVA. We implement
the space-efficient Java object model in the 64-bit
version of the Jikes RVM on an AIX IBM platform and
compare its performance against the traditionally used
Java object model using a multitude of Java benchmarks.
We conclude that the space-efficient Java object model
reduces memory consumption by on average 15\% (and up
to 45\% for some benchmarks). About one-half the
reduction comes from TIB pointer compression; the other
one-half comes from STVA. In terms of performance, the
space-efficient object model generally does not affect
performance; however, for some benchmarks we observe
statistically significant performance speedups, up to
20\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "64-bit implementation; implicit typing; Java object
model; typed virtual addressing; Virtual machine",
}
@Article{Xiao:2007:VIS,
author = "Shu Xiao and Edmund M.-K. Lai",
title = "{VLIW} instruction scheduling for minimal power
variation",
journal = j-TACO,
volume = "4",
number = "3",
pages = "18:1--18:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1275937.1275942",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The focus of this paper is on the minimization of the
variation in power consumed by a VLIW processor during
the execution of a target program through instruction
scheduling. The problem is formulated as a
mixed-integer program (MIP) and a problem-specific
branch-and-bound algorithm has been developed to solve
it more efficiently than generic MIP solvers.
Simulation results based on the TMS320C6711 VLIW
digital signal processor using benchmarks from
Mediabench and Trimaran showed that over 40\% average
reduction in power variation can be achieved without
sacrificing execution speed of these benchmarks.
Computational requirements and convergence rates of our
algorithm are also analyzed.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "instruction scheduling; power variation reduction;
VLIW processors",
}
@Article{Tallam:2007:UCF,
author = "Sriraman Tallam and Rajiv Gupta",
title = "Unified control flow and data dependence traces",
journal = j-TACO,
volume = "4",
number = "3",
pages = "19:1--19:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1275937.1275943",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We describe the design, generation, and compression of
the extended whole program path (eWPP), representation
that not only captures the control flow history of a
program execution but also its data dependence history.
This representation is motivated by the observation
that, typically, a significant fraction of data
dependence history can be recovered from the control
flow trace. To capture the remainder of the data
dependence history, we introduce disambiguation checks
in the program whose control flow signatures capture
the results of the checks. The resulting extended
control flow trace enables the recovery of otherwise
irrecoverable data dependences. The code for the checks
is designed to minimize the increase in program
execution time and the extended control flow trace size
when compared to directly collecting control flow and
address traces. Our experiments show that compressed
eWPPs are only one-quarter of the size of combined
compressed control flow and address traces. However,
their collection incurs a 5{\times} increase in runtime
overhead relative to the overhead required for directly
collecting the control flow and address traces,
respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "address trace; control flow trace; dynamic data
dependence trace; profiling",
}
@Article{Ipek:2008:EAD,
author = "Engin Ipek and Sally A. McKee and Karan Singh and Rich
Caruana and Bronis R. de Supinski and Martin Schulz",
title = "Efficient architectural design space exploration via
predictive modeling",
journal = j-TACO,
volume = "4",
number = "4",
pages = "1:1--1:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1328195.1328196",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:35 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Efficiently exploring exponential-size architectural
design spaces with many interacting parameters remains
an open problem: the sheer number of experiments
required renders detailed simulation intractable. We
attack this via an automated approach that builds
accurate predictive models. We simulate sampled points,
using results to teach our models the function
describing relationships among design parameters. The
models can be queried and are very fast, enabling
efficient design tradeoff discovery. We validate our
approach via two uniprocessor sensitivity studies,
predicting IPC with only 1--2\% error. In an
experimental study using the approach, training on 1\%
of a 250-K-point CMP design space allows our models to
predict performance with only 4--5\% error. Our
predictive modeling combines well with techniques that
reduce the time taken by each simulation experiment,
achieving net time savings of three-four orders of
magnitude.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "artificial neural networks; design space exploration;
performance prediction; sensitivity studies",
}
@Article{Shi:2008:VMS,
author = "Yunhe Shi and Kevin Casey and M. Anton Ertl and David
Gregg",
title = "Virtual machine showdown: {Stack} versus registers",
journal = j-TACO,
volume = "4",
number = "4",
pages = "2:1--2:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1328195.1328197",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:35 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Virtual machines (VMs) enable the distribution of
programs in an architecture-neutral format, which can
easily be interpreted or compiled. A long-running
question in the design of VMs is whether a stack
architecture or register architecture can be
implemented more efficiently with an interpreter. We
extend existing work on comparing virtual stack and
virtual register architectures in three ways. First,
our translation from stack to register code and
optimization are much more sophisticated. The result is
that we eliminate an average of more than 46\% of
executed VM instructions, with the bytecode size of the
register machine being only 26\% larger than that of
the corresponding stack one. Second, we present a fully
functional virtual-register implementation of the Java
virtual machine (JVM), which supports Intel, AMD64,
PowerPC and Alpha processors. This register VM supports
inline-threaded, direct-threaded, token-threaded, and
switch dispatch. Third, we present experimental results
on a range of additional optimizations such as register
allocation and elimination of redundant heap loads. On
the AMD64 architecture the register machine using
switch dispatch achieves an average speedup of 1.48
over the corresponding stack machine. Even using the
more efficient inline-threaded dispatch, the register
VM achieves a speedup of 1.15 over the equivalent
stack-based VM.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "interpreter; register architecture; stack
architecture; virtual machine",
}
@Article{Yan:2008:EVR,
author = "Jun Yan and Wei Zhang",
title = "Exploiting virtual registers to reduce pressure on
real registers",
journal = j-TACO,
volume = "4",
number = "4",
pages = "3:1--3:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1328195.1328198",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:35 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "It is well known that a large fraction of variables
are short-lived. This paper proposes a novel approach
to exploiting this fact to reduce the register pressure
for pipelined processors with data-forwarding network.
The idea is that the compiler can allocate virtual
registers (i.e., place holders to identify dependences
among instructions) to short-lived variables, which do
not need to be stored to physical storage locations. As
a result, real registers (i.e., physically existed
registers) can be reserved for long-lived variables for
mitigating the register pressure and decreasing the
register spills, leading to performance improvement. In
this paper, we develop the architectural and compiler
support for exploiting virtual registers for statically
scheduled processors. Our experimental results show
that virtual registers are very effective at reducing
the register spills, which, in many cases, can achieve
the performance close to the processor with twice
number of real registers. Our results also indicate
that, for some applications, using 24 virtual, in
addition to 8 real registers, can attain even higher
performance than that of 16 real without any virtual
registers.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "data forwarding; register allocation; register file;
short-lived variables; virtual register",
}
@Article{Yu:2008:OCL,
author = "Zoe C. H. Yu and Francis C. M. Lau and Cho-Li Wang",
title = "Object co-location and memory reuse for {Java}
programs",
journal = j-TACO,
volume = "4",
number = "4",
pages = "4:1--4:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1328195.1328199",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:35 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We introduce a new memory management system, STEMA,
which can improve the execution time of Java programs.
STEMA detects prolific types on-the-fly and co-locates
their objects in a special memory space which supports
reuse of memory. We argue and show that memory reuse
and co-location of prolific objects can result in
improved cache locality, reduced memory fragmentation,
reduced GC time, and faster object allocation. We
evaluate STEMA using 16 benchmarks. Experimental
results show that STEMA performs 2.7\%, 4.0\%, and
8.2\% on average better than MarkSweep, CopyMS, and
SemiSpace.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "garbage collector; Java; memory allocator; memory
reuse; mutator; object co-location",
}
@Article{Zhang:2008:RCM,
author = "Chuanjun Zhang",
title = "Reducing cache misses through programmable decoders",
journal = j-TACO,
volume = "4",
number = "4",
pages = "5:1--5:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1328195.1328200",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:35 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Level-one caches normally reside on a processor's
critical path, which determines clock frequency.
Therefore, fast access to level-one cache is important.
Direct-mapped caches exhibit faster access time, but
poor hit rates, compared with same sized
set-associative caches because of nonuniform accesses
to the cache sets. The nonuniform accesses generate
more cache misses in some sets, while other sets are
underutilized. We propose to increase the decoder
length and, hence, reduce the accesses to heavily used
sets without dynamically detecting the cache set usage
information. We increase the access to the
underutilized cache sets by incorporating a replacement
policy into the cache design using programmable
decoders. On average, the proposed techniques achieve
as low a miss rate as a traditional 4-way cache on all
26 SPEC2K benchmarks for the instruction and data
caches, respectively. This translates into an average
IPC improvement of 21.5 and 42.4\% for SPEC2K integer
and floating-point benchmarks, respectively. The
B-Cache consumes 10.5\% more power per access, but
exhibits a 12\% total memory access-related energy
savings as a result of the miss rate reductions, and,
hence, the reduction to applications' execution time.
Compared with previous techniques that aim at reducing
the miss rate of direct-mapped caches, our technique
requires only one cycle to access all cache hits and
has the same access time of a direct-mapped cache.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "cache; dynamic optimization; low power",
}
@Article{Golander:2008:HMP,
author = "Amit Golander and Shlomo Weiss",
title = "Hiding the misprediction penalty of a
resource-efficient high-performance processor",
journal = j-TACO,
volume = "4",
number = "4",
pages = "6:1--6:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1328195.1328201",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:35 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Misprediction is a major obstacle for increasing
speculative out-of-order processors performance.
Performance degradation depends on both the number of
misprediction events and the recovery time associated
with each one of them. In recent years a few checkpoint
based microarchitectures have been proposed. In
comparison with ROB-based processors, checkpoint
processors are scalable and highly resource efficient.
Unfortunately, in these proposals the misprediction
recovery time is proportional to the instruction queue
size.\par
In this paper we analyze methods to reduce the
misprediction recovery time. We propose a new register
file management scheme and techniques to selectively
flush the instruction queue and the load store queue,
and to isolate deeply pipelined execution units. The
result is a novel checkpoint processor with Constant
misprediction RollBack time (CRB). We further present a
streamlined, cost-efficient solution, which saves
complexity at the price of slightly lower
performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "checkpoints; misprediction; out-of-order execution;
rollback; scalable architecture",
}
@Article{Calder:2008:E,
author = "Brad Calder and Dean Tullsen",
title = "Editorial",
journal = j-TACO,
volume = "5",
number = "1",
pages = "1:1--1:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1369396.1369397",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mysore:2008:FIP,
author = "Shashidhar Mysore and Banit Agrawal and Rodolfo Neuber
and Timothy Sherwood and Nisheeth Shrivastava and
Subhash Suri",
title = "Formulating and implementing profiling over adaptive
ranges",
journal = j-TACO,
volume = "5",
number = "1",
pages = "2:1--2:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1369396.1369398",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Modern computer systems are called on to deal with
billions of events every second, whether they are
executed instructions, accessed memory locations, or
forwarded packets. This presents a serious challenge to
those who seek to quantify, analyze, or optimize such
systems, because important trends and behaviors may
easily be lost in a sea of data. We present
range-adaptive profiling (RAP) as a new and
general-purpose profiling method capable of
hierarchically efficiently classifying streams of data
in hardware. Through the use of RAP, events in an input
stream are dynamically classified into increasingly
precise categories, based on the frequency with which
they occur. The more important a class, or range of
events, the more precisely it is quantified. Despite
the dynamic nature of our technique, we build upon
tight theoretic bounds covering both worst-case error,
as well as the required memory. In the limit, it is
known that error and the memory bounds can be
independent of the stream size and grow only linearly
with the level of precision desired. Significantly, we
expose the critical constants in these algorithms and
through careful engineering, algorithm redesign, and
use of heuristics, we show how a high-performance
profile system can be implemented for range-adaptive
profiling. RAP can be used on various profiles, such as
PCs, load values, and memory addresses, and has a broad
range of uses, from hot-region profiling to quantifying
cache miss value locality. We propose two methods of
implementation of RAP, one in software and the other
with specialized hardware, for which we also describe
our prototype FPGA implementation. We show that with
just 8KB of memory, range profiles can be gathered with
an average accuracy of 98\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "profiling hardware; range adaptive; value locality",
}
@Article{Zhai:2008:CHS,
author = "Antonia Zhai and J. Gregory Steffan and Christopher B.
Colohan and Todd C. Mowry",
title = "Compiler and hardware support for reducing the
synchronization of speculative threads",
journal = j-TACO,
volume = "5",
number = "1",
pages = "3:1--3:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1369396.1369399",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Thread-level speculation (TLS) allows us to
automatically parallelize general-purpose programs by
supporting parallel execution of threads that might not
actually be independent. In this article, we focus on
one important limitation of program performance under
TLS, which stalls as a result of synchronizing and
forwarding scalar values between speculative threads
that would otherwise cause frequent data dependences
and, hence, failed speculation. Using SPECint
benchmarks that have been automatically transformed by
our compiler to exploit TLS, we present, evaluate in
detail, and compare both compiler and hardware
techniques for improving the communication of scalar
values. We find that through our dataflow algorithms
for three increasingly aggressive instruction
scheduling techniques, the compiler can drastically
reduce the critical forwarding path introduced by the
synchronization and forwarding of scalar values. We
also show that hardware techniques for reducing
synchronization can be complementary to compiler
scheduling, but that the additional performance
benefits are minimal and are generally not worth the
cost.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "automatic parallelization; chip-multiprocessing;
instruction scheduling; thread-level speculation",
}
@Article{Winter:2008:ATN,
author = "Jonathan A. Winter and David H. Albonesi",
title = "Addressing thermal nonuniformity in {SMT} workloads",
journal = j-TACO,
volume = "5",
number = "1",
pages = "4:1--4:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1369396.1369400",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We explore DTM techniques within the context of
uniform and nonuniform SMT workloads. While DVS is
suitable for addressing workloads with uniformly high
temperatures, for nonuniform workloads, performance
loss occurs because of the slowdown of the cooler
thread. To address this, we propose and evaluate DTM
mechanisms that exploit the steering-based thread
management mechanisms inherent in a clustered SMT
architecture. We show that in contrast to DVS, which
operates globally, our techniques are more effective at
controlling temperature for nonuniform workloads.
Furthermore, we devise a DTM technique that combines
steering and DVS to achieve consistently good
performance across all workloads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "adaptive microarchitectures; clustered
microarchitectures; dynamic thermal management; dynamic
voltage scaling; simultaneous multithreading",
}
@Article{Shahbahrami:2008:VES,
author = "Asadollah Shahbahrami and Ben Juurlink and Stamatis
Vassiliadis",
title = "Versatility of extended subwords and the matrix
register file",
journal = j-TACO,
volume = "5",
number = "1",
pages = "5:1--5:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1369396.1369401",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Extended subwords and the matrix register file (MRF)
are two micro architectural techniques that address
some of the limitations of existing SIMD architectures.
Extended subwords are wider than the data stored in
memory. Specifically, for every byte of data stored in
memory, there are four extra bits in the media register
file. This avoids the need for data-type conversion
instructions. The MRF is a register file organization
that provides both conventional row-wise, as well as
column-wise, access to the register file. In other
words, it allows to view the register file as a matrix
in which corresponding subwords in different registers
corresponds to a column of the matrix. It was
introduced to accelerate matrix transposition which is
a very common operation in multimedia applications. In
this paper, we show that the MRF is very versatile,
since it can also be used for other permutations than
matrix transposition. Specifically, it is shown how it
can be used to provide efficient access to strided
data, as is needed in, e.g., color space conversion.
Furthermore, it is shown that special-purpose
instructions (SPIs), such as the sum-of-absolute
differences (SAD) instruction, have limited usefulness
when extended subwords and a few general SIMD
instructions that we propose are supported, for the
following reasons. First, when extended subwords are
supported, the SAD instruction provides only a
relatively small performance improvement. Second, the
SAD instruction processes 8-bit subwords only, which is
not sufficient for quarter-pixel resolution nor for
cost functions used in image and video retrieval.
Results obtained by extending the SimpleScalar toolset
show that the proposed techniques provide a speedup of
up to 3.00 over the MMX architecture. The results also
show that using, at most, 13 extra media registers
yields an additional performance improvement ranging
from 1.3 to 1.57.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "multimedia standards; SIMD architectures; SIMD
programming",
}
@Article{Guo:2008:EHC,
author = "Zhi Guo and Walid Najjar and Betul Buyukkurt",
title = "Efficient hardware code generation for {FPGAs}",
journal = j-TACO,
volume = "5",
number = "1",
pages = "6:1--6:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1369396.1369402",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The wider acceptance of FPGAs as a computing device
requires a higher level of programming abstraction.
ROCCC is an optimizing C to HDL compiler. We describe
the code generation approach in ROCCC. The smart buffer
is a component that reuses input data between adjacent
iterations. It significantly improves the performance
of the circuit and simplifies loop control. The
ROCCC-generated datapath can execute one loop iteration
per clock cycle when there is no loop dependency or
there is only scalar recurrence variable dependency.
ROCCC's approach to supporting while-loops operating on
scalars makes the compiler able to move scalar
iterative computation into hardware.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "data reuse; FPGA; high-level synthesis; reconfigurable
computing; VHDL",
}
@Article{Kotzmann:2008:DJH,
author = "Thomas Kotzmann and Christian Wimmer and Hanspeter
M{\"o}ssenb{\"o}ck and Thomas Rodriguez and Kenneth
Russell and David Cox",
title = "Design of the {Java HotSpot\TM} client compiler for
{Java 6}",
journal = j-TACO,
volume = "5",
number = "1",
pages = "7:1--7:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1369396.1370017",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Version 6 of Sun Microsystems' Java HotSpot{\TM} VM
ships with a redesigned version of the client
just-in-time compiler that includes several research
results of the last years. The client compiler is at
the heart of the VM configuration used by default for
interactive desktop applications. For such
applications, low startup and pause times are more
important than peak performance. This paper outlines
the new architecture of the client compiler and shows
how it interacts with the VM. It presents the
intermediate representation that now uses static
single-assignment (SSA) form and the linear scan
algorithm for global register allocation. Efficient
support for exception handling and deoptimization
fulfills the demands that are imposed by the dynamic
features of the Java programming language. The
evaluation shows that the new client compiler generates
better code in less time. The popular SPECjvm98
benchmark suite is executed 45\% faster, while the
compilation speed is also up to 40\% better. This
indicates that a carefully selected set of global
optimizations can also be integrated in just-in-time
compilers that focus on compilation speed and not on
peak performance. In addition, the paper presents the
impact of several optimizations on execution and
compilation speed. As the source code is freely
available, the Java HotSpot{\TM} VM and the client
compiler are the ideal basis for experiments with new
feedback-directed optimizations in a production-level
Java just-in-time compiler. The paper outlines research
projects that add fast algorithms for escape analysis,
automatic object inlining, and array bounds check
elimination.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "compiler; deoptimization; intermediate representation;
Java; just-in-time compilation; optimization; register
allocation",
}
@Article{Rangan:2008:PSD,
author = "Ram Rangan and Neil Vachharajani and Guilherme Ottoni
and David I. August",
title = "Performance scalability of decoupled software
pipelining",
journal = j-TACO,
volume = "5",
number = "2",
pages = "8:1--8:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1400112.1400113",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 28 13:25:00 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Any successful solution to using multicore processors
to scale general-purpose program performance will have
to contend with rising intercore communication costs
while exposing coarse-grained parallelism. Recently
proposed pipelined multithreading (PMT) techniques have
been demonstrated to have general-purpose applicability
and are also able to effectively tolerate inter-core
latencies through pipelined interthread communication.
These desirable properties make PMT techniques strong
candidates for program parallelization on current and
future multicore processors and understanding their
performance characteristics is critical to their
deployment. To that end, this paper evaluates the
performance scalability of a general-purpose PMT
technique called decoupled software pipelining (DSWP)
and presents a thorough analysis of the communication
bottlenecks that must be overcome for optimal DSWP
scalability.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "decoupled software pipelining; performance analysis",
}
@Article{Long:2008:TMM,
author = "Jieyi Long and Seda Ogrenci Memik and Gokhan Memik and
Rajarshi Mukherjee",
title = "Thermal monitoring mechanisms for chip
multiprocessors",
journal = j-TACO,
volume = "5",
number = "2",
pages = "9:1--9:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1400112.1400114",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 28 13:25:00 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With large-scale integration and increasing power
densities, thermal management has become an important
tool to maintain performance and reliability in modern
process technologies. In the core of dynamic thermal
management schemes lies accurate reading of on-die
temperatures. Therefore, careful planning and embedding
of thermal monitoring mechanisms into high-performance
systems becomes crucial. In this paper, we propose
three techniques to create sensor infrastructures for
monitoring the maximum temperature on a multicore
system. Initially, we extend a nonuniform sensor
placement methodology proposed in the literature to
handle chip multiprocessors (CMPs) and show its
limitations. We then analyze a grid-based approach
where the sensors are placed on a static grid covering
each core and show that the sensor readings can differ
from the actual maximum core temperature by as much as
12.6^\circ C when using 16 sensors per core. Also, as
large as 10.6\% of the thermal emergencies are not
captured using the same number of sensors. Based on
this observation, we first develop an interpolation
scheme, which estimates the maximum core temperature
through interpolation of the readings collected at the
static grid points. We show that the interpolation
scheme improves the measurement accuracy and emergency
coverage compared to grid-based placement when using
the same number of sensors. Second, we present a
dynamic scheme where only a subset of the sensor
readings is collected to predict the maximum
temperature of each core. Our results indicate that, we
can reduce the number of active sensors by as much as
50\%, while maintaining similar measurement accuracy
and emergency coverage compared to the case where the
entire sensor set on the grid is sampled at all
times.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "nonuniform and uniform sensor placement; thermal
sensor allocation",
}
@Article{Joshi:2008:DEP,
author = "Ajay Joshi and Lieven Eeckhout and Robert H. {Bell,
Jr.} and Lizy K. John",
title = "Distilling the essence of proprietary workloads into
miniature benchmarks",
journal = j-TACO,
volume = "5",
number = "2",
pages = "10:1--10:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1400112.1400115",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 28 13:25:00 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Benchmarks set standards for innovation in computer
architecture research and industry product development.
Consequently, it is of paramount importance that these
workloads are representative of real-world
applications. However, composing such representative
workloads poses practical challenges to application
analysis teams and benchmark developers (1) real-world
workloads are intellectual property and vendors
hesitate to share these proprietary applications; and
(2) porting and reducing these applications to
benchmarks that can be simulated in a tractable amount
of time is a nontrivial task. In this paper, we address
this problem by proposing a technique that
automatically distills key inherent behavioral
attributes of a proprietary workload and captures them
into a miniature synthetic benchmark clone. The
advantage of the benchmark clone is that it hides the
functional meaning of the code but exhibits similar
performance characteristics as the target application.
Moreover, the dynamic instruction count of the
synthetic benchmark clone is substantially shorter than
the proprietary application, greatly reducing overall
simulation time for SPEC CPU, the simulation time
reduction is over five orders of magnitude compared to
entire benchmark execution. Using a set of benchmarks
representative of general-purpose, scientific, and
embedded applications, we demonstrate that the power
and performance characteristics of the synthetic
benchmark clone correlate well with those of the
original application across a wide range of
microarchitecture configurations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "benchmark cloning; benchmarks; workload
characterization",
}
@Article{Catania:2008:RCM,
author = "Vincenzo Catania and Maurizio Palesi and Davide
Patti",
title = "Reducing complexity of multiobjective design space
exploration in {VLIW}-based embedded systems",
journal = j-TACO,
volume = "5",
number = "2",
pages = "11:1--11:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1400112.1400116",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 28 13:25:00 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Architectures based on very-long instruction word
(VLIW) have found fertile ground in multimedia
electronic appliances thanks to their ability to
exploit high degrees of instruction level parallelism
(ILP) with a reasonable trade-off in complexity and
silicon cost. Specialization of such architectures
involves the configuration of both hardware-related
aspects (e.g., register files, functional units, memory
subsystem) and software-related issues (e.g., the
compilation strategy). The complex interactions between
the components of such systems will force a human
designer to rely on judgment and experience in
designing them, possibly eliminating interesting
configurations, and making tuning of the system, for
either power, energy, or performance, difficult. In
this paper we propose tools and methodologies to
efficiently cope with this complexity from a
multiobjective perspective. We first analyze the impact
of ILP-oriented code transformations using two
alternative compilation profiles to quantitatively show
the effect of such transformations on typical design
objectives like performance, power dissipation, and
energy consumption. Next, by means of statistical
analysis, we collect useful data to predict the
effectiveness of a given compilation profiles for a
specific application. Information gathered from such
analysis can be exploited to drastically reduce the
computational effort needed to perform the design space
exploration.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "design space exploration; energy; genetic algorithms;
hyperblock formation; ILP; multiobjective optimization;
performances; power; statistical analysis; VLIW
architectures",
}
@Article{Leverich:2008:CEM,
author = "Jacob Leverich and Hideho Arakida and Alex
Solomatnikov and Amin Firoozshahian and Mark Horowitz
and Christos Kozyrakis",
title = "Comparative evaluation of memory models for chip
multiprocessors",
journal = j-TACO,
volume = "5",
number = "3",
pages = "12:1--12:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1455650.1455651",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 8 14:28:18 MST 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "There are two competing models for the on-chip memory
in Chip Multiprocessor (CMP) systems: {\em
hardware-managed coherent caches\/} and {\em
software-managed streaming memory}. This paper performs
a direct comparison of the two models under the same
set of assumptions about technology, area, and
computational capabilities. The goal is to quantify how
and when they differ in terms of performance, energy
consumption, bandwidth requirements, and latency
tolerance for general-purpose CMPs. We demonstrate that
for data-parallel applications on systems with up to 16
cores, the cache-based and streaming models perform and
scale equally well. For certain applications with
little data reuse, streaming scales better due to
better bandwidth use and macroscopic software
prefetching. However, the introduction of techniques
such as hardware prefetching and nonallocating stores
to the cache-based model eliminates the streaming
advantage. Overall, our results indicate that there is
not sufficient advantage in building streaming memory
systems where all on-chip memory structures are
explicitly managed. On the other hand, we show that
streaming at the programming model level is
particularly beneficial, even with the cache-based
model, as it enhances locality and creates
opportunities for bandwidth optimizations. Moreover, we
observe that stream programming is actually easier with
the cache-based model because the hardware guarantees
correct, best-effort execution even when the programmer
cannot fully regularize an application's code.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "cache coherence; Chip multiprocessors; locality
optimizations; parallel programming; streaming memory",
}
@Article{Sharkey:2008:RRP,
author = "Joseph J. Sharkey and Jason Loew and Dmitry V.
Ponomarev",
title = "Reducing register pressure in {SMT} processors through
{L2}-miss-driven early register release",
journal = j-TACO,
volume = "5",
number = "3",
pages = "13:1--13:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1455650.1455652",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 8 14:28:18 MST 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The register file is one of the most critical datapath
components limiting the number of threads that can be
supported on a simultaneous multithreading (SMT)
processor. To allow the use of smaller register files
without degrading performance, techniques that maximize
the efficiency of using registers through aggressive
register allocation/deallocation can be considered. In
this article, we propose a novel technique to early
deallocate physical registers allocated to threads
which experience L2 cache misses. This is accomplished
by speculatively committing the load-independent
instructions and deallocating the registers
corresponding to the previous mappings of their
destinations, without waiting for the cache miss
request to be serviced. The early deallocated registers
are then made immediately available for allocation to
instructions within the same thread as well as within
other threads, thus improving the overall processor
throughput. On the average across the simulated mixes
of multiprogrammed SPEC 2000 workloads, our technique
results in 33\% improvement in throughput and 25\%
improvement in terms of harmonic mean of weighted IPCs
over the baseline SMT with the state-of-the-art DCRA
policy. This is achieved without creating checkpoints,
maintaining per-register counters of pending consumers,
performing tag rebroadcasts, register remappings,
and/or additional associative searches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "register file; Simultaneous multithreading",
}
@Article{Mehrara:2008:ESP,
author = "Mojtaba Mehrara and Todd Austin",
title = "Exploiting selective placement for low-cost memory
protection",
journal = j-TACO,
volume = "5",
number = "3",
pages = "14:1--14:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1455650.1455653",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 8 14:28:18 MST 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Many embedded processing applications, such as those
found in the automotive or medical field, require
hardware designs that are at the same time low cost and
reliable. Traditionally, reliable memory systems have
been implemented using coded storage techniques, such
as ECC. While these designs can effectively detect and
correct memory faults such as transient errors and
single-bit defects, their use bears a significant cost
overhead. In this article, we propose a novel partial
memory protection scheme that provides high-coverage
fault protection for program code and data, but with
much lower cost than traditional approaches. Our
approach profiles program code and data usage to assess
which program elements are most critical to maintaining
program correctness. Critical code and variables are
then placed into limited protected storage resources.
To ensure high coverage of program elements, our
placement technique considers all program components
simultaneously, including code, global variables, stack
frames, and heap variables. The fault coverage of our
approach is gauged using Monte Carlo fault-injection
experiments, which confirm that our technique provides
high levels of fault protection (99\% coverage) with
limited memory protection resources (36\% protected
area).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "fault-tolerant design; memory system design; Partial
memory protection; selective placement; transient
faults",
}
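The placement step can be pictured as a budgeted greedy selection.
The sketch below is an assumption-laden stand-in (ranking by
criticality density) rather than the paper's exact placement
algorithm, which considers code, globals, stack frames, and heap
variables simultaneously.

def select_protected(elements, budget_bytes):
    # elements: list of (name, size_bytes, criticality) tuples from a
    # profiling run; fill the protected-memory budget greedily by
    # criticality per byte.
    ranked = sorted(elements, key=lambda e: e[2] / e[1], reverse=True)
    protected, used = [], 0
    for name, size, crit in ranked:
        if used + size <= budget_bytes:
            protected.append(name)
            used += size
    return protected

profile = [("main.text", 4096, 0.9), ("lut", 2048, 0.2),
           ("stack.frame.isr", 256, 0.95), ("heap.buf", 8192, 0.1)]
print(select_protected(profile, budget_bytes=5000))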
@Article{Vandierendonck:2008:SRA,
author = "Hans Vandierendonck and Andr{\'e} Seznec",
title = "Speculative return address stack management
revisited",
journal = j-TACO,
volume = "5",
number = "3",
pages = "15:1--15:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1455650.1455654",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 8 14:28:18 MST 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Branch prediction feeds a speculative execution
processor core with instructions. Branch mispredictions
are inevitable and have negative effects on performance
and energy consumption. With the advent of highly
accurate conditional branch predictors, nonconditional
branch instructions are gaining importance.\par
In this article, we address the prediction of procedure
returns. On modern processors, procedure returns are
predicted through a return address stack (RAS). The
overwhelming majority of the return mispredictions are
due to RAS overflows and/or overwriting the top entries
of the RAS on a mispredicted path. These sources of
misprediction were addressed by previously proposed
speculative return address stacks [Jourdan et al. 1996;
Skadron et al. 1998]. However, the remaining
misprediction rate of these RAS designs is still
significant when compared to state-of-the-art
conditional predictors.\par
We present two low-cost corruption detectors for RAS
predictors. They detect RAS overflows and wrong path
corruption with 100\% coverage. As a consequence, when
such a corruption is detected, another source can be
used for predicting the return. On processors featuring
a branch target buffer (BTB), this BTB can be used as a
free backup predictor for predicting returns when
corruption is detected.\par
Our experiments show that our proposal can be used to
improve the behavior of all previously proposed
speculative RASs. For instance, without any specific
management of the speculative states on the RAS, an
8-entry BTB-backed RAS achieves the same performance
level as a state-of-the-art, but complex, 64-entry
self-checkpointing RAS [Jourdan et al. 1996].
Therefore, our proposal can be used either to improve
the performance of the processor or to reduce its
hardware complexity.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "back-up predictor; corruption detection; Return
address prediction",
}
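The overflow detector and BTB fallback combine roughly as follows;
wrong-path corruption detection would set the same flag during
misprediction recovery. This is a toy software model, not the
hardware design.

class GuardedRAS:
    def __init__(self, size):
        self.size, self.stack = size, []
        self.corrupt = False           # overflow or wrong-path overwrite

    def push(self, ret_addr):
        if len(self.stack) == self.size:
            self.stack.pop(0)          # oldest return address lost
            self.corrupt = True        # detector: overflow occurred
        self.stack.append(ret_addr)

    def predict_return(self, btb, ret_pc):
        if self.corrupt or not self.stack:
            return btb.get(ret_pc)     # BTB as a free backup predictor
        return self.stack.pop()

ras, btb = GuardedRAS(size=2), {0x500: 0x404}
for pc in (0x100, 0x200, 0x400):       # the third call overflows the RAS
    ras.push(pc + 4)
print(ras.predict_return(btb, 0x500))  # corruption detected: uses BTB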
@Article{Chhabra:2009:MSP,
author = "Siddhartha Chhabra and Brian Rogers and Yan Solihin
and Milos Prvulovic",
title = "Making secure processors {OS}- and
performance-friendly",
journal = j-TACO,
volume = "5",
number = "4",
pages = "16:1--16:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1498690.1498691",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Mar 18 21:35:33 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In today's digital world, computer security issues
have become increasingly important. In particular,
researchers have proposed designs for secure processors
that utilize hardware-based memory encryption and
integrity verification to protect the privacy and
integrity of computation even from sophisticated
physical attacks. However, currently proposed schemes
remain hampered by problems that make them impractical
for use in today's computer systems: lack of virtual
memory and Inter-Process Communication support as well
as excessive storage and performance overheads. In this
article, we propose (1) address independent seed
encryption (AISE), a counter-mode-based memory
encryption scheme using a novel seed composition, and
(2) bonsai Merkle trees (BMT), a novel Merkle
tree-based memory integrity verification technique, to
eliminate these system and performance issues
associated with prior counter-mode memory encryption
and Merkle tree integrity verification schemes. We
present both a qualitative discussion and a
quantitative analysis to illustrate the advantages of
our techniques over previously proposed approaches in
terms of complexity, feasibility, performance, and
storage. Our results show that AISE+BMT reduces the
overhead of prior memory encryption and integrity
verification schemes from 12\% to 2\% on average for
single-threaded benchmarks on uniprocessor systems, and
from 15\% to 4\% for coscheduled benchmarks on
multicore systems while eliminating critical
system-level problems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "memory encryption; memory integrity verification;
Secure processor architectures; virtualization",
}
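Counter-mode encryption with an address-independent seed can be
sketched as below. SHA-256 stands in for the block cipher, and the
seed layout (a logical block identifier plus a per-block counter) is
an illustrative reading of AISE, not its exact format; because the
seed does not include the physical address, pages can move without
reusing a pad.

import hashlib

def pad(key: bytes, logical_id: int, counter: int, n: int) -> bytes:
    seed = key + logical_id.to_bytes(8, "big") + counter.to_bytes(8, "big")
    return hashlib.sha256(seed).digest()[:n]

def encrypt_block(key, logical_id, counter, plaintext: bytes) -> bytes:
    ks = pad(key, logical_id, counter, len(plaintext))
    return bytes(p ^ k for p, k in zip(plaintext, ks))

key = b"\x00" * 16
ct = encrypt_block(key, logical_id=42, counter=7, plaintext=b"secret!!")
pt = encrypt_block(key, logical_id=42, counter=7, plaintext=ct)
assert pt == b"secret!!"               # XOR with the same pad decrypts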
@Article{Jimenez:2009:GNB,
author = "Daniel A. Jim{\'e}nez",
title = "Generalizing neural branch prediction",
journal = j-TACO,
volume = "5",
number = "4",
pages = "17:1--17:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1498690.1498692",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Mar 18 21:35:33 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Improved branch prediction accuracy is essential to
sustaining instruction throughput with today's deep
pipelines. Traditional branch predictors exploit
correlations between pattern history and branch outcome
to predict branches, but there is a stronger and more
natural correlation between path history and branch
outcome. We explore the potential for exploiting this
correlation. We introduce {\em piecewise linear branch
prediction}, an idealized branch predictor that
develops a set of linear functions, one for each
program path to the branch to be predicted, that
separate predicted taken from predicted not taken
branches. Taken together, all of these linear functions
form a piecewise linear decision surface. We present a
limit study of this predictor showing its potential to
greatly improve predictor accuracy.\par
We then introduce a practical implementable branch
predictor based on piecewise linear branch prediction.
In making our predictor practical, we show how a
parameterized version of it unifies the previously
distinct concepts of perceptron prediction and
path-based neural prediction. Our new branch predictor
has implementation costs comparable to current
prominent predictors in the literature while
significantly improving accuracy. For a deeply
pipelined simulated microarchitecture our predictor
with a 256-KB hardware budget improves the harmonic
mean normalized instructions-per-cycle rate by 8\% over
both the original path-based neural predictor and
2Bc-{\em gskew}. The average misprediction rate is
decreased by 16\% over the path-based neural predictor
and by 22\% over 2Bc-{\em gskew}.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "Branch prediction; machine learning",
}
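The predict/update loop of a piecewise linear predictor looks roughly
like the following; the history length, table indexing, and training
threshold are illustrative parameters in the spirit of the paper, not
its tuned hardware configuration.

from collections import defaultdict

H = 16                      # history length
W = defaultdict(int)        # (branch_pc, path element, position) -> weight
ghist = [0] * H             # branch outcomes: +1 taken, -1 not taken
path = [0] * H              # addresses of the last H branches

def predict(pc):
    s = W[(pc, "bias", 0)]
    for j in range(H):
        s += W[(pc, path[j], j)] * ghist[j]
    return s >= 0, s

def update(pc, taken, s, theta=2 * H + 14):
    # Perceptron-style training on a misprediction or a weak sum.
    if (s >= 0) != taken or abs(s) < theta:
        d = 1 if taken else -1
        W[(pc, "bias", 0)] += d
        for j in range(H):
            W[(pc, path[j], j)] += d * ghist[j]
    ghist.pop(); ghist.insert(0, 1 if taken else -1)
    path.pop(); path.insert(0, pc)

pred, s = predict(0x40)
update(0x40, True, s)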
@Article{Jeon:2009:AAP,
author = "Jinseong Jeon and Keoncheol Shin and Hwansoo Han",
title = "Abstracting access patterns of dynamic memory using
regular expressions",
journal = j-TACO,
volume = "5",
number = "4",
pages = "18:1--18:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1498690.1498693",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Mar 18 21:35:33 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Unless the speed gap between CPU and memory
disappears, efficient memory usage remains a decisive
factor for performance. To optimize data usage of
programs in the presence of the memory hierarchy, we
are particularly interested in two compiler techniques:
{\em pool allocation\/} and {\em field layout
restructuring}. Since foreseeing runtime behaviors of
programs at compile time is difficult, most of the
previous work relied on profiling. In contrast, our
goal is to develop a fully automatic compiler that
statically transforms input codes to use memory
efficiently. Noticing that {\em regular expressions},
which denote repetition explicitly, are sufficient for
memory access patterns, we describe how to extract
memory access patterns as regular expressions in
detail. Based on static patterns presented in regular
expressions, we apply pool allocation to repeatedly
accessed structures and exploit field layout
restructuring according to field affinity relations of
chosen structures. To make a scalable framework, we
devise and apply new abstraction techniques, which
build and interpret access patterns for the whole
programs in a bottom-up fashion. We implement our
analyses and transformations with the CIL compiler. To
verify the effect and scalability of our scheme, we
examine 17 benchmarks, including 2 SPECINT 2000
benchmarks with more than 10,000 source lines of code.
Our experiments demonstrate that the static
layout transformations for dynamic memory can reduce
L1D cache misses by 16\% and execution times by 14\% on
average.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "Access patterns; field affinity; layout
transformation; pool allocation; regular expressions",
}
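Purely to illustrate the flavor of the representation: the snippet
below summarizes a dynamic access trace as a repetition-explicit
regular expression and derives a field-affinity count from adjacent
accesses. The paper's analysis is static and interprocedural; nothing
here reproduces it.

import re
from collections import Counter

trace = ["data", "next"] * 4 + ["tail"]    # e.g. a linked-list walk

def summarize(trace):
    s = " ".join(trace) + " "
    # Collapse immediate repetitions of a two-field body: (data next)*
    return re.sub(r"((?:\w+ \w+) )\1+", r"(\1)* ", s).strip()

def affinity(trace):
    return Counter(zip(trace, trace[1:]))  # co-accessed field pairs

print(summarize(trace))
print(affinity(trace).most_common(2))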
@Article{Shobaki:2009:OTS,
author = "Ghassan Shobaki and Kent Wilken and Mark Heffernan",
title = "Optimal trace scheduling using enumeration",
journal = j-TACO,
volume = "5",
number = "4",
pages = "19:1--19:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1498690.1498694",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Mar 18 21:35:33 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article presents the first optimal algorithm for
trace scheduling. The trace is a global scheduling
region used by compilers to exploit instruction-level
parallelism across basic block boundaries. Several
heuristic techniques have been proposed for trace
scheduling, but the precision of these techniques has
not been studied relative to optimality. This article
describes a technique for finding provably optimal
trace schedules, where optimality is defined in terms
of a weighted sum of schedule lengths across all code
paths in a trace. The optimal algorithm uses
branch-and-bound enumeration to efficiently explore the
entire solution space. Experimental evaluation of the
algorithm shows that, with a time limit of 1 second per
problem, 91\% of the hard trace scheduling problems in
the SPEC CPU 2006 Integer Benchmarks are solved
optimally. For 58\% of these hard problems, the optimal
schedule is improved compared to that produced by a
heuristic scheduler with a geometric mean improvement
of 3.2\% in weighted schedule length and 18\% in
compensation code size.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "branch-and-bound enumeration; compiler optimizations;
global instruction scheduling; Instruction scheduling;
instruction-level parallelism; optimal instruction
scheduling; trace scheduling",
}
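The enumeration-plus-bound structure can be shown on a toy objective.
The sketch below minimizes a weighted sum of completion cycles over
topological orders of a dependence DAG on a one-op-per-cycle machine;
this objective is a stand-in for, not the same as, the paper's
weighted schedule length across all trace exits.

def bb_schedule(ops, deps, weight):
    best = {"cost": float("inf"), "order": None}

    def lower_bound(cost, placed):
        # Optimistic: remaining ops finish as early as possible, with
        # the heaviest weights taking the earliest free cycles.
        rest = sorted((weight[o] for o in ops if o not in placed),
                      reverse=True)
        return cost + sum(w * (len(placed) + i + 1)
                          for i, w in enumerate(rest))

    def extend(order, placed, cost):
        if len(order) == len(ops):
            if cost < best["cost"]:
                best.update(cost=cost, order=list(order))
            return
        if lower_bound(cost, placed) >= best["cost"]:
            return                                  # prune this subtree
        for op in ops:
            if op not in placed and deps[op] <= placed:
                order.append(op); placed.add(op)
                extend(order, placed, cost + weight[op] * len(order))
                order.pop(); placed.remove(op)

    extend([], set(), 0)
    return best

deps = {"a": set(), "b": {"a"}, "c": set()}
print(bb_schedule(["a", "b", "c"], deps, {"a": 1, "b": 5, "c": 2}))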
@Article{Kulkarni:2009:PEO,
author = "Prasad A. Kulkarni and David B. Whalley and Gary S.
Tyson and Jack W. Davidson",
title = "Practical exhaustive optimization phase order
exploration and evaluation",
journal = j-TACO,
volume = "6",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1509864.1509865",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 7 14:55:25 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Choosing the most appropriate optimization phase
ordering has been a long-standing problem in compiler
optimizations. Exhaustive evaluation of all possible
orderings of optimization phases for each function is
generally dismissed as infeasible for
production-quality compilers targeting accepted
benchmarks. In this article, we show that it is
possible to exhaustively evaluate the optimization
phase order space for each function in a reasonable
amount of time for most of the functions in our
benchmark suite. To achieve this goal, we used various
techniques to significantly prune the optimization
phase order search space so that it can be
inexpensively enumerated in most cases and reduce the
number of program simulations required to evaluate
program performance for each distinct phase ordering.
The techniques described are applicable to other
compilers in which it is desirable to find the best
phase ordering for most functions in a reasonable
amount of time. We also describe some interesting
properties of the optimization phase order space, which
will prove useful for further studies of related
problems in compilers.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "exhaustive search; iterative compilation; Phase
ordering",
}
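The key pruning idea, collapsing phase sequences that produce
identical function instances, fits in a few lines. Here apply_phase
is a stand-in for running a real optimization phase, and hashing the
code text stands in for the paper's detection of equivalent function
instances.

def explore(code, phases, apply_phase, max_len=4):
    seen = {hash(code)}                 # distinct function instances
    frontier = [code]
    for _ in range(max_len):
        nxt = []
        for c in frontier:
            for p in phases:
                c2 = apply_phase(p, c)
                h = hash(c2)
                if h not in seen:       # prune already-seen instances
                    seen.add(h)
                    nxt.append(c2)
        frontier = nxt
    return len(seen)

# Toy "phases" that rewrite a program represented as a string.
phases = ["cse", "copyprop"]
apply_phase = lambda p, c: c.replace("x+x", "2*x") if p == "cse" else c
print(explore("y=x+x; z=y", phases, apply_phase))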
@Article{Hohenauer:2009:SOF,
author = "Manuel Hohenauer and Felix Engel and Rainer Leupers
and Gerd Ascheid and Heinrich Meyr",
title = "A {SIMD} optimization framework for retargetable
compilers",
journal = j-TACO,
volume = "6",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1509864.1509866",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 7 14:55:25 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Retargetable C compilers are currently widely used to
quickly obtain compiler support for new embedded
processors and to perform early processor architecture
exploration. A partially inherent problem of the
retargetable compilation approach, though, is the
limited code quality as compared to hand-written
compilers or assembly code due to the lack of dedicated
optimization techniques. This problem can be
circumvented by designing flexible, retargetable code
optimization techniques that apply to a certain range
of target architectures. This article focuses on target
machines with SIMD instruction support, a common
feature in embedded processors for multimedia
applications. However, SIMD optimization is known to be
a difficult task since SIMD architectures are largely
nonuniform, support only a limited set of data types
and impose several memory alignment constraints.
Additionally, such techniques require complicated loop
transformations, which are tailored to the SIMD
architecture in order to exhibit the necessary amount
of parallelism in the code. Thus, integrating the SIMD
optimization {\em and\/} the required loop
transformations together in a single retargeting
formalism is an ambitious challenge. In this article,
we present an efficient and quickly retargetable SIMD
code optimization framework that is integrated into an
industrial retargetable C compiler. Experimental
results for different processors demonstrate that the
proposed technique applies to real-life target machines
and that it produces code quality improvements close to
the theoretical limit.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "ASIP; retargetable compilers; SIMD; subword
parallelism; vectorization",
}
@Article{Eyerman:2009:MLP,
author = "Stijn Eyerman and Lieven Eeckhout",
title = "Memory-level parallelism aware fetch policies for
simultaneous multithreading processors",
journal = j-TACO,
volume = "6",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1509864.1509867",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 7 14:55:25 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A thread executing on a simultaneous multithreading
(SMT) processor that experiences a long-latency load
will eventually stall while holding execution
resources. Existing long-latency load aware SMT fetch
policies limit the amount of resources allocated by a
stalled thread by identifying long-latency loads and
preventing the thread from fetching more instructions
--- and in some implementations, instructions beyond
the long-latency load are flushed to release allocated
resources.\par
This article proposes an SMT fetch policy that takes
into account the available memory-level parallelism
(MLP) in a thread. The key idea proposed in this
article is that in case of an isolated long-latency
load (i.e., there is no MLP), the thread should be
prevented from allocating additional resources.
However, in case multiple independent long-latency
loads overlap (i.e., there is MLP), the thread should
allocate as many resources as needed in order to fully
expose the available MLP. MLP-aware fetch policies
achieve better performance for MLP-intensive threads on
SMT processors, leading to higher overall system
throughput and shorter average turnaround time than
previously proposed fetch policies.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "Fetch Policy; Memory-Level Parallelism (MLP);
Simultaneous Multithreading (SMT)",
}
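The policy's core decision can be phrased as a tiny gate function.
The MLP-distance input would come from an MLP predictor, and the
thread names and numbers below are invented for illustration.

def fetch_gate(mlp_distance):
    # Return how many more instructions the thread may fetch after a
    # long-latency load miss is detected.
    if mlp_distance <= 1:      # isolated miss: stall, free resources
        return 0
    return mlp_distance        # fetch far enough to expose all misses

for thread, dist in [("mcf", 12), ("gzip", 1)]:
    print(thread, "may fetch", fetch_gate(dist), "more instructions")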
@Article{Strozek:2009:EAE,
author = "Lukasz Strozek and David Brooks",
title = "Energy- and area-efficient architectures through
application clustering and architectural
heterogeneity",
journal = j-TACO,
volume = "6",
number = "1",
pages = "4:1--4:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1509864.1509868",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 7 14:55:25 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Customizing architectures for particular applications
is a promising approach to yield highly
energy-efficient designs for embedded systems. This
work explores the benefits of architectural
customization for a class of embedded architectures
typically used in energy- and area-constrained
application domains, such as sensor nodes and
multimedia processing. We implement a process flow that
performs an automatic synthesis and evaluation of the
different architectures based on runtime profiles of
applications and determines an efficient architecture,
with consideration for both energy and area
constraints. An expressive architectural model, used by
our engine, is introduced that takes advantage of
efficient opcode allocation, several memory addressing
modes, and operand types. By profiling embedded
benchmarks from a variety of sensor and multimedia
applications, we show that the energy savings resulting
from various architectural optimizations relative to
the base architectures (e.g., MIPS and MSP430) are
significant and can reach 50\%, depending on the
application. We then identify the set of architectures
that achieves near-optimal savings for a group of
applications. Finally, we propose the use of
heterogeneous ISA processors implementing those
architectures as a solution to capitalize on energy
savings provided by application customization while
executing a range of applications efficiently.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "Efficient custom architectures; heterogeneous ISA
processors",
}
@Article{Venkataramani:2009:MAM,
author = "Guru Venkataramani and Ioannis Doudalis and Yan
Solihin and Milos Prvulovic",
title = "{MemTracker}: {An} accelerator for memory debugging
and monitoring",
journal = j-TACO,
volume = "6",
number = "2",
pages = "5:1--5:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1543753.1543754",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jul 2 12:32:04 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Memory bugs are a broad class of bugs that is becoming
increasingly common with increasing software
complexity, and many of these bugs are also security
vulnerabilities. Existing software and hardware
approaches for finding and identifying memory bugs have
a number of drawbacks, including considerable
performance overheads, coverage of only a specific type
of bug, implementation cost, and inefficient use of
computational resources.\par
This article describes MemTracker, a new hardware
support mechanism that can be configured to perform
different kinds of memory access monitoring tasks.
MemTracker associates each word of data in memory with
a few bits of state, and uses a programmable state
transition table to react to different events that can
affect this state. The number of state bits per word,
the events to which MemTracker reacts, and the
transition table are all fully programmable.
MemTracker's rich set of states, events, and
transitions can be used to implement different
monitoring and debugging checkers with minimal
performance overheads, even when frequent state updates
are needed. To evaluate MemTracker, we map three
different checkers onto it, as well as a checker that
combines all three. For the most demanding (combined)
checker with 8 bits of state per memory word, we observe
performance overheads of only around 3\%, on average,
and 14.5\% worst-case across different benchmark
suites. Such low overheads allow continuous (always-on)
use of MemTracker-enabled checkers, even in production
runs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "Accelerator; debugging; memory access monitoring",
}
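MemTracker's programmability amounts to a (state, event) -> (next
state, trap) table over a few state bits per word. Below is a
software model programmed as a heap checker (unallocated /
allocated-but-uninitialized / initialized); the encodings are
illustrative, not the paper's.

UNALLOC, UNINIT, INIT = 0, 1, 2
TABLE = {
    (UNALLOC, "alloc"): (UNINIT, False),
    (UNALLOC, "free"):  (UNALLOC, True),   # double/invalid free: trap
    (UNALLOC, "load"):  (UNALLOC, True),   # access to unallocated: trap
    (UNALLOC, "store"): (UNALLOC, True),
    (UNINIT,  "store"): (INIT,    False),
    (UNINIT,  "load"):  (UNINIT,  True),   # read before write: trap
    (UNINIT,  "free"):  (UNALLOC, False),
    (INIT,    "load"):  (INIT,    False),
    (INIT,    "store"): (INIT,    False),
    (INIT,    "free"):  (UNALLOC, False),
}

state = {}                                 # word address -> state bits

def access(addr, event):
    s = state.get(addr, UNALLOC)
    nxt, trap = TABLE[(s, event)]
    state[addr] = nxt
    if trap:
        print(f"checker trap: {event} at {addr:#x} in state {s}")

access(0x1000, "alloc")
access(0x1000, "load")                     # fires read-before-write trap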
@Article{Gabor:2009:SLA,
author = "Ron Gabor and Avi Mendelson and Shlomo Weiss",
title = "Service level agreement for multithreaded processors",
journal = j-TACO,
volume = "6",
number = "2",
pages = "6:1--6:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1543753.1543755",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jul 2 12:32:04 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Multithreading is widely used to increase processor
throughput. As the number of shared resources increases,
managing them while guaranteeing predictable performance
becomes a major problem. Attempts have been made in
previous work to ease this via different fairness
mechanisms. In this article, we present a new approach
to control the resource allocation and sharing via a
service level agreement (SLA)-based mechanism; that is,
via an agreement in which multithreaded processors
guarantee a minimal level of service to the running
threads. We introduce a new metric, {\em C\/}$_{SLA}$,
for conformance to SLA in multithreaded processors and
show that controlling resources with SLA allows
for higher gains than are achievable by previously
suggested fairness techniques. It also permits
improving one metric (e.g., power) while maintaining
SLA in another (e.g., performance). We compare SLA
enforcement to schemes based on other fairness metrics,
which are mostly targeted at equalizing execution
parameters. We show that using SLA rather than fairness
based algorithms provides a range of acceptable
execution points from which we can select the point
that best fits our optimization target, such as
maximizing the weighted speedup (sum of the speedups of
the individual threads) or reducing power. We
demonstrate the effectiveness of the new SLA approach
using switch-on-event (coarse-grained) multithreading.
Our weighted speedup improvement scheme successfully
enforces SLA while improving the weighted speedup by an
average of 10\% for unbalanced threads. This result is
significant when compared with performance losses that
may be incurred by fairness enforcement methods. When
optimizing for power reduction in unbalanced threads
SLA enforcement reduces the power by an average of
15\%. SLA may be complemented by other power reduction
methods to achieve further power savings {\em and\/}
maintain the same service level for the threads. We
also demonstrate differentiated SLA, where weighted
speedup is maximized while each thread may have a
different throughput constraint.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "fairness; performance; power; Service level agreement;
throughput",
}
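One way to picture SLA conformance: each thread is guaranteed a
minimum fraction of its standalone performance, and any operating
point satisfying every guarantee is acceptable, leaving the
controller free to optimize a global target. The metric form below is
an illustrative stand-in for the paper's C_SLA, with made-up numbers.

def conforms(ipc_mt, ipc_alone, guaranteed_fraction):
    return all(m / a >= g for m, a, g in
               zip(ipc_mt, ipc_alone, guaranteed_fraction))

ipc_alone = [2.0, 1.0]        # standalone IPC per thread
ipc_mt    = [1.3, 0.45]       # IPC when coscheduled
sla       = [0.6, 0.4]        # differentiated per-thread guarantees
print("SLA met:", conforms(ipc_mt, ipc_alone, sla))
print("weighted speedup:",
      sum(m / a for m, a in zip(ipc_mt, ipc_alone)))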
@Article{Fung:2009:DWF,
author = "Wilson W. L. Fung and Ivan Sham and George Yuan and
Tor M. Aamodt",
title = "Dynamic warp formation: {Efficient MIMD} control flow
on {SIMD} graphics hardware",
journal = j-TACO,
volume = "6",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1543753.1543756",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jul 2 12:32:04 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Recent advances in graphics processing units (GPUs)
have resulted in massively parallel hardware that is
easily programmable and widely available in today's
desktop and notebook computer systems. GPUs typically
use single-instruction, multiple-data (SIMD) pipelines
to achieve high performance with minimal overhead for
control hardware. Scalar threads running the same
computing kernel are grouped together into SIMD
batches, sometimes referred to as warps. While SIMD is
ideally suited for simple programs, recent GPUs include
control flow instructions in the GPU instruction set
architecture and programs using these instructions may
experience reduced performance due to the way branch
execution is supported in hardware. One solution is to
add a stack to allow different SIMD processing elements
to execute distinct program paths after a branch
instruction. The occurrence of diverging branch
outcomes for different processing elements
significantly degrades performance using this approach.
In this article, we propose dynamic warp formation and
scheduling, a mechanism for more efficient SIMD branch
execution on GPUs. It dynamically regroups threads into
new warps on the fly following the occurrence of
diverging branch outcomes. We show that a realistic
hardware implementation of this mechanism improves
performance by 13\%, on average, with 256 threads per
core, 24\% with 512 threads, and 47\% with 768 threads
for an estimated area increase of 8\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "control flow; fine-grained multithreading; GPU; SIMD",
}
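The regrouping step itself is simple: bucket scalar threads by their
next PC and refill warps from each bucket. The warp width and thread
representation below are illustrative.

from collections import defaultdict

WARP_WIDTH = 4

def reform_warps(threads):
    # threads: list of (thread_id, next_pc) after a divergent branch.
    by_pc = defaultdict(list)
    for tid, pc in threads:
        by_pc[pc].append(tid)
    warps = []
    for pc, tids in by_pc.items():
        for i in range(0, len(tids), WARP_WIDTH):
            warps.append((pc, tids[i:i + WARP_WIDTH]))
    return warps

diverged = [(0, 0x40), (1, 0x80), (2, 0x40), (3, 0x80),
            (4, 0x40), (5, 0x40), (6, 0x80), (7, 0x80)]
print(reform_warps(diverged))   # two full warps instead of four half-full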
@Article{Koh:2009:TPV,
author = "Cheng-Kok Koh and Weng-Fai Wong and Yiran Chen and Hai
Li",
title = "Tolerating process variations in large,
set-associative caches: {The} buddy cache",
journal = j-TACO,
volume = "6",
number = "2",
pages = "8:1--8:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1543753.1543757",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jul 2 12:32:04 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "One important trend in today's microprocessor
architectures is the increase in size of the processor
caches. These caches also tend to be set associative.
As technology scales, process variations are expected
to increase the fault rates of the SRAM cells that
compose such caches. As an important component of the
processor, the parametric yield of SRAM cells is
crucial to the overall performance and yield of the
microchip. In this article, we propose a
microarchitectural solution, called the buddy cache,
that permits large, set-associative caches to tolerate
faults in SRAM cells due to process variations. In
essence, instead of disabling a faulty cache block in a
set (as is the current practice), it is paired with
another faulty cache block in the same set --- the
buddy. Although both cache blocks are faulty, if the
faults of the two blocks do not overlap, then instead
of losing two blocks, buddying will yield a functional
block from the nonfaulty portions of the two blocks. We
found that with buddying, caches can better mitigate
the negative impacts of process variations on
performance and yield, gracefully downgrading
performance as opposed to catastrophic failure. We will
describe the details of the buddy cache and give
insights as to why it is both more performance and
yield resilient to faults.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "caches; fault recovery; memory structures; Processor
architectures",
}
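The pairing condition at the heart of buddying is a bitmask
disjointness test: two faulty blocks in the same set can form one
functional block when no byte position is faulty in both. The layout
below is illustrative.

def can_buddy(faults_a: int, faults_b: int) -> bool:
    return (faults_a & faults_b) == 0      # no common faulty byte

def read_byte(i, data_a, data_b, faults_a):
    # Serve byte i from whichever buddy is fault-free at position i.
    return data_b[i] if (faults_a >> i) & 1 else data_a[i]

a, b = 0b0011, 0b0100                      # faulty-byte masks
print(can_buddy(a, b))                     # True: pairable
print(can_buddy(a, 0b0010))                # False: overlap at byte 1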
@Article{Li:2009:CDS,
author = "Lian Li and Hui Feng and Jingling Xue",
title = "Compiler-directed scratchpad memory management via
graph coloring",
journal = j-TACO,
volume = "6",
number = "3",
pages = "9:1--9:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1582710.1582711",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Oct 1 09:20:47 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Scratchpad memory (SPM), a fast on-chip SRAM managed
by software, is widely used in embedded systems. This
article introduces a general-purpose compiler approach,
called memory coloring, to assign static data
aggregates, such as arrays and structs, in a program to
an SPM. The novelty of this approach lies in
partitioning the SPM into a pseudo-register file (with
interchangeable and aliased registers), splitting the
live ranges of data aggregates to create potential data
transfer statements between SPM and off-chip memory,
and finally, adapting an existing graph coloring
algorithm for register allocation to assign the data
aggregates to the pseudo-register file. Our
experimental results using a set of 10 C benchmarks
from MediaBench and MiBench show that our methodology
is capable of managing SPMs efficiently and effectively
for large embedded applications. In addition, our SPM
allocator can obtain close to optimal solutions when
evaluated and compared against an existing
heuristics-based SPM allocator and an ILP-based SPM
allocator.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "graph coloring; live range splitting; memory
allocation; memory coloring; register coalescing;
Scratchpad memory; software-managed cache",
}
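After the SPM is partitioned into a pseudo-register file, the final
assignment is graph coloring over interfering live ranges. The greedy
coloring below is a minimal stand-in for the full register-allocation
machinery (live-range splitting, coalescing) that the paper adapts.

def color(interference, num_colors):
    # interference: dict mapping each array to the set of arrays whose
    # live ranges overlap with it; colors are SPM "registers".
    assignment = {}
    for node in sorted(interference, key=lambda n: -len(interference[n])):
        used = {assignment[m] for m in interference[node]
                if m in assignment}
        free = [c for c in range(num_colors) if c not in used]
        assignment[node] = free[0] if free else None  # None: off-chip
    return assignment

g = {"A": {"B"}, "B": {"A", "C"}, "C": {"B"}}
print(color(g, num_colors=2))   # A and C can share one SPM slot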
@Article{Golander:2009:CAR,
author = "Amit Golander and Shlomo Weiss",
title = "Checkpoint allocation and release",
journal = j-TACO,
volume = "6",
number = "3",
pages = "10:1--10:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1582710.1582712",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Oct 1 09:20:47 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Out-of-order speculative processors need a bookkeeping
method to recover from incorrect speculation. In recent
years, several microarchitectures that employ
checkpoints have been proposed, either extending the
reorder buffer or entirely replacing it. This work
presents an in-depth study of checkpointing in
checkpoint-based microarchitectures, from the desired
content of a checkpoint, through implementation
trade-offs, to checkpoint allocation and release
policies. A
major contribution of the article is a novel adaptive
checkpoint allocation policy that outperforms known
policies. The adaptive policy controls checkpoint
allocation according to dynamic events, such as
second-level cache misses and rollback history. It
achieves 6.8\% and 2.2\% speedup for the integer and
floating point benchmarks, respectively, and does not
require a branch confidence estimator. The results show
that the proposed adaptive policy achieves most of the
potential of an oracle policy whose performance
improvement is 9.8\% and 3.9\% for the integer and
floating point benchmarks, respectively. We exploit
known techniques for saving leakage power by adapting
and applying them to checkpoint-based
microarchitectures. The proposed applications combine
to reduce the leakage power of the register file to
about one half of its original value.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "Checkpoint; early register release; leakage;
misprediction; out-of-order execution; rollback",
}
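The adaptive policy's decision inputs (pending second-level cache
misses, rollback history, distance since the last checkpoint) suggest
a gate like the following; the thresholds are invented for
illustration and are not the paper's tuned values.

def should_checkpoint(free_ckpts, l2_miss_pending, recent_rollbacks,
                      insns_since_last_ckpt):
    if free_ckpts == 0:
        return False
    if l2_miss_pending:                 # long rollback would be costly
        return True
    if recent_rollbacks > 2:            # region mispredicts often
        return True
    return insns_since_last_ckpt > 256  # cap worst-case replay distance

print(should_checkpoint(3, False, 4, 100))   # True: rollback history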
@Article{Xu:2009:TXP,
author = "Weifeng Xu and Russell Tessier",
title = "{Tetris-XL}: a performance-driven spill reduction
technique for embedded {VLIW} processors",
journal = j-TACO,
volume = "6",
number = "3",
pages = "11:1--11:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1582710.1582713",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Oct 1 09:20:47 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As technology has advanced, the application space of
Very Long Instruction Word (VLIW) processors has grown
to include a variety of embedded platforms. Due to cost
and power consumption constraints, many embedded VLIW
processors contain limited resources, including
registers. As a result, a VLIW compiler that maximizes
instruction level parallelism (ILP) without considering
register constraints may generate excessive register
spills, leading to reduced overall system performance.
To address this issue, this article presents a new
spill reduction technique that improves VLIW runtime
performance by reordering operations prior to register
allocation and instruction scheduling. Unlike earlier
algorithms, our approach explicitly considers both
register reduction and data dependency in performing
operation reordering. Data dependency control limits
unexpected schedule length increases during subsequent
instruction scheduling. Our technique has been
implemented in Trimaran, an academic VLIW compiler, and
evaluated using a set of embedded systems benchmarks.
Experimental results show that, on average,
this technique improves VLIW performance by 10\% for
VLIW processors with 32 registers and 8 functional
units compared with previous spill reduction
techniques. Limited improvement is seen versus prior
approaches for VLIW processors with 64 registers and 8
functional units.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "instruction level parallelism; Register pressure; Very
Long Instruction Word (VLIW) processor",
}
@Article{Jones:2009:ELE,
author = "Timothy M. Jones and Michael F. P. O'Boyle and Jaume
Abella and Antonio Gonz{\'a}lez and O{\u{g}}uz Ergin",
title = "Exploring the limits of early register release:
{Exploiting} compiler analysis",
journal = j-TACO,
volume = "6",
number = "3",
pages = "12:1--12:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1582710.1582714",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Oct 1 09:20:47 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Register pressure in modern superscalar processors can
be reduced by releasing registers early and by copying
their contents to cheap back-up storage. This article
quantifies the potential benefits of register occupancy
reduction and shows that existing hardware-based
schemes typically achieve only a small fraction of this
potential. This is because they are unable to
accurately determine the last use of a register and
must wait until the redefining instruction enters the
pipeline. On the other hand, compilers have a global
view of the program and, using simple dataflow
analysis, can determine the last use. This article
evaluates the extent to which compiler analysis can aid
early releasing, explores the design space, and
introduces commit and issue-based early releasing
schemes, quantifying their benefits. Using simple
compiler analysis and microarchitecture changes, we
achieve 70\% of the potential register file occupancy
reduction. By adding more hardware support, we can
increase this to 94\%. Our schemes are compared to
state-of-the-art approaches for varying register file
sizes and are shown to outperform these existing
techniques.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "compiler; energy efficiency; Low-power design;
microarchitecture; register file",
}
@Article{Jones:2009:EER,
author = "Timothy M. Jones and Michael F. P. O'Boyle and Jaume
Abella and Antonio Gonz{\'a}lez and O{\u{g}}uz Ergin",
title = "Energy-efficient register caching with compiler
assistance",
journal = j-TACO,
volume = "6",
number = "4",
pages = "13:1--13:??",
month = oct,
year = "2009",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Mar 15 18:49:43 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2009:TUC,
author = "Weijia Li and Youtao Zhang and Jun Yang and Jiang
Zheng",
title = "Towards update-conscious compilation for
energy-efficient code dissemination in {WSNs}",
journal = j-TACO,
volume = "6",
number = "4",
pages = "14:1--14:??",
month = oct,
year = "2009",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Mar 15 18:49:43 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wegiel:2009:SRC,
author = "Michal Wegiel and Chandra Krintz",
title = "The single-referent collector: {Optimizing} compaction
for the common case",
journal = j-TACO,
volume = "6",
number = "4",
pages = "15:1--15:??",
month = oct,
year = "2009",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Mar 15 18:49:43 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Subramaniam:2009:DOS,
author = "Samantika Subramaniam and Gabriel H. Loh",
title = "Design and optimization of the store vectors memory
dependence predictor",
journal = j-TACO,
volume = "6",
number = "4",
pages = "16:1--16:??",
month = oct,
year = "2009",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Mar 15 18:49:43 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2010:PAM,
author = "Xiaohang Wang and Mei Yang and Yingtao Jiang and Peng
Liu",
title = "A power-aware mapping approach to map {IP} cores onto
{NoCs} under bandwidth and latency constraints",
journal = j-TACO,
volume = "7",
number = "1",
pages = "1:1--1:??",
month = apr,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1736065.1736066",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 5 15:38:13 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this article, we investigate the Intellectual
Property (IP) mapping problem that maps a given set of
IP cores onto the tiles of a mesh-based Network-on-Chip
(NoC) architecture such that the power consumption due
to intercore communications is minimized. This IP
mapping problem is considered under both bandwidth and
latency constraints as imposed by the applications and
the on-chip network infrastructure. By examining
various applications' communication characteristics
extracted from their respective communication trace
graphs, two distinguishable connectivity templates are
realized: the graphs with tightly coupled vertices and
those with distributed vertices. These two templates
are formally defined in this article, and different
mapping heuristics are subsequently developed to map
them. In general, tightly coupled vertices are mapped
onto tiles that are physically close to each other
while the distributed vertices are mapped following a
graph partition scheme. Experimental results on both
random and multimedia benchmarks have confirmed that
the proposed template-based mapping algorithm achieves
an average of 15\% power savings as compared with MOCA,
a fast greedy-based mapping algorithm. Compared with a
branch-and-bound--based mapping algorithm, which
produces near optimal results but incurs an extremely
high computation cost, the proposed algorithm, due to
its polynomial runtime complexity, can generate
results of almost the same quality with much less CPU
time. As the on-chip network size increases, the
superiority of the proposed algorithm becomes more
evident.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "bandwidth and latency constraints; IP mapping; Low
power; network-on-chip (NoC)",
}
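The objective being minimized can be shown concretely: communication
power modeled as traffic volume times Manhattan hop distance on the
mesh. The bandwidth/latency constraints and the template-specific
heuristics are omitted; the cores, tiles, and volumes are invented.

def hops(t1, t2):
    return abs(t1[0] - t2[0]) + abs(t1[1] - t2[1])

def comm_cost(mapping, flows):
    # mapping: core -> (x, y) tile; flows: (src, dst) -> traffic volume.
    return sum(vol * hops(mapping[s], mapping[d])
               for (s, d), vol in flows.items())

flows = {("cpu", "dsp"): 100, ("dsp", "mem"): 80, ("cpu", "mem"): 5}
tight = {"cpu": (0, 0), "dsp": (0, 1), "mem": (1, 1)}
loose = {"cpu": (0, 0), "dsp": (1, 1), "mem": (0, 1)}
print(comm_cost(tight, flows), "<", comm_cost(loose, flows))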
@Article{Chen:2010:HSF,
author = "Zhong-Ho Chen and Alvin W. Y. Su",
title = "A hardware\slash software framework for instruction
and data scratchpad memory allocation",
journal = j-TACO,
volume = "7",
number = "1",
pages = "2:1--2:??",
month = apr,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1736065.1736067",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 5 15:38:13 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Previous researches show that a scratchpad memory
device consumed less energy than a cache device with
the same capacity. In this article, we locate the
scratchpad memory (SPM) in the top level of the memory
hierarchy to reduce the energy consumption. To take the
advantage of a SPM, we address two issues of utilizing
a SPM. First, the program's locality should be
improved. The second issue is SPM management. To tackle
these two issues, we present a hardware/software
framework for dynamically allocating both instructions
and data in SPM. The software flow could be divided
into three phases: locality improving, locality
extraction, and runtime SPM management. Without
modifying the original compiler and the source code, we
improve the locality of a program. An optimization
algorithm is proposed to extract the SPM allocations.
At runtime, an SPM management program is employed. In
hardware, an address translation logic (ATL) is
proposed to reduce the overhead of SPM
management.\par
The results show that the proposed framework can reduce
energy delay product (EDP) by 63\%, on average, when
compared with the traditional cache architecture. The
reduction in EDP is contributed by properly allocating
both instructions and data in SPM. By allocating only
instructions in SPM, the EDPs are reduced by 45\%, on
average. By allocating only data in SPM, the EDPs are
reduced by 14\%, on average.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "allocation algorithm; Memory allocation; scratchpad
memory",
}
@Article{Woo:2010:CVI,
author = "Dong Hyuk Woo and Joshua B. Fryman and Allan D. Knies
and Hsien-Hsin S. Lee",
title = "{Chameleon}: {Virtualizing} idle acceleration cores of
a heterogeneous multicore processor for caching and
prefetching",
journal = j-TACO,
volume = "7",
number = "1",
pages = "3:1--3:??",
month = apr,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1736065.1736068",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 5 15:38:13 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Heterogeneous multicore processors have emerged as an
energy- and area-efficient architectural solution to
improving performance for domain-specific applications
such as those with a plethora of data-level
parallelism. These processors typically contain a large
number of small, compute-centric cores for acceleration
while keeping one or two high-performance ILP cores on
the die to guarantee single-thread performance.
Although a major portion of the transistors are
occupied by the acceleration cores, these resources
will sit idle when running unparallelized legacy codes
or the sequential part of an application. To address
this underutilization issue, in this article, we
introduce Chameleon, a flexible heterogeneous multicore
architecture to virtualize these resources for
enhancing memory performance when running sequential
programs. The Chameleon architecture can dynamically
virtualize the idle acceleration cores into a
last-level cache, a data prefetcher, or a hybrid
between these two techniques. In addition, Chameleon
can operate in an adaptive mode that dynamically
configures the acceleration cores between the hybrid
mode and the prefetch-only mode by monitoring the
effectiveness of the Chameleon cache mode. In our
evaluation with SPEC2006 benchmark suite, different
levels of performance improvements were achieved in
different modes for different applications. In the case
of the adaptive mode, Chameleon improves the
performance of SPECint06 and SPECfp06 by 31\% and 15\%,
on average. When considering only memory-intensive
applications, Chameleon improves the system performance
by 50\% and 26\% for SPECint06 and SPECfp06,
respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "cache; Heterogeneous multicore; idle core;
prefetching",
}
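The adaptive mode reduces to a periodic measure-and-reconfigure loop
over the virtualized acceleration cores. The sampling interface and
threshold below are assumptions, not the paper's mechanism.

def next_mode(cache_hits, cache_lookups, threshold=0.05):
    # Reconfigure based on the measured usefulness of the cache mode.
    hit_rate = cache_hits / max(cache_lookups, 1)
    return "hybrid" if hit_rate >= threshold else "prefetch-only"

for hits, lookups in [(900, 10_000), (120, 10_000)]:
    print(hits / lookups, "->", next_mode(hits, lookups))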
@Article{Sanchez:2010:ACI,
author = "Daniel Sanchez and George Michelogiannakis and
Christos Kozyrakis",
title = "An analysis of on-chip interconnection networks for
large-scale chip multiprocessors",
journal = j-TACO,
volume = "7",
number = "1",
pages = "4:1--4:??",
month = apr,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1756065.1736069",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 5 15:38:13 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With the number of cores of chip multiprocessors
(CMPs) rapidly growing as technology scales down,
connecting the different components of a CMP in a
scalable and efficient way becomes increasingly
challenging. In this article, we explore the
architectural-level implications of interconnection
network design for CMPs with up to 128 fine-grain
multithreaded cores. We evaluate and compare different
network topologies using accurate simulation of the
full chip, including the memory hierarchy and
interconnect, and using a diverse set of scientific and
engineering workloads.\par
We find that the interconnect has a large impact on
performance, as it is responsible for 60\% to 75\% of
the miss latency. Latency, and not bandwidth, is the
primary performance constraint, since, even with many
threads per core and workloads with high miss rates,
networks with enough bandwidth can be efficiently
implemented for the system scales we consider. From the
topologies we study, the flattened butterfly
consistently outperforms the mesh and fat tree on all
workloads, leading to performance advantages of up to
22\%. We also show that considering interconnect and
memory hierarchy together when designing large-scale
CMPs is crucial, and neglecting either of the two can
lead to incorrect conclusions. Finally, the effect of
the interconnect on overall performance becomes more
important as the number of cores increases, making
interconnection choices especially critical when
scaling up.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "chip multiprocessors; hierarchical networks;
Networks-on-chip",
}
@Article{Zhou:2010:PAT,
author = "Xiuyi Zhou and Jun Yang and Marek Chrobak and Youtao
Zhang",
title = "Performance-aware thermal management via task
scheduling",
journal = j-TACO,
volume = "7",
number = "1",
pages = "5:1--5:??",
month = apr,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1746065.1736070",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 5 15:38:13 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "High on-chip temperature impairs the processor's
reliability and reduces its lifetime. Hardware-level
dynamic thermal management (DTM) techniques can
effectively constrain the chip temperature, but they
degrade performance. We propose an OS-level
technique that performs thermal-aware job scheduling to
reduce DTMs. The algorithm is based on the observation
that hot and cool jobs executed in a different order
can make a difference in resulting temperature.
Real-system implementation in Linux shows that our
scheduler can remove 10.5\% to 73.6\% of the hardware
DTMs in a medium thermal environment. The CPU
throughput is improved by up to 7.6\% (4.1\%, on
average) in a severe thermal environment.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "task scheduling; Thermal management",
}
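The scheduling observation is easy to reproduce with a toy thermal
model: alternating hot and cool jobs keeps the peak temperature lower
than running the hot jobs back to back. The constants below are made
up; only the ordering effect matters.

def peak_temp(order, t0=50.0):
    heat = {"hot": 8.0, "cool": -5.0}  # made-up per-job temperature deltas
    t, peak = t0, t0
    for job in order:
        t = max(t + heat[job], t0)     # never cool below ambient
        peak = max(peak, t)
    return peak

grouped = ["hot", "hot", "hot", "cool", "cool", "cool"]
interleaved = ["hot", "cool"] * 3
print("grouped:", peak_temp(grouped),
      "interleaved:", peak_temp(interleaved))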
@Article{Raghavan:2010:TTP,
author = "Arun Raghavan and Colin Blundell and Milo M. K.
Martin",
title = "Token tenure and {PATCH}: a predictive\slash adaptive
token-counting hybrid",
journal = j-TACO,
volume = "7",
number = "2",
pages = "6:1--6:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1839667.1839668",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 2 18:05:46 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Traditional coherence protocols present a set of
difficult trade-offs: the reliance of snoopy protocols
on broadcast and ordered interconnects limits their
scalability, while directory protocols incur a
performance penalty on sharing misses due to
indirection. This work introduces Patch
(Predictive/Adaptive Token-Counting Hybrid), a
coherence protocol that provides the scalability of
directory protocols while opportunistically sending
direct requests to reduce sharing latency. Patch
extends a standard directory protocol to track tokens
and use token-counting rules for enforcing coherence
permissions. Token counting allows Patch to support
direct requests on an unordered interconnect, while a
mechanism called {\em token tenure\/} provides
broadcast-free forward progress using the directory
protocol's per-block point of ordering at the home
along with either timeouts at requesters or explicit
race notification messages.\par
Patch makes three main contributions. First, Patch
introduces token tenure, which provides broadcast-free
forward progress for token-counting protocols. Second,
Patch deprioritizes best-effort direct requests to
match or exceed the performance of directory protocols
without restricting scalability. Finally, Patch
provides greater scalability than directory protocols
when using inexact encodings of sharers because only
processors holding tokens need to acknowledge requests.
Overall, Patch is a ``one-size-fits-all'' coherence
protocol that dynamically adapts to work well for small
systems, large systems, and anywhere in between.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "adaptive; bandwidth-efficiency; Cache coherence
protocol; predictive; token coherence",
}
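The token-counting rules that Patch layers onto the directory
protocol are compact enough to state as executable checks: reading
requires holding at least one of a block's T tokens, writing requires
holding all T. Token tenure, the home ordering point, and messaging
are omitted here.

T = 16                                   # tokens per block

def may_read(tokens_held):
    return tokens_held >= 1

def may_write(tokens_held):
    return tokens_held == T              # all tokens => exclusive

print(may_read(1), may_write(1))         # True False: shared copy
print(may_read(T), may_write(T))         # True True: exclusive copy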
@Article{Wimmer:2010:AFD,
author = "Christian Wimmer and Hanspeter M{\"o}ssenb{\"o}sck",
title = "Automatic feedback-directed object fusing",
journal = j-TACO,
volume = "7",
number = "2",
pages = "7:1--7:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1839667.1839669",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 2 18:05:46 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Object fusing is an optimization that embeds certain
referenced objects into their referencing object. The
order of objects on the heap is changed in such a way
that objects that are accessed together are placed next
to each other in memory. Their offset is then fixed,
that is, the objects are colocated, allowing field
loads to be replaced by address arithmetic. Array
fusing specifically optimizes arrays, which are
frequently used for the implementation of dynamic data
structures. Therefore, the length of arrays often
varies, and fields referencing such arrays have to be
changed. An efficient code pattern detects these
changes and allows the optimized access of such
fields.\par
We integrated these optimizations into Sun
Microsystems' Java HotSpot\TM{} VM. The analysis is
performed automatically at runtime, requires no actions
on the part of the programmer, and supports dynamic
class loading. To safely eliminate a field load, the
colocation of the object that holds the field and the
object that is referenced by the field must be
guaranteed. Two preconditions must be satisfied: The
objects must be allocated at the same time, and the
field must not be overwritten later. These
preconditions are checked by the just-in-time compiler
to avoid an interprocedural data flow analysis. The
garbage collector ensures that groups of colocated
objects are not split by copying groups as a whole. The
evaluation shows that the dynamic approach successfully
identifies and optimizes frequently accessed fields for
several benchmarks with a low compilation and analysis
overhead. It leads to a speedup of up to 76\% for
simple benchmarks and up to 6\% for complex
workloads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "cache performance; garbage collection; Java;
just-in-time compilation; object colocation; object
fusing; object inlining; optimization",
}
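
A minimal sketch of the colocation idea above, assuming a toy flat heap
addressed by integers (names invented; the real optimization is applied
by the JIT compiler on machine code, not in Python):

    # Toy model of object colocation: a parent object and its fused child
    # are allocated adjacently, so the child's address is always
    # parent_addr + CHILD_OFFSET and the field load that would normally
    # fetch the reference can be replaced by address arithmetic.
    HEAP = {}
    PARENT_SIZE = 2             # header + one field, in abstract slots
    CHILD_OFFSET = PARENT_SIZE  # the child starts right after the parent

    def allocate_fused(addr, parent_fields, child_fields):
        # both objects are allocated at the same time (precondition 1)
        HEAP[addr] = parent_fields
        HEAP[addr + CHILD_OFFSET] = child_fields
        return addr

    p = allocate_fused(100, {"x": 1}, {"len": 3})
    # optimized access: no load of the reference field, just arithmetic
    child = HEAP[p + CHILD_OFFSET]
    assert child["len"] == 3
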
@Article{Lee:2010:AIC,
author = "Benjamin C. Lee and David Brooks",
title = "Applied inference: {Case} studies in
microarchitectural design",
journal = j-TACO,
volume = "7",
number = "2",
pages = "8:1--8:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1839667.1839670",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 2 18:05:46 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We propose and apply a new simulation paradigm for
microarchitectural design evaluation and optimization.
This paradigm enables more comprehensive design studies
by combining spatial sampling and statistical
inference. Specifically, this paradigm (i) defines a
large, comprehensive design space, (ii) samples points
from the space for simulation, and (iii) constructs
regression models based on sparse simulations. This
approach greatly improves the computational efficiency
of microarchitectural simulation and enables new
capabilities in design space exploration.\par
We illustrate new capabilities in three case studies
for a large design space of approximately 260,000
points: (i) Pareto frontier, (ii) pipeline depth, and
(iii) multiprocessor heterogeneity analyses. In
particular, regression models are exhaustively
evaluated to identify Pareto optimal designs that
maximize performance for given power budgets. These
models enable pipeline depth studies in which all
parameters vary simultaneously with depth, thereby more
effectively revealing interactions with nondepth
parameters. Heterogeneity analysis combines
regression-based optimization with clustering
heuristics to identify efficient design compromises
between similar optimal architectures. These
compromises are potential core designs in a
heterogeneous multicore architecture. Increasing
heterogeneity can improve {\em bips\/}$^3$\slash {\em w\/}
efficiency by as much as $2.4\times$, a theoretical
upper bound on heterogeneity benefits that neglects
contention between shared resources as well as design
complexity. Collectively these studies demonstrate
regression models' ability to expose trends and
identify optima in diverse design regions, motivating
the application of such models in statistical inference
for more effective use of modern simulator
infrastructure.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "Microarchitecture; regression; simulation;
statistics",
}
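
The {\em bips\/}$^3$\slash {\em w\/} metric cubes performance against
power, behaving roughly like an inverse energy-delay-squared product. A
minimal Python sketch of the Pareto-frontier step over sampled
(performance, power) design points (data invented; the article filters
regression-model predictions, not measurements):

    # Keep designs for which no other design offers >= performance
    # at <= power: those are the Pareto-optimal points.
    def pareto_frontier(designs):
        frontier = []
        for perf, power in designs:
            dominated = any(p2 >= perf and w2 <= power
                            and (p2, w2) != (perf, power)
                            for p2, w2 in designs)
            if not dominated:
                frontier.append((perf, power))
        return sorted(frontier)

    samples = [(1.0, 10.0), (1.2, 12.0), (0.9, 13.0),
               (1.5, 20.0), (1.4, 25.0)]
    print(pareto_frontier(samples))  # (0.9,13) and (1.4,25) are dominated
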
@Article{Rakvic:2010:TMT,
author = "R. Rakvic and Q. Cai and J. Gonz{\'a}lez and G.
Magklis and P. Chaparro and A. Gonz{\'a}lez",
title = "Thread-management techniques to maximize efficiency in
multicore and simultaneous multithreaded
microprocessors",
journal = j-TACO,
volume = "7",
number = "2",
pages = "9:1--9:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1839667.1839671",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 2 18:05:46 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We provide an analysis of thread-management techniques
that increase performance or reduce energy in multicore
and Simultaneous Multithreaded (SMT) cores. Thread
delaying reduces energy consumption by running the core
containing the critical thread at maximum frequency
while scaling down the frequency and voltage of the
cores containing noncritical threads. In this article,
we provide an insightful breakdown of thread delaying
on a simulated multi-core microprocessor. Thread
balancing improves overall performance by giving higher
priority to the critical thread in the issue queue of
an SMT core. We provide a detailed breakdown of
performance results for thread-balancing, identifying
performance benefits and limitations. For those
benchmarks where a performance benefit is not possible,
we introduce a novel thread-balancing mechanism on an
SMT core that can reduce energy consumption. We have
performed a detailed study on an Intel microprocessor
simulator running parallel applications. Thread
delaying can reduce energy consumption by 4\% to 44\%
with negligible performance loss. Thread balancing can
increase performance by 20\% or can reduce energy
consumption by 23\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "critical threads; energy-aware; low-power; Meeting
point thread characterization; microarchitecture;
multi-threaded application; thread balancing; thread
delaying",
}
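
A minimal sketch of the thread-delaying idea, assuming threads meet at a
barrier and per-core DVFS is free (numbers and names invented; the paper
models realistic frequency/voltage levels and overheads):

    # Slow each non-critical core so its thread reaches the barrier just
    # as the critical thread does, saving energy without stretching the
    # critical path.
    def delayed_frequencies(work, f_max):
        # work[i]: remaining work of thread i; the largest is critical
        critical_time = max(work) / f_max
        return [w / critical_time for w in work]

    print(delayed_frequencies([8.0, 4.0, 2.0], f_max=2.0))  # [2.0, 1.0, 0.5]
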
@Article{Pao:2010:MEP,
author = "Derek Pao and Wei Lin and Bin Liu",
title = "A memory-efficient pipelined implementation of the
{Aho--Corasick} string-matching algorithm",
journal = j-TACO,
volume = "7",
number = "2",
pages = "10:1--10:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1839667.1839672",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 2 18:05:46 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With rapid advancement in Internet technology and
usages, some emerging applications in data
communications and network security require matching of
huge volume of data against large signature sets with
thousands of strings in real time. In this article, we
present a memory-efficient hardware implementation of
the well-known Aho--Corasick (AC) string-matching
algorithm using a pipelining approach called P-AC. An
attractive feature of the AC algorithm is that it can
solve the string-matching problem in time linearly
proportional to the length of the input stream, and the
computation time is independent of the number of
strings in the signature set. A major disadvantage of
the AC algorithm is the high memory cost required to
store the transition rules of the underlying
deterministic finite automaton. By incorporating
pipelined processing, the state graph is reduced to a
character trie that only contains forward edges.
Together with an intelligent implementation of look-up
tables, the memory cost of P-AC is only about 18 bits
per character for a signature set containing 6,166
strings extracted from Snort. The control structure of
P-AC is simple and elegant. The cost of the control
logic is very low. With the availability of dual-port
memories in FPGA devices, we can double the system
throughput by duplicating the control logic such that
the system can process two data streams concurrently.
Since our method is memory-based, incremental changes
to the signature set can be accommodated by updating
the look-up tables without reconfiguring the FPGA
circuitry.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "deterministic and nondeterministic finite automaton;
intrusion detection system; pipelined processing;
String-matching",
}
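
A minimal sketch of the forward-edge-only character trie that P-AC keeps
once pipelining removes the AC failure transitions (Python for
illustration; the paper's engine stores this in hardware look-up
tables):

    # Build a character trie for a signature set: each node maps a
    # character to a child node id; only forward edges are stored,
    # unlike the full AC automaton with its failure/cross transitions.
    def build_trie(patterns):
        trie = [{}]    # node 0 is the root
        outputs = {}   # node id -> pattern recognized at that node
        for pat in patterns:
            node = 0
            for ch in pat:
                if ch not in trie[node]:
                    trie[node][ch] = len(trie)
                    trie.append({})
                node = trie[node][ch]
            outputs[node] = pat
        return trie, outputs

    trie, outputs = build_trie(["he", "she", "his"])
    # 8 trie nodes connected by only 7 forward edges in this toy set
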
@Article{Yang:2010:ERS,
author = "Xuejun Yang and Ying Zhang and Xicheng Lu and Jingling
Xue and Ian Rogers and Gen Li and Guibin Wang and
Xudong Fang",
title = "Exploiting the reuse supplied by loop-dependent stream
references for stream processors",
journal = j-TACO,
volume = "7",
number = "2",
pages = "11:1--11:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1839667.1839673",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 2 18:05:46 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Memory accesses limit the performance of stream
processors. By exploiting the reuse of data held in the
Stream Register File (SRF), an on-chip, software
controlled storage, the number of memory accesses can
be reduced. In current stream compilers, reuse
exploitation is only attempted for simple stream
references, those whose start and end are known.
Compiler analysis, from outside of stream processors,
does not directly enable the consideration of other
more complex stream references. In this article, we
propose a transformation to automatically optimize
stream programs to exploit the reuse supplied by
loop-dependent stream references. The transformation is
based on three results: lemmas identifying the reuse
supplied by stream references, a new abstract
representation called the Stream Reuse Graph (SRG)
depicting the identified reuse, and the optimization of
the SRG for our transformation. Both the reuse between
whole sequences accessed by stream references and the
reuse between partial sequences are exploited in the
article.
In particular, partial reuse and its treatment are
quite new and have never, to the best of our knowledge,
appeared in scalar and vector processing. At the same
time, reusing streams increases the pressure on the
SRF, and this presents a problem of which reuse should
be exploited within limited SRF capacity. We extend our
analysis to achieve this objective. Finally, we
implement our techniques based on the StreamC/KernelC
compiler that has been optimized with the best existing
compilation techniques for stream processors.
Experimental results show a resultant speed-up of 1.14
to 2.54 times using a range of benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "stream professor; Stream programming model; stream
register file; stream reuse; streamc",
}
@Article{Reddi:2010:EVE,
author = "Vijay Janapa Reddi and Simone Campanoni and Meeta S.
Gupta and Michael D. Smith and Gu-Yeon Wei and David
Brooks and Kim Hazelwood",
title = "Eliminating voltage emergencies via software-guided
code transformations",
journal = j-TACO,
volume = "7",
number = "2",
pages = "12:1--12:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1839667.1839674",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 2 18:05:46 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In recent years, circuit reliability in modern
high-performance processors has become increasingly
important. Shrinking feature sizes and diminishing
supply voltages have made circuits more sensitive to
microprocessor supply voltage fluctuations. These
fluctuations result from the natural variation of
processor activity as workloads execute, but when left
unattended, these voltage fluctuations can lead to
timing violations or even transistor lifetime issues.
In this article, we present a hardware--software
collaborative approach to mitigate voltage
fluctuations. A checkpoint-recovery mechanism rectifies
errors when voltage violates maximum tolerance
settings, while a runtime software layer reschedules
the program's instruction stream to prevent recurring
violations at the same program location. The runtime
layer, combined with the proposed code-rescheduling
algorithm, removes 60\% of all violations with minimal
overhead, thereby significantly improving overall
performance. Our solution is a radical departure from
the ongoing industry-standard approach to circumvent
the issue altogether by optimizing for the worst-case
voltage flux, which compromises power and performance
efficiency severely, especially looking ahead to future
technology generations. Existing conservative
approaches will have severe implications on the ability
to deliver efficient microprocessors. The proposed
technique recasts a traditional reliability problem
as a runtime performance optimization problem, thus
allowing us to design processors for typical case
operation by building intelligent algorithms that can
prevent recurring violations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "dI/dt; inductive noise; voltage emergencies; Voltage
noise",
}
@Article{Zhao:2010:PPP,
author = "Qin Zhao and Ioana Cutcutache and Weng-Fai Wong",
title = "{PiPA}: {Pipelined} profiling and analysis on
multicore systems",
journal = j-TACO,
volume = "7",
number = "3",
pages = "13:1--13:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1880037.1880038",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 10 09:37:16 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Profiling and online analysis are important tasks in
program understanding and feedback-directed
optimization. However, fine-grained profiling and
online analysis tend to seriously slow down the
application. To cope with the slowdown, one may have to
terminate the process early or resort to sampling. The
former tends to distort the result because of warm-up
effects. The latter runs the risk of missing important
effects because sampling was turned off during the time
that these effects appeared. A promising approach is to
make use of the parallel processing capabilities of the
now ubiquitous multicore processors to speed up the
profiling and analysis process.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Guo:2010:QSS,
author = "Fei Guo and Yan Solihin and Li Zhao and Ravishankar
Iyer",
title = "Quality of service shared cache management in chip
multiprocessor architecture",
journal = j-TACO,
volume = "7",
number = "3",
pages = "14:1--14:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1880037.1880039",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 10 09:37:16 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The trends in enterprise IT toward service-oriented
computing, server consolidation, and virtual computing
point to a future in which workloads are becoming
increasingly diverse in terms of performance,
reliability, and availability requirements. It can be
expected that more and more applications with diverse
requirements will run on a Chip Multi-Processor (CMP)
and share platform resources such as the lowest level
cache and off-chip bandwidth. In this environment, it
is desirable to have microarchitecture and software
support that can provide a guarantee of a certain level
of performance, which we refer to as performance
Quality of Service. In this article, we investigate the
framework needed to manage the shared cache resource to
fully provide QoS in a CMP.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wu:2010:DEH,
author = "Xiaoxia Wu and Jian Li and Lixin Zhang and Evan
Speight and Ram Rajamony and Yuan Xie",
title = "Design exploration of hybrid caches with disparate
memory technologies",
journal = j-TACO,
volume = "7",
number = "3",
pages = "15:1--15:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1880037.1880040",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 10 09:37:16 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Traditional multilevel SRAM-based cache hierarchies,
especially in the context of chip multiprocessors
(CMPs), present many challenges in area requirements,
core-to-cache balance, power consumption, and design
complexity. New advancements in technology enable
caches to be built from other technologies, such as
Embedded DRAM (EDRAM), Magnetic RAM (MRAM), and
Phase-change RAM (PRAM), in both 2D and 3D stacked
chips. Caches fabricated in these technologies offer
dramatically different power-performance
characteristics when compared with SRAM-based caches,
particularly in the areas of access latency, cell
density, and overall power consumption. In this
article, we propose to take advantage of the best
characteristics that each technology has to offer
through the use of Hybrid Cache Architecture (HCA)
designs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kourtis:2010:ECO,
author = "Kornilios Kourtis and Georgios Goumas and Nectarios
Koziris",
title = "Exploiting compression opportunities to improve
{SpMxV} performance on shared memory systems",
journal = j-TACO,
volume = "7",
number = "3",
pages = "16:1--16:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1880037.1880041",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 10 09:37:16 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The Sparse Matrix-Vector Multiplication (SpMxV) kernel
exhibits poor scaling on shared memory systems, due to
the streaming nature of its data access pattern. To
decrease memory contention and improve kernel
performance we propose two compression schemes: CSR-DU,
that targets the reduction of the matrix structural
data by applying coarse-grained delta-encoding, and
CSR-VI, that targets the reduction of the values using
indirect indexing, applicable to matrices with a small
number of unique values.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
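
A minimal sketch of the CSR-VI value-indirection idea, assuming a matrix
with few distinct values (toy data; the paper operates on full CSR
structures and also delta-encodes the index arrays in CSR-DU):

    # When a sparse matrix has few unique values, store a small table of
    # unique values plus per-element indices into it, shrinking the data
    # streamed from memory during SpMxV.
    values = [0.5, 1.0, 0.5, 0.5, 1.0, 2.0]      # CSR values array
    unique = sorted(set(values))                  # value table
    index_of = {v: i for i, v in enumerate(unique)}
    val_idx = [index_of[v] for v in values]       # [0, 1, 0, 0, 1, 2]
    # each value load becomes unique[val_idx[k]]; val_idx entries need
    # only ceil(log2(len(unique))) bits instead of a 64-bit double
    assert [unique[i] for i in val_idx] == values
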
@Article{Buyukkurt:2010:IHL,
author = "Betul Buyukkurt and John Cortes and Jason Villarreal
and Walid A. Najjar",
title = "Impact of high-level transformations within the
{ROCCC} framework",
journal = j-TACO,
volume = "7",
number = "4",
pages = "17:1--17:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1880043.1880044",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 10 09:37:16 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hwang:2010:DCR,
author = "Yuan-Shin Hwang and Tzong-Yen Lin and Rong-Guey
Chang",
title = "{DisIRer}: {Converting} a retargetable compiler into a
multiplatform binary translator",
journal = j-TACO,
volume = "7",
number = "4",
pages = "18:1--18:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1880043.1880045",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 10 09:37:16 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Boyer:2010:FBP,
author = "Michael Boyer and David Tarjan and Kevin Skadron",
title = "Federation: {Boosting} per-thread performance of
throughput-oriented manycore architectures",
journal = j-TACO,
volume = "7",
number = "4",
pages = "19:1--19:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1880043.1880046",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 10 09:37:16 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fursin:2010:COP,
author = "Grigori Fursin and Olivier Temam",
title = "Collective optimization: a practical collaborative
approach",
journal = j-TACO,
volume = "7",
number = "4",
pages = "20:1--20:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1880043.1880047",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 10 09:37:16 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Liu:2010:UBI,
author = "Fang Liu and Yan Solihin",
title = "Understanding the behavior and implications of context
switch misses",
journal = j-TACO,
volume = "7",
number = "4",
pages = "21:1--21:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1880043.1880048",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 10 09:37:16 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Eyerman:2011:FGD,
author = "Stijn Eyerman and Lieven Eeckhout",
title = "Fine-grained {DVFS} using on-chip regulators",
journal = j-TACO,
volume = "8",
number = "1",
pages = "1:1--1:??",
month = apr,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1952998.1952999",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Apr 27 07:54:03 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Limit studies on Dynamic Voltage and Frequency Scaling
(DVFS) provide apparently contradictory conclusions. On
the one hand early limit studies report that DVFS is
effective at large timescales (on the order of
million(s) of cycles) with large scaling overheads (on
the order of tens of microseconds), and they conclude
that there is no need for small overhead DVFS at small
timescales. Recent work on the other hand --- motivated
by the surge of on-chip voltage regulator research ---
explores the potential of fine-grained DVFS and reports
substantial energy savings at timescales of hundreds of
cycles (while assuming no scaling overhead). This
article unifies these apparently contradictory
conclusions through a DVFS limit study that
simultaneously explores timescale and scaling speed.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
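
As a rough sketch of why the timescale/overhead trade-off above exists,
using standard first-order CMOS relations (textbook scaling, not taken
from the article): dynamic power and the execution time of an
$N$-instruction phase obey

    $$ P_{\mathrm{dyn}} \propto C\,V^{2} f, \qquad
       t_{\mathrm{exec}} \propto N/f
       \quad\Longrightarrow\quad
       E = P_{\mathrm{dyn}}\,t_{\mathrm{exec}} \propto C\,V^{2} N, $$

so scaling $V$ and $f$ together cuts energy roughly quadratically in
voltage. Fine-grained DVFS tries to apply this during short memory-bound
phases, where lowering $f$ barely stretches $t_{\mathrm{exec}}$; whether
it pays off depends on the regulator's scaling overhead relative to the
phase length, precisely the (timescale, scaling speed) space the limit
study sweeps.
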
@Article{Cher:2011:EEC,
author = "Chen-Yong Cher and Eren Kursun",
title = "Exploring the effects of on-chip thermal variation on
high-performance multicore architectures",
journal = j-TACO,
volume = "8",
number = "1",
pages = "2:1--2:??",
month = apr,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1952998.1953000",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Apr 27 07:54:03 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Inherent temperature variation among cores in a
multicore architecture can be caused by a number of
factors including process variation, cooling and
packaging imperfections, and even placement of the chip
in the module. Current dynamic thermal management
techniques assume identical heating profiles for
homogeneous multicore architectures. Our experimental
results indicate that inherent thermal variation is
very common in existing multicores. While most
multicore chips accommodate multiple thermal sensors,
the dynamic power/thermal management schemes are
oblivious of the inherent heating tendencies. Hence, in
the presence of variation, the chip faces recurring
hotspots on such cores. In this article, we
propose a technique that leverages the on-chip sensor
infrastructure as well as the capabilities of
power/thermal management to effectively reduce the
heating and minimize local hotspots.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wu:2011:ATR,
author = "Carole-Jean Wu and Margaret Martonosi",
title = "Adaptive timekeeping replacement: Fine-grained
capacity management for shared {CMP} caches",
journal = j-TACO,
volume = "8",
number = "1",
pages = "3:1--3:??",
month = apr,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1952998.1953001",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Apr 27 07:54:03 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In chip multiprocessors (CMPs), several
high-performance cores typically compete for capacity
in a shared last-level cache. This causes degraded and
unpredictable memory performance for multiprogrammed
and parallel workloads. In response, recent schemes
apportion cache bandwidth and capacity in ways that
offer better aggregate performance for the workloads.
These schemes, however, focus primarily on relatively
coarse-grained capacity management without concern for
operating system process priority levels. In this work,
we explore capacity management approaches that are both
temporally and spatially more fine-grained than prior
work. We also consider operating system priority levels
as part of capacity management. We propose a capacity
management mechanism based on timekeeping techniques
that track the time interval since the last access to
cached data.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
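
A minimal sketch of a timekeeping victim-selection test, assuming a
global access counter and a fixed idle threshold (names invented; the
article's thresholds adapt and also weigh OS priority levels):

    # Track the time since each line's last access; a line idle beyond
    # the threshold is likely dead and is evicted first, falling back to
    # plain LRU otherwise.
    import itertools
    clock = itertools.count()   # coarse global time in access units

    class Line:
        def __init__(self, tag):
            self.tag = tag
            self.last_access = next(clock)

    def touch(line):
        line.last_access = next(clock)

    def victim(set_lines, now, idle_threshold):
        for ln in set_lines:
            if now - ln.last_access > idle_threshold:
                return ln                # idle interval exceeded: evict
        return min(set_lines, key=lambda ln: ln.last_access)  # LRU

    lines = [Line("A"), Line("B")]
    touch(lines[1])
    print(victim(lines, next(clock), idle_threshold=1).tag)  # "A"
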
@Article{Vespa:2011:DFA,
author = "Lucas Vespa and Ning Weng",
title = "Deterministic finite automata characterization and
optimization for scalable pattern matching",
journal = j-TACO,
volume = "8",
number = "1",
pages = "4:1--4:??",
month = apr,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1952998.1953002",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Apr 27 07:54:03 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Memory-based Deterministic Finite Automata (DFA) are
ideal for pattern matching in network intrusion
detection systems due to their deterministic
performance and ease of updating new patterns; however,
severe DFA memory requirements make it impractical to
implement thousands of patterns. This article aims to
understand the basic relationship between DFA
characteristics and memory requirements, and to design
a practical memory-based pattern matching engine. We
present a methodology that consists of theoretical DFA
characterization, encoding optimization, and
implementation architecture. Results show the validity
of the characterization metrics, effectiveness of the
encoding techniques, and efficiency of the memory-based
pattern engines.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
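
A quick worked number shows why uncompressed DFA transition tables are
impractical at thousands of patterns (illustrative figures, not from the
article):

    # A full transition table stores one next-state entry per state per
    # input byte; even modest pattern sets produce tens of thousands of
    # states, so encoding optimizations are essential.
    states, alphabet, bytes_per_entry = 20000, 256, 3
    table_bytes = states * alphabet * bytes_per_entry
    print(f"{table_bytes / 2**20:.0f} MiB")   # ~15 MiB for 20k states
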
@Article{Bhattacharjee:2011:PLC,
author = "Abhishek Bhattacharjee and Gilberto Contreras and
Margaret Martonosi",
title = "Parallelization libraries: Characterizing and reducing
overheads",
journal = j-TACO,
volume = "8",
number = "1",
pages = "5:1--5:??",
month = apr,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1952998.1953003",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Apr 27 07:54:03 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Creating efficient, scalable dynamic parallel runtime
systems for chip multiprocessors (CMPs) requires
understanding the overheads that manifest at high core
counts and small task sizes. In this article, we assess
these overheads on Intel's Threading Building Blocks
(TBB) and OpenMP. First, we use real hardware and
simulations to detail various scheduler and
synchronization overheads. We find that these can
amount to 47\% of TBB benchmark runtime and 80\% of
OpenMP benchmark runtime. Second, we propose load
balancing techniques such as occupancy-based and
criticality-guided task stealing, to boost performance.
Overall, our study provides valuable insights for
creating robust, scalable runtime libraries.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
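
A minimal sketch of occupancy-based task stealing, one of the
load-balancing ideas named above (structure and names invented for
illustration; TBB's real scheduler uses per-thread work-stealing
deques):

    # An idle worker steals from the deque that currently holds the most
    # tasks rather than from a random victim, cutting down failed steal
    # attempts at high core counts.
    import collections

    deques = [collections.deque() for _ in range(4)]

    def steal(thief):
        victims = [i for i in range(len(deques)) if i != thief and deques[i]]
        if not victims:
            return None
        victim = max(victims, key=lambda i: len(deques[i]))  # occupancy
        return deques[victim].popleft()       # steal the oldest task

    deques[2].extend(["t0", "t1", "t2"])
    assert steal(0) == "t0"
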
@Article{Dong:2011:HCU,
author = "Xiangyu Dong and Yuan Xie and Naveen Muralimanohar and
Norman P. Jouppi",
title = "Hybrid checkpointing using emerging nonvolatile
memories for future exascale systems",
journal = j-TACO,
volume = "8",
number = "2",
pages = "6:1--6:??",
month = jul,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1970386.1970387",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jun 17 18:32:40 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The scalability of future Massively Parallel
Processing (MPP) systems is being severely challenged
by high failure rates. Current centralized Hard Disk
Drive (HDD) checkpointing results in overhead of 25\%
or more at petascale. Since systems become more
vulnerable as the node count keeps increasing, novel
techniques that enable fast and frequent checkpointing
are critical to the future exascale system
implementation. In this work, we first introduce one of
the emerging nonvolatile memory technologies,
Phase-Change Random Access Memory (PCRAM), as a proper
candidate of the fast checkpointing device. After a
thorough analysis of MPP systems, failure rates and
failure sources, we propose a PCRAM-based hybrid
local/global checkpointing mechanism which not only
provides a faster checkpoint storage, but also boosts
the effectiveness of other orthogonal techniques such
as incremental checkpointing and background
checkpointing.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2011:EEM,
author = "Jianjun Li and Chenggang Wu and Wei-Chung Hsu",
title = "Efficient and effective misaligned data access
handling in a dynamic binary translation system",
journal = j-TACO,
volume = "8",
number = "2",
pages = "7:1--7:??",
month = jul,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1970386.1970388",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jun 17 18:32:40 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Binary Translation (BT) has been commonly used to
migrate application software across Instruction Set
Architectures (ISAs). Some architectures, such as X86,
allow Misaligned Data Accesses (MDAs), while most
modern architectures require natural data alignments.
In a binary translation system, where the source ISA
allows MDA and the target ISA does not, memory
operations must be carefully translated. Naive
translation may cause frequent misaligned data access
traps to occur at runtime on the target machine and
severely slow down the migrated application. This
article evaluates different approaches in handling MDA
in a binary translation system including how to
identify MDA candidates and how to translate such
memory instructions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
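
A minimal sketch of one standard translation for a misaligned 32-bit
load on an alignment-strict target: two aligned loads plus shifts
(Python model of a little-endian memory; the article evaluates several
such strategies and how to pick candidates):

    MEM = bytes(range(16))   # toy little-endian memory

    def aligned_load32(addr):
        assert addr % 4 == 0
        return int.from_bytes(MEM[addr:addr + 4], "little")

    def misaligned_load32(addr):
        # bracket the address with two aligned loads and splice bytes,
        # avoiding the misaligned-access trap on the target machine
        base, off = addr & ~3, addr & 3
        if off == 0:
            return aligned_load32(base)
        lo = aligned_load32(base) >> (8 * off)
        hi = aligned_load32(base + 4) << (8 * (4 - off))
        return (lo | hi) & 0xFFFFFFFF

    assert misaligned_load32(3) == int.from_bytes(MEM[3:7], "little")
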
@Article{Venkataramani:2011:DDS,
author = "Guru Venkataramani and Christopher J. Hughes and
Sanjeev Kumar and Milos Prvulovic",
title = "{DeFT}: Design space exploration for on-the-fly
detection of coherence misses",
journal = j-TACO,
volume = "8",
number = "2",
pages = "8:1--8:??",
month = jul,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1970386.1970389",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jun 17 18:32:40 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "While multicore processors promise large performance
benefits for parallel applications, writing these
applications is notoriously difficult. Tuning a
parallel application to achieve good performance, also
known as performance debugging, is often more
challenging than debugging the application for
correctness. Parallel programs have many
performance-related issues that are not seen in
sequential programs. An increase in cache misses is one
of the biggest challenges that programmers face. To
minimize these misses, programmers must not only
identify the source of the extra misses, but also
perform the tricky task of determining if the misses
are caused by interthread communication (i.e.,
coherence misses) and if so, whether they are caused by
true or false sharing (since the solutions for these
two are quite different).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hiser:2011:EIB,
author = "Jason D. Hiser and Daniel W. Williams and Wei Hu and
Jack W. Davidson and Jason Mars and Bruce R. Childers",
title = "Evaluating indirect branch handling mechanisms in
software dynamic translation systems",
journal = j-TACO,
volume = "8",
number = "2",
pages = "9:1--9:??",
month = jul,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1970386.1970390",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jun 17 18:32:40 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Software Dynamic Translation (SDT) is used for
instrumentation, optimization, security, and many other
uses. A major source of SDT overhead is the execution
of code to translate an indirect branch's target
address into the translated destination block's
address. This article discusses sources of Indirect
Branch (IB) overhead in SDT systems and evaluates
techniques for overhead reduction. Measurements using
SPEC CPU2000 show that the appropriate choice and
configuration of IB translation mechanisms can
significantly reduce the overhead. Further,
cross-architecture evaluation of these mechanisms
reveals that the most efficient implementation and
configuration can be highly dependent on the
architecture implementation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
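
A minimal sketch of an indirect-branch target cache, one common
IB-handling mechanism in SDT systems (table shape and names invented for
illustration):

    # Hash the source target address into a small table mapping it to
    # the translated block; fall back to the slow translator on a miss.
    IBTC_SIZE = 256
    ibtc = [None] * IBTC_SIZE   # entries: (source_addr, translated_block)

    def translate(source_addr):          # slow path: run the translator
        return "tblock_" + hex(source_addr)

    def indirect_branch(source_addr):
        slot = hash(source_addr) % IBTC_SIZE
        entry = ibtc[slot]
        if entry is not None and entry[0] == source_addr:
            return entry[1]              # fast path: target-cache hit
        block = translate(source_addr)   # miss: translate, fill entry
        ibtc[slot] = (source_addr, block)
        return block

    indirect_branch(0x400123)            # cold miss fills the table
    assert indirect_branch(0x400123) == "tblock_0x400123"
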
@Article{Chen:2011:HAM,
author = "Xi E. Chen and Tor M. Aamodt",
title = "Hybrid analytical modeling of pending cache hits, data
prefetching, and {MSHRs}",
journal = j-TACO,
volume = "8",
number = "3",
pages = "10:1--10:??",
month = oct,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2019608.2019609",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 22 09:15:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article proposes techniques to predict the
performance impact of pending cache hits, hardware
prefetching, and miss status holding register resources
on superscalar microprocessors using hybrid analytical
models. The proposed models focus on timeliness of
pending hits and prefetches and account for a limited
number of MSHRs. They improve modeling accuracy of
pending hits by 3.9{\times} and when modeling data
prefetching, a limited number of MSHRs, or both, these
techniques result in average errors of 9.5\% to 17.8\%.
The impact of non-uniform DRAM memory latency is shown
to be approximated well by using a moving average of
memory access latency.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kleanthous:2011:CMD,
author = "Marios Kleanthous and Yiannakis Sazeides",
title = "{CATCH}: a mechanism for dynamically detecting
cache-content-duplication in instruction caches",
journal = j-TACO,
volume = "8",
number = "3",
pages = "11:1--11:??",
month = oct,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2019608.2019610",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 22 09:15:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Cache-content-duplication (CCD) occurs when there is a
miss for a block in a cache and the entire content of
the missed block is already in the cache in a block
with a different tag. Caches aware of
content-duplication can have lower miss penalty by
fetching, on a miss to a duplicate block, directly from
the cache instead of accessing lower in the memory
hierarchy, and can have lower miss rates by allowing
only blocks with unique content to enter a cache. This
work examines the potential of CCD for instruction
caches. We show that CCD is a frequent phenomenon and
that an idealized duplication-detection mechanism for
instruction caches has the potential to increase
performance of an out-of-order processor, with a 16KB,
8-way, 8 instructions per block instruction cache,
often by more than 10\% and up to 36\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
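
A minimal sketch of detecting content duplication with a digest side
table (illustrative only; eviction handling is omitted and CATCH's
hardware mechanism differs in the details):

    # At fill time, hash the block's contents; if the digest is already
    # resident under another tag, record the alias instead of storing a
    # duplicate, so a later miss to the alias hits in-cache.
    import hashlib

    cache = {}          # tag -> contents of blocks with unique content
    alias = {}          # tag -> tag of an already-resident duplicate
    digest_to_tag = {}  # content digest -> resident tag

    def fill(tag, content):
        d = hashlib.sha1(content).hexdigest()
        if d in digest_to_tag:
            alias[tag] = digest_to_tag[d]   # duplication detected
        else:
            cache[tag] = content
            digest_to_tag[d] = tag

    def lookup(tag):
        if tag in cache:
            return cache[tag]           # ordinary hit
        if tag in alias:
            return cache[alias[tag]]    # duplicate hit: no memory access
        return None                     # true miss: fetch, then fill

    fill(0x100, b"same bytes")
    fill(0x200, b"same bytes")          # detected duplicate of 0x100
    assert lookup(0x200) == b"same bytes"
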
@Article{Vandierendonck:2011:MSR,
author = "Hans Vandierendonck and Andr{\'e} Seznec",
title = "Managing {SMT} resource usage through speculative
instruction window weighting",
journal = j-TACO,
volume = "8",
number = "3",
pages = "12:1--12:??",
month = oct,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2019608.2019611",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 22 09:15:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Simultaneous multithreading processors dynamically
share processor resources between multiple threads. In
general, shared SMT resources may be managed
explicitly, for instance, by dynamically setting queue
occupation bounds for each thread as in the DCRA and
Hill-Climbing policies. Alternatively, resources may be
managed implicitly; that is, resource usage is
controlled by placing the desired instruction mix in
the resources. In this case, the main resource
management tool is the instruction fetch policy which
must predict the behavior of each thread (branch
mispredictions, long-latency loads, etc.) as it fetches
instructions. In this article, we present the use of
Speculative Instruction Window Weighting (SIWW) to
bridge the gap between implicit and explicit SMT fetch
policies.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2011:PGS,
author = "Po-Han Wang and Chia-Lin Yang and Yen-Ming Chen and
Yu-Jung Cheng",
title = "Power gating strategies on {GPUs}",
journal = j-TACO,
volume = "8",
number = "3",
pages = "13:1--13:??",
month = oct,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2019608.2019612",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 22 09:15:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As technology continues to shrink, reducing leakage is
critical to achieving energy efficiency. Previous
studies on low-power GPUs (Graphics Processing Units)
focused on techniques for dynamic power reduction, such
as DVFS (Dynamic Voltage and Frequency Scaling) and
clock gating. In this paper, we explore the potential
of adopting architecture-level power gating techniques
for leakage reduction on GPUs. We propose three
strategies for applying power gating on different
modules in GPUs. The Predictive Shader Shutdown
technique exploits workload variation across frames to
eliminate leakage in shader clusters. Deferred Geometry
Pipeline seeks to minimize leakage in fixed-function
geometry units by utilizing an imbalance between
geometry and fragment computation across batches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Feng:2011:DAD,
author = "Min Feng and Chen Tian and Changhui Lin and Rajiv
Gupta",
title = "Dynamic access distance driven cache replacement",
journal = j-TACO,
volume = "8",
number = "3",
pages = "14:1--14:??",
month = oct,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2019608.2019613",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 22 09:15:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this article, we propose a new cache replacement
policy that makes the replacement decision based on the
reuse information of the cache lines and the requested
data. We present the architectural support and evaluate
the performance of our approach using SPEC benchmarks.
We also develop two reuse information predictors: a
profile-based static predictor and a runtime predictor.
The applicability of each predictor is discussed in
this paper. We further extend our reuse information
predictors so that the cache can adaptively choose
between the reuse information based replacement policy
and an approximation of LRU policy. According to the
experimental results, our adaptive reuse information
based replacement policy performs either better than or
close to the LRU policy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
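
A minimal sketch of reuse-information-driven victim selection, assuming
a predictor that supplies next-reuse times (names invented; the paper's
predictors are profile-based or run-time, with an adaptive LRU
fallback):

    # Evict the line whose predicted next reuse lies farthest in the
    # future, approximating Belady's OPT; bypass the cache entirely if
    # the incoming block is reused later than every cached line.
    def choose_victim(lines, predicted_reuse, incoming):
        def next_use(addr):
            return predicted_reuse.get(addr, float("inf"))
        candidate = max(lines, key=next_use)
        if next_use(incoming) > next_use(candidate):
            return incoming              # not worth caching at all
        return candidate

    lines = ["A", "B", "C"]
    pred = {"A": 12, "B": 50, "C": 20, "D": 30}
    print(choose_victim(lines, pred, "D"))   # evicts "B" (reused last)
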
@Article{Samih:2011:EPP,
author = "Ahmad Samih and Yan Solihin and Anil Krishna",
title = "Evaluating placement policies for managing capacity
sharing in {CMP} architectures with private caches",
journal = j-TACO,
volume = "8",
number = "3",
pages = "15:1--15:??",
month = oct,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2019608.2019614",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 22 09:15:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Chip Multiprocessors (CMP) with distributed L2 caches
suffer from a cache fragmentation problem; some caches
may be overutilized while others may be underutilized.
To avoid such fragmentation, researchers have proposed
capacity sharing mechanisms where applications that
need additional cache space can place their victim
blocks in remote caches. However, we found that only
allowing victim blocks to be placed on remote caches
tends to cause a high number of remote cache hits
relative to local cache hits. In this article, we show
that many of the remote cache hits can be converted
into local cache hits if we allow newly fetched blocks
to be selectively placed directly in a remote cache,
rather than in the local cache.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yeh:2011:MPP,
author = "Chang-Ching Yeh and Kuei-Chung Chang and Tien-Fu Chen
and Chingwei Yeh",
title = "Maintaining performance on power gating of
microprocessor functional units by using a predictive
pre-wakeup strategy",
journal = j-TACO,
volume = "8",
number = "3",
pages = "16:1--16:??",
month = oct,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2019608.2019615",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 22 09:15:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Power gating is an effective technique for reducing
leakage power in deep submicron CMOS technology.
Microarchitectural techniques for power gating of
functional units have been developed by detecting
suitable idle regions and turning them off to reduce
leakage energy consumption; however, wakeup of
functional units is needed when instructions are ready
for execution such that wakeup overhead is naturally
incurred. This study presents time-based power gating
with reference pre-wakeup (PGRP), a novel predictive
strategy that detects suitable idle periods for power
gating and then enables pre-wakeup of needed functional
units for avoiding wakeup overhead. The key insight is
that most wakeups are repeated due to program
locality.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lee:2011:DDE,
author = "Hyunjin Lee and Sangyeun Cho and Bruce R. Childers",
title = "{DEFCAM}: a design and evaluation framework for
defect-tolerant cache memories",
journal = j-TACO,
volume = "8",
number = "3",
pages = "17:1--17:??",
month = oct,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2019608.2019616",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 22 09:15:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Advances in deep submicron technology call for a
careful review of existing cache designs and design
practices in terms of yield, area, and performance.
This article presents a Design and Evaluation Framework
for defect-tolerant Cache Memories (DEFCAM), which
enables processor architects to consider yield, area,
and performance together in a unified framework. Since
there is a complex, changing trade-off among these
metrics depending on the technology, the cache
organization, and the yield enhancement scheme
employed, such a design flow is invaluable to processor
architects when they assess a design and explore the
design space quickly at an early stage. We develop a
complete framework supporting the proposed DEFCAM
design flow, from injecting defects into a wafer to
evaluating program performance of individual processors
on the wafer.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Stenstrom:2012:ISI,
author = "Per Stenstr{\"o}m and Koen {De Bosschere}",
title = "Introduction to the special issue on high-performance
and embedded architectures and compilers",
journal = j-TACO,
volume = "8",
number = "4",
pages = "18:1--18:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086697",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Albericio:2012:ALC,
author = "Jorge Albericio and Rub{\'e}n Gran and Pablo
Ib{\'a}{\~n}ez and V{\'\i}ctor Vi{\~n}als and Jose
Mar{\'\i}a Llaber{\'\i}a",
title = "{ABS}: a low-cost adaptive controller for prefetching
in a banked shared last-level cache",
journal = j-TACO,
volume = "8",
number = "4",
pages = "19:1--19:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086698",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Hardware data prefetch is a very well known technique
for hiding memory latencies. However, in a multicore
system fitted with a shared Last-Level Cache (LLC),
prefetch induced by a core consumes common resources
such as shared cache space and main memory bandwidth.
This may degrade the performance of other cores and
even the overall system performance unless the prefetch
aggressiveness of each core is controlled from a system
standpoint. On the other hand, LLCs in commercial chip
multiprocessors are more and more frequently organized
in independent banks. In this contribution, we target
for the first time prefetch in a banked LLC
organization and propose ABS, a low-cost controller
with a hill-climbing approach that runs stand-alone at
each LLC bank without requiring inter-bank
communication.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bayrak:2012:AII,
author = "Ali Galip Bayrak and Nikola Velickovic and Paolo Ienne
and Wayne Burleson",
title = "An architecture-independent instruction shuffler to
protect against side-channel attacks",
journal = j-TACO,
volume = "8",
number = "4",
pages = "20:1--20:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086699",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Embedded cryptographic systems, such as smart cards,
require secure implementations that are robust to a
variety of low-level attacks. Side-Channel Attacks
(SCA) exploit information such as power consumption,
electromagnetic radiation, and acoustic emissions
leaking from the device to uncover the secret
information. Attackers can mount successful attacks
with very modest resources in a short time period.
Therefore, many methods have been proposed to increase
the security against SCA. Randomizing the execution
order of the instructions that are independent, i.e.,
random shuffling, is one of the most popular among
them. Implementing instruction shuffling in software is
either implementation specific or has a significant
performance or code size overhead.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
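
A minimal sketch of random shuffling over independent operations, using
an S-box substitution layer as the classic example (illustrative; the
paper's shuffler works at the instruction level and is
architecture-independent):

    # Operations with no data dependences among them can execute in any
    # order, so randomizing the order each run decorrelates the power
    # trace from any fixed instruction position.
    import random

    def shuffled_sbox_layer(state, sbox):
        order = list(range(len(state)))
        random.shuffle(order)        # fresh random order every execution
        out = [0] * len(state)
        for i in order:              # same result, randomized schedule
            out[i] = sbox[state[i]]
        return out

    sbox = list(reversed(range(256)))   # toy substitution table
    state = list(range(16))
    assert shuffled_sbox_layer(state, sbox) == [sbox[b] for b in state]
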
@Article{Demme:2012:AGC,
author = "John Demme and Simha Sethumadhavan",
title = "Approximate graph clustering for program
characterization",
journal = j-TACO,
volume = "8",
number = "4",
pages = "21:1--21:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086700",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "An important aspect of system optimization research is
the discovery of program traits or behaviors. In this
paper, we present an automated method of program
characterization which is able to examine and cluster
program graphs, i.e., dynamic data graphs or control
flow graphs. Our novel approximate graph clustering
technology allows users to find groups of program
fragments which contain similar code idioms or patterns
in data reuse, control flow, and context. Patterns of
this nature have several potential applications
including development of new static or dynamic
optimizations to be implemented in software or in
hardware. For the SPEC CPU 2006 suite of benchmarks,
our results show that approximate graph clustering is
effective at grouping behaviorally similar functions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pricopi:2012:BPH,
author = "Mihai Pricopi and Tulika Mitra",
title = "{Bahurupi}: a polymorphic heterogeneous multi-core
architecture",
journal = j-TACO,
volume = "8",
number = "4",
pages = "22:1--22:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086701",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Computing systems have made an irreversible transition
towards parallel architectures with the emergence of
multi-cores. Moreover, power and thermal limits in
embedded systems mandate the deployment of many simpler
cores rather than a few complex cores on chip. Consumer
electronic devices, on the other hand, need to support
an ever-changing set of diverse applications with
varying performance demands. While some applications
can benefit from thread-level parallelism offered by
multi-core solutions, there still exist a large number
of applications with substantial amount of sequential
code. The sequential programs suffer from limited
exploitation of instruction-level parallelism in simple
cores. We propose a reconfigurable multi-core
architecture, called Bahurupi, that can successfully
reconcile the conflicting demands of instruction-level
and thread-level parallelism.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "22",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cleemput:2012:CMT,
author = "Jeroen V. Cleemput and Bart Coppens and Bjorn {De
Sutter}",
title = "Compiler mitigations for time attacks on modern x86
processors",
journal = j-TACO,
volume = "8",
number = "4",
pages = "23:1--23:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086702",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This paper studies and evaluates the extent to which
automated compiler techniques can defend against
timing-based side channel attacks on modern x86
processors. We study how modern x86 processors can leak
timing information through side channels that relate to
data flow. We study the efficiency, effectiveness,
portability, predictability and sensitivity of several
mitigating code transformations that eliminate or
minimize key-dependent execution time variations.
Furthermore, we discuss the extent to which compiler
backends are a suitable tool to provide automated
support for the proposed mitigations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mccandless:2012:CTI,
author = "Jason Mccandless and David Gregg",
title = "Compiler techniques to improve dynamic branch
prediction for indirect jump and call instructions",
journal = j-TACO,
volume = "8",
number = "4",
pages = "24:1--24:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086703",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Indirect jump instructions are used to implement
multiway branch statements and virtual function calls
in object-oriented languages. Branch behavior can have a
significant impact on program performance, but
fortunately hardware predictors can alleviate much of
the risk. Modern processors include indirect branch
predictors which use part of the target address to
update a global history. We present a code generation
technique to maximize the branch history information
available to the predictor. We implement our
optimization as an assembly language transformation,
and evaluate it for SPEC benchmarks and interpreters
using simulated and real hardware, showing decreases in
indirect branch mispredictions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Garcia-Guirado:2012:DDA,
author = "Antonio Garc{\'\i}a-Guirado and Ricardo
Fern{\'a}ndez-Pascual and Alberto Ros and Jos{\'e} M.
Garc{\'\i}a",
title = "{DAPSCO}: Distance-aware partially shared cache
organization",
journal = j-TACO,
volume = "8",
number = "4",
pages = "25:1--25:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086704",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Many-core tiled CMP proposals often assume a partially
shared last level cache (LLC) since this provides a
good compromise between access latency and cache
utilization. In this paper, we propose a novel way to
map memory addresses to LLC banks that takes into
account the average distance between the banks and the
tiles that access them. Contrary to traditional
approaches, our mapping does not group the tiles in
clusters within which all the cores access the same
bank for the same addresses. Instead, two neighboring
cores access different sets of banks, minimizing the
average distance travelled by the cache requests.
Results for a 64-core CMP show that our proposal
improves both execution time and the energy consumed by
the network by 13\% when compared to a traditional
mapping.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "25",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2012:FSS,
author = "Zhenjiang Wang and Chenggang Wu and Pen-Chung Yew and
Jianjun Li and Di Xu",
title = "On-the-fly structure splitting for heap objects",
journal = j-TACO,
volume = "8",
number = "4",
pages = "26:1--26:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086705",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With the advent of multicore systems, the gap between
processor speed and memory latency has grown worse
because of their complex interconnect. Sophisticated
techniques are needed more than ever to improve an
application's spatial and temporal locality. This paper
describes an optimization that aims to improve heap
data layout by structure-splitting. It also provides
runtime address checking by piggybacking on the
existing page protection mechanism to guarantee the
correctness of this optimization, which has eluded many
previous attempts due to safety concerns. The technique
can be applied to both sequential and parallel programs
at either compile time or runtime. However, we focus
primarily on sequential programs (i.e., single-threaded
programs) at runtime in this paper.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "26",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Das:2012:ELC,
author = "Dibyendu Das and B. Dupont {De Dinechin} and
Ramakrishna Upadrasta",
title = "Efficient liveness computation using merge sets and
{DJ}-graphs",
journal = j-TACO,
volume = "8",
number = "4",
pages = "27:1--27:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086706",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this work we devise an efficient algorithm that
computes the liveness information of program variables.
The algorithm employs SSA form and DJ-graphs as
representation to build merge sets. The merge set of a
node $n$, $M(n)$, is based on the structure of the
Control Flow Graph (CFG) and consists of all nodes where
a $\phi$-function needs to be placed if a definition of
a variable appears in $n$. The merge sets of a CFG can be
computed using DJ-graphs without prior knowledge of how
the variables are used and defined. Later, we can
answer the liveness query (as a part of other
optimization or analysis phase) by utilizing the
knowledge of the use/def of variables, the dominator
tree and the pre-computed merge sets.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Patsilaras:2012:EEM,
author = "George Patsilaras and Niket K. Choudhary and James
Tuck",
title = "Efficiently exploiting memory level parallelism on
asymmetric coupled cores in the dark silicon era",
journal = j-TACO,
volume = "8",
number = "4",
pages = "28:1--28:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086707",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Extracting high memory-level parallelism (MLP) is
essential for speeding up single-threaded applications
which are memory bound. At the same time, the projected
amount of dark silicon (the fraction of the chip
powered off) on a chip is growing. Hence, Asymmetric
Multicore Processors (AMP) offer a unique opportunity
to integrate many types of cores, each powered at
different times, in order to optimize for different
regions of execution. In this work, we quantify the
potential for exploiting core customization to speed up
programs during regions of high MLP. Based on a careful
design space exploration, we discover that an AMP that
includes a narrow and fast specialized core has the
potential to efficiently exploit MLP.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "28",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Malits:2012:ELG,
author = "Roman Malits and Evgeny Bolotin and Avinoam Kolodny
and Avi Mendelson",
title = "Exploring the limits of {GPGPU} scheduling in control
flow bound applications",
journal = j-TACO,
volume = "8",
number = "4",
pages = "29:1--29:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086708",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "GPGPUs are optimized for graphics, for that reason the
hardware is optimized for massively data parallel
applications characterized by predictable memory access
patterns and little control flow. For such
applications' e.g., matrix multiplication, GPGPU based
system can achieve very high performance. However, many
general purpose data parallel applications are
characterized as having intensive control flow and
unpredictable memory access patterns. Optimizing the
code in such problems for current hardware is often
ineffective and even impractical since it exhibits low
hardware utilization leading to relatively low
performance. This work tracks the root causes of
execution inefficacies when running control flow
intensive CUDA applications on NVIDIA GPGPU hardware.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "29",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Orosa:2012:FIF,
author = "Lois Orosa and Elisardo Antelo and Javier D.
Bruguera",
title = "{FlexSig}: {Implementing} flexible hardware
signatures",
journal = j-TACO,
volume = "8",
number = "4",
pages = "30:1--30:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086709",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With the advent of chip multiprocessors, new
techniques have been developed to make parallel
programming easier and more reliable. New parallel
programming paradigms and new methods of making the
execution of programs more efficient and more reliable
have been developed. Usually, these improvements
require hardware support to avoid a system slowdown.
Signatures based on Bloom filters are widely used as
hardware support for parallel programming in chip
multiprocessors. Signatures are used in Transactional
Memory, thread-level speculation, parallel debugging,
deterministic replay and other tools and applications.
The main limitation of hardware signatures is the lack
of flexibility: if signatures are designed with a given
configuration, tailored to the requirements of a
specific tool or application, it is likely that they
will not fit other requirements well.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "30",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Titos-Gil:2012:HTM,
author = "Ruben Titos-Gil and Manuel E. Acacio and Jose M.
Garcia and Tim Harris and Adrian Cristal and Osman
Unsal and Ibrahim Hur and Mateo Valero",
title = "Hardware transactional memory with software-defined
conflicts",
journal = j-TACO,
volume = "8",
number = "4",
pages = "31:1--31:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086710",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this paper we investigate the benefits of turning
the concept of transactional conflict from its
traditionally fixed definition into a variable one that
can be dynamically controlled in software. We propose
the extension of the atomic language construct with an
attribute that specifies the definition of conflict, so
that programmers can write code which adjusts what
kinds of conflicts are to be detected, relaxing or
tightening the conditions according to the forms of
interference that can be tolerated by a particular
algorithm. Using this performance-motivated construct,
specific conflict information can be associated with
portions of code, as each transaction is provided with
a local definition that applies while it executes.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "31",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kim:2012:IPN,
author = "Yongjoo Kim and Jongeun Lee and Toan X. Mai and
Yunheung Paek",
title = "Improving performance of nested loops on
reconfigurable array processors",
journal = j-TACO,
volume = "8",
number = "4",
pages = "32:1--32:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086711",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Pipelining algorithms are typically concerned with
improving only the steady-state performance, or the
kernel time. The pipeline setup time happens only once
and therefore can be negligible compared to the kernel
time. However, for Coarse-Grained Reconfigurable
Architectures (CGRAs) used as a coprocessor to a main
processor, pipeline setup can take much longer due to
the communication delay between the two processors, and
can become significant if it is repeated in an outer
loop of a loop nest. In this paper we evaluate the
overhead of such non-kernel execution times when
mapping nested loops for CGRAs, and propose a novel
architecture-compiler cooperative scheme to reduce the
overhead, while also minimizing the number of extra
configurations required.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "32",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Purnaprajna:2012:MWI,
author = "Madhura Purnaprajna and Paolo Ienne",
title = "Making wide-issue {VLIW} processors viable on
{FPGAs}",
journal = j-TACO,
volume = "8",
number = "4",
pages = "33:1--33:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086712",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Soft and highly-customized processors are emerging as
a common way to efficiently control the large amount of
computing resources available on FPGAs. Yet, some
processor architectures of choice for DSP and media
applications, such as wide-issue VLIW processors,
remain impractical: the multi-ported register file
makes a very inefficient use of the resources in the
FPGA fabric. This paper proposes modifications to
existing FPGAs to make soft-VLIW processors viable. We
introduce an embedded multi-ported RAM that can be
customized to match the issue-width of VLIW processors.
To ascertain the benefits of this approach, we map an
extensible VLIW processor onto a standard FPGA from
Xilinx.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "33",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Radojkovic:2012:EIS,
author = "Petar Radojkovi{\'c} and Sylvain Girbal and Arnaud
Grasset and Eduardo Qui{\~n}ones and Sami Yehia and
Francisco J. Cazorla",
title = "On the evaluation of the impact of shared resources in
multithreaded {COTS} processors in time-critical
environments",
journal = j-TACO,
volume = "8",
number = "4",
pages = "34:1--34:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086713",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Commercial Off-The-Shelf (COTS) processors are now
commonly used in real-time embedded systems. The
characteristics of these processors fulfill system
requirements in terms of time-to-market, low cost, and
high performance-per-watt ratio. However, multithreaded
(MT) processors are still not widely used in real-time
systems because the timing analysis is too complex. In
MT processors, simultaneously-running tasks share and
compete for processor resources, so the timing analysis
has to estimate the possible impact that the inter-task
interferences have on the execution time of the
applications. In this paper, we propose a method that
quantifies the slowdown that simultaneously-running
tasks may experience due to collisions in shared
processor resources.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "34",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Domnitser:2012:NMC,
author = "Leonid Domnitser and Aamer Jaleel and Jason Loew and
Nael Abu-Ghazaleh and Dmitry Ponomarev",
title = "Non-monopolizable caches: Low-complexity mitigation of
cache side channel attacks",
journal = j-TACO,
volume = "8",
number = "4",
pages = "35:1--35:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086714",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We propose a flexibly-partitioned cache design that
either drastically weakens or completely eliminates
cache-based side channel attacks. The proposed
Non-Monopolizable (NoMo) cache dynamically reserves
cache lines for active threads and prevents other
co-executing threads from evicting reserved lines.
Unreserved lines remain available for dynamic sharing
among threads. NoMo requires only simple modifications
to the cache replacement logic, making it
straightforward to adopt. It requires no software
support, enabling it to automatically protect
pre-existing binaries. NoMo results in performance
degradation of about 1\% on average. We demonstrate
that NoMo can provide strong security guarantees for
the AES and Blowfish encryption algorithms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "35",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Rico:2012:SLS,
author = "Alejandro Rico and Felipe Cabarcas and Carlos
Villavieja and Milan Pavlovic and Augusto Vega and Yoav
Etsion and Alex Ramirez and Mateo Valero",
title = "On the simulation of large-scale architectures using
multiple application abstraction levels",
journal = j-TACO,
volume = "8",
number = "4",
pages = "36:1--36:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086715",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Simulation is a key tool for computer architecture
research. In particular, cycle-accurate simulators are
extremely important for microarchitecture exploration
and detailed design decisions, but they are slow and
thus not suitable for simulating large-scale
architectures, nor are they meant for this. Moreover,
microarchitecture design decisions are irrelevant, or
even misleading, for early processor design stages and
high-level explorations. This allows one to raise the
abstraction level of the simulated architecture, and
also the application abstraction level, as it does not
necessarily have to be represented as an instruction
stream. In this paper we introduce a definition of
different application abstraction levels, and how these
are employed in TaskSim, a multi-core architecture
simulator, to provide several architecture modeling
abstractions, and simulate large-scale architectures
with hundreds of cores.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "36",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Saidi:2012:OED,
author = "Selma Saidi and Pranav Tendulkar and Thierry Lepley
and Oded Maler",
title = "Optimizing explicit data transfers for data parallel
applications on the {Cell} architecture",
journal = j-TACO,
volume = "8",
number = "4",
pages = "37:1--37:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086716",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this paper we investigate a general approach to
automate some deployment decisions for a certain class
of applications on multi-core computers. We consider
data-parallelizable programs that use the well-known
double buffering technique to bring the data from the
off-chip slow memory to the local memory of the cores
via a DMA (direct memory access) mechanism. Based on
the computation time and size of elementary data items
as well as DMA characteristics, we derive optimal and
near-optimal values for the number of blocks that
should be clustered in a single DMA command. We then
extend the results to the case where a computation for
one data item needs some data in its neighborhood.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "37",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Feng:2012:PPL,
author = "Min Feng and Changhui Lin and Rajiv Gupta",
title = "{PLDS}: Partitioning linked data structures for
parallelism",
journal = j-TACO,
volume = "8",
number = "4",
pages = "38:1--38:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086717",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Recently, parallelization of computations in the
presence of dynamic data structures has shown promising
potential. In this paper, we present PLDS, a system for
easily expressing and efficiently exploiting
parallelism in computations that are based on dynamic
linked data structures. PLDS improves the execution
efficiency by providing support for data partitioning
and then distributing computation across threads based
on the partitioning. Such computations often require
the use of speculation to exploit dynamic parallelism.
PLDS supports a conditional speculation mechanism that
reduces the cost of speculation. PLDS can be employed
in the context of different forms of parallelism,
allowing it to cover a wide range of parallel
applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "38",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pradelle:2012:PPB,
author = "Benoit Pradelle and Alain Ketterlin and Philippe
Clauss",
title = "Polyhedral parallelization of binary code",
journal = j-TACO,
volume = "8",
number = "4",
pages = "39:1--39:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086718",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Many automatic software parallelization systems have
been proposed in the past decades, but most of them are
dedicated to source-to-source transformations. This
paper shows that parallelizing executable programs is
feasible, even if they require complex transformations,
and in effect decouples parallelization from
compilation, for example, for closed-source or legacy
software, where binary code is the only available
representation. We propose an automatic parallelizer,
which is able to perform advanced parallelization on
binary code. It first parses the binary code and
extracts high-level information. From this information,
a C program is generated. This program captures only a
subset of the program semantics, namely, loops and
memory accesses.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Dong:2012:RAE,
author = "Yaozu Dong and Yu Chen and Zhenhao Pan and Jinquan Dai
and Yunhong Jiang",
title = "{ReNIC}: Architectural extension to {SR-IOV} {I/O}
virtualization for efficient replication",
journal = j-TACO,
volume = "8",
number = "4",
pages = "40:1--40:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086719",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Virtualization is gaining popularity in cloud
computing and has become the key enabling technology in
cloud infrastructure. By replicating the virtual server
state to multiple independent platforms, virtualization
improves the reliability and availability of cloud
systems. Unfortunately, existing Virtual Machine (VM)
replication solutions were designed only for software
virtualized I/O, which suffers from large performance
and scalability overheads. Although hardware-assisted
I/O virtualization (such as SR-IOV) can achieve close
to native performance and very good scalability, it
cannot be properly replicated across different physical
machines due to architectural limitations (such as the
lack of efficient device state read/write, buffering of
outbound packets, etc.).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "40",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bruintjes:2012:SLA,
author = "Tom M. Bruintjes and Karel H. G. Walters and Sabih H.
Gerez and Bert Molenkamp and Gerard J. M. Smit",
title = "{Sabrewing}: a lightweight architecture for combined
floating-point and integer arithmetic",
journal = j-TACO,
volume = "8",
number = "4",
pages = "41:1--41:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086720",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In spite of the fact that floating-point arithmetic is
costly in terms of silicon area, the joint design of
hardware for floating-point and integer arithmetic is
seldom considered. While components like multipliers
and adders can potentially be shared, floating-point
and integer units in contemporary processors are
practically disjoint. This work presents a new
architecture which tightly integrates floating-point
and integer arithmetic in a single datapath. It is
mainly intended for use in low-power embedded digital
signal processors and therefore the following design
constraints were important: limited use of pipelining
for the convenience of the compiler; maintaining
compatibility with existing technology; minimal area
and power consumption for applicability in embedded
systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "41",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kicherer:2012:SPA,
author = "Mario Kicherer and Fabian Nowak and Rainer Buchty and
Wolfgang Karl",
title = "Seamlessly portable applications: Managing the
diversity of modern heterogeneous systems",
journal = j-TACO,
volume = "8",
number = "4",
pages = "42:1--42:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086721",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Nowadays, many possible configurations of
heterogeneous systems exist, posing several new
challenges to application development: different types
of processing units usually require individual
programming models with dedicated runtime systems and
accompanying libraries. If these are absent on an
end-user system, e.g., because the respective hardware
is not present, an application linked against them will
break. This handicaps the portability of applications
being developed on one system and executed on other,
differently configured heterogeneous systems. Moreover,
the individual profit of different processing units is
normally not known in advance. In this work, we propose
a technique to effectively decouple applications from
their accelerator-specific parts and code.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "42",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Premillieu:2012:SSR,
author = "Nathanael Premillieu and Andre Seznec",
title = "{SYRANT}: {SYmmetric Resource Allocation on Not-taken
and Taken} paths",
journal = j-TACO,
volume = "8",
number = "4",
pages = "43:1--43:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086722",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In the multicore era, achieving ultimate single
process performance is still an issue e.g. for single
process workload or for sequential sections in parallel
applications. Unfortunately, despite tremendous
research effort on branch prediction, substantial
performance potential is still wasted due to branch
mispredictions. On a branch misprediction resolution,
instruction treatment on the wrong path is essentially
thrown away. However, in most cases after a conditional
branch, the taken and the not-taken paths of execution
merge after a few instructions. Instructions that
follow the reconvergence point are executed whatever
the branch outcome is. We present SYRANT (SYmmetric
Resource Allocation on Not-taken and Taken paths), a
new technique for exploiting control independence.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "43",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hasenplaugh:2012:GBC,
author = "William Hasenplaugh and Pritpal S. Ahuja and Aamer
Jaleel and Simon {Steely, Jr.} and Joel Emer",
title = "The gradient-based cache partitioning algorithm",
journal = j-TACO,
volume = "8",
number = "4",
pages = "44:1--44:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086723",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This paper addresses the problem of partitioning a
cache between multiple concurrent threads and in the
presence of hardware prefetching. Cache replacement
designed to preserve temporal locality (e.g., LRU) will
allocate cache resources proportional to the miss-rate
of each competing thread irrespective of whether the
cache space will be utilized [Qureshi and Patt 2006].
This is clearly suboptimal as applications vary
dramatically in their use of recently accessed data. We
address this problem by partitioning a shared cache
such that a global goodness metric is optimized. This
paper introduces the Gradient-based Cache Partitioning
Algorithm (GPA), whose variants optimize either hit
rate, total instructions per cycle (IPC), or a
weighted IPC metric designed to enforce Quality of
Service (QoS) [Iyer 2004].",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "44",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lira:2012:MPA,
author = "Javier Lira and Timothy M. Jones and Carlos Molina and
Antonio Gonz{\'a}lez",
title = "The migration prefetcher: Anticipating data promotion
in dynamic {NUCA} caches",
journal = j-TACO,
volume = "8",
number = "4",
pages = "45:1--45:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086724",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The exponential increase in multicore processor (CMP)
cache sizes accompanied by growing on-chip wire delays
make it difficult to implement traditional caches with
a single, uniform access latency. Non-Uniform Cache
Architecture (NUCA) designs have been proposed to
address this problem. A NUCA divides the whole cache
memory into smaller banks and allows banks nearer a
processor core to have lower access latencies than
those further away, thus mitigating the effects of the
cache's internal wires. Determining the best placement
for data in the NUCA cache at any particular moment
during program execution is crucial for exploiting the
benefits that this architecture provides.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "45",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pusukuri:2012:TTD,
author = "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N.
Bhuyan",
title = "Thread Tranquilizer: Dynamically reducing performance
variation",
journal = j-TACO,
volume = "8",
number = "4",
pages = "46:1--46:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086725",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "To realize the performance potential of multicore
systems, we must effectively manage the interactions
between memory reference behavior and the operating
system policies for thread scheduling and migration
decisions. We observe that these interactions lead to
significant variations in the performance of a given
application, from one execution to the next, even when
the program input remains unchanged and no other
applications are being run on the system. Our
experiments with multithreaded programs, including the
TATP database application, SPECjbb2005, and a subset of
PARSEC and SPEC OMP programs, on a 24-core Dell
PowerEdge R905 server running OpenSolaris confirm the
above observation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "46",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2012:TPB,
author = "Dongsong Zhang and Deke Guo and Fangyuan Chen and Fei
Wu and Tong Wu and Ting Cao and Shiyao Jin",
title = "{TL}-plane-based multi-core energy-efficient real-time
scheduling algorithm for sporadic tasks",
journal = j-TACO,
volume = "8",
number = "4",
pages = "47:1--47:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086726",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As the energy consumption of multi-core systems
becomes increasingly prominent, it is a challenge to
design an energy-efficient real-time scheduling
algorithm for multi-core systems that reduces system
energy consumption while guaranteeing the feasibility
of real-time tasks. In this paper, we focus on
multi-core processors, with the global Dynamic Voltage
Frequency Scaling (DVFS) and Dynamic Power Management
(DPM) technologies. In this setting, we propose an
energy-efficient real-time scheduling algorithm, the
Time Local remaining execution plane based Dynamic
Voltage Frequency Scaling (TL-DVFS). TL-DVFS utilizes
the concept of Time Local remaining execution (TL)
plane to dynamically scale the voltage and frequency of
a processor at the initial time of each TL plane as
well as at the release time of a sporadic task in each
TL plane.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "47",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lyons:2012:ASS,
author = "Michael J. Lyons and Mark Hempstead and Gu-Yeon Wei
and David Brooks",
title = "The accelerator store: a shared memory framework for
accelerator-based systems",
journal = j-TACO,
volume = "8",
number = "4",
pages = "48:1--48:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086727",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This paper presents the many-accelerator architecture,
a design approach combining the scalability of
homogeneous multi-core architectures and
system-on-chip's high performance and power-efficient
hardware accelerators. In preparation for systems
containing tens or hundreds of accelerators, we
characterize a diverse pool of accelerators and find
each contains significant amounts of SRAM memory (up to
90\% of their area). We take advantage of this
discovery and introduce the accelerator store, a
scalable architectural component to minimize
accelerator area by sharing its memories between
accelerators. We evaluate the accelerator store for two
applications and find significant system area
reductions (30\%) in exchange for small overheads (2\%
performance, 0\%--8\% energy).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "48",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Orozco:2012:THT,
author = "Daniel Orozco and Elkin Garcia and Rishi Khan and
Kelly Livingston and Guang R. Gao",
title = "Toward high-throughput algorithms on many-core
architectures",
journal = j-TACO,
volume = "8",
number = "4",
pages = "49:1--49:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086728",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Advanced many-core CPU chips already have a few
hundreds of processing cores (e.g., 160 cores in an IBM
Cyclops-64 chip) and more and more processing cores
become available as computer architecture progresses.
The underlying runtime systems of such architectures
need to efficiently serve hundreds of processors at the
same time, requiring all basic data structures within
the runtime to maintain unprecedented throughput. In
this paper, we analyze the throughput requirements that
must be met by algorithms in runtime systems to be able
to handle hundreds of simultaneous operations in real
time. We reach a surprising conclusion: Many
traditional algorithm techniques are poorly suited for
highly parallel computing environments because of their
low throughput.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "49",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Stock:2012:UML,
author = "Kevin Stock and Louis-No{\"e}l Pouchet and P.
Sadayappan",
title = "Using machine learning to improve automatic
vectorization",
journal = j-TACO,
volume = "8",
number = "4",
pages = "50:1--50:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086729",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Automatic vectorization is critical to enhancing
performance of compute-intensive programs on modern
processors. However, there is much room for improvement
over the auto-vectorization capabilities of current
production compilers through careful vector-code
synthesis that utilizes a variety of loop
transformations (e.g., unroll-and-jam, interchange,
etc.). As the set of transformations considered is
increased, the selection of the most effective
combination of transformations becomes a significant
challenge: Currently used cost models in vectorizing
compilers are often unable to identify the best
choices. In this paper, we address this problem using
machine learning models to predict the performance of
SIMD codes. In contrast to existing approaches that
have used high-level features of the program, we
develop machine learning models based on features
extracted from the generated assembly code.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "50",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Therdsteerasukdi:2012:URI,
author = "Kanit Therdsteerasukdi and Gyungsu Byun and Jason Cong
and M. Frank Chang and Glenn Reinman",
title = "Utilizing {RF-I} and intelligent scheduling for better
throughput\slash watt in a mobile {GPU} memory system",
journal = j-TACO,
volume = "8",
number = "4",
pages = "51:1--51:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086730",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Smartphones and tablets are becoming more and more
powerful, replacing desktops and laptops as the users'
main computing system. As these systems support higher
and higher resolutions with more complex 3D graphics, a
high-throughput and low-power memory system is
essential for the mobile GPU. In this article, we
propose to improve throughput/watt in a mobile GPU
memory system by using intelligent scheduling to reduce
power and multi-band radio frequency interconnect
(MRF-I) to offset any throughput degradation caused by
our intelligent scheduling.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "51",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ryckbosch:2012:VSM,
author = "Frederick Ryckbosch and Stijn Polfliet and Lieven
Eeckhout",
title = "{VSim}: Simulating multi-server setups at near native
hardware speed",
journal = j-TACO,
volume = "8",
number = "4",
pages = "52:1--52:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086731",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Simulating contemporary computer systems is a
challenging endeavor, especially when it comes to
simulating high-end setups involving multiple servers.
The simulation environment needs to run complete
software stacks, including operating systems,
middleware, and application software, and it needs to
simulate network and disk activity next to CPU
performance. In addition, it needs the ability to scale
out to a large number of server nodes while attaining
good accuracy and reasonable simulation speeds. This
paper presents VSim, a novel simulation methodology for
multi-server systems. VSim leverages virtualization
technology for simulating a target system on a host
system. VSim controls CPU, network and disk performance
on the host, and it gives the software stack the
illusion of running on a target system through time
dilation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhou:2012:WAP,
author = "Miao Zhou and Yu Du and Bruce Childers and Rami Melhem
and Daniel Moss{\'e}",
title = "Writeback-aware partitioning and replacement for
last-level caches in phase change main memory systems",
journal = j-TACO,
volume = "8",
number = "4",
pages = "53:1--53:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086732",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Phase-Change Memory (PCM) has emerged as a promising
low-power main memory candidate to replace DRAM. The
main problems of PCM are that writes are much slower
and more power hungry than reads, write bandwidth is
much lower than read bandwidth, and write endurance is
limited. Adding an extra layer of cache, which is
logically the last-level cache (LLC), can mitigate the
drawbacks of PCM. However, writebacks from the LLC
might (a) overwhelm the limited PCM write bandwidth and
stall the application, (b) shorten lifetime, and (c)
increase energy consumption. Cache partitioning and
replacement schemes are important to achieve high
throughput for multi-core systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "53",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2012:TMA,
author = "Qingping Wang and Sameer Kulkarni and John Cavazos and
Michael Spear",
title = "A transactional memory with automatic performance
tuning",
journal = j-TACO,
volume = "8",
number = "4",
pages = "54:1--54:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086733",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A significant obstacle to the acceptance of
transactional memory (TM) in real-world parallel
programs is the abundance of substantially different TM
algorithms. Each TM algorithm appears well-suited to
certain workload characteristics, but the best choice
of algorithm is sensitive to program inputs, available
cores, and program phases. Furthermore, operating
system and hardware characteristics can affect which
algorithm is best, with tradeoffs changing across
iterations of a single ISA. This paper introduces
methods for constructing policies to dynamically select
the most appropriate TM algorithm based on static and
dynamic information. We leverage intraprocedural static
analysis to create a static profile of the
application.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "54",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bogdanski:2012:SFC,
author = "Bartosz Bogdanski and Sven-Arne Reinemo and Frank Olaf
Sem-Jacobsen and Ernst Gunnar Gran",
title = "{sFtree}: a fully connected and deadlock-free
switch-to-switch routing algorithm for fat-trees",
journal = j-TACO,
volume = "8",
number = "4",
pages = "55:1--55:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086734",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Existing fat-tree routing algorithms fully exploit the
path diversity of a fat-tree topology in the context of
compute node traffic, but they lack support for
deadlock-free and fully connected switch-to-switch
communication. Such support is crucial for efficient
system management, for example, in InfiniBand (IB)
systems. With the general increase in system management
capabilities found in modern InfiniBand switches, the
lack of deadlock-free switch-to-switch communication is
a problem for fat-tree-based IB installations because
management traffic might cause routing deadlocks that
bring the whole system down. This lack of deadlock-free
communication affects all system management and
diagnostic tools using LID routing. In this paper, we
propose the sFtree routing algorithm that guarantees
deadlock-free and fully connected switch-to-switch
communication in fat-trees while maintaining the
properties of the current fat-tree algorithm.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ghandour:2012:LSB,
author = "Walid J. Ghandour and Haitham Akkary and Wes Masri",
title = "Leveraging Strength-Based Dynamic Information Flow
Analysis to Enhance Data Value Prediction",
journal = j-TACO,
volume = "9",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2133382.2133383",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 30 17:45:35 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Value prediction is a technique to increase
parallelism by attempting to overcome serialization
constraints caused by true data dependences. By
predicting the outcome of an instruction before it
executes, value prediction allows data dependent
instructions to issue and execute speculatively, hence
increasing parallelism when the prediction is correct.
In case of a misprediction, the execution is redone
with the corrected value. If the benefit from increased
parallelism outweighs the misprediction recovery
penalty, overall performance could be improved.
Enhancing performance with value prediction therefore
requires highly accurate prediction methods. Most
existing general value prediction techniques are local,
that is, future outputs of an instruction are predicted
based on outputs from previous executions of the same
instruction. In this article, we investigate leveraging
strength-based dynamic information flow analysis to
enhance data value prediction. We use dynamic
information flow analysis (DIFA) to determine when a
specific value predictor can perform well and even
outperform other predictors. We apply information
theory to mathematically prove the validity and
benefits of correlating value predictors. We also
introduce the concept of linear value predictors, a
new technique that predicts a new value from another
one using a linear relation. We finally present a
variant of the stride predictor that we call update
stride.
We then conduct an empirical analysis using Pin, a
dynamic binary instrumentation tool, and DynFlow, a
dynamic information flow analysis tool, that we apply
to programs from the SPECjvm2008 and Siemens
benchmarks. Our empirical measurements support our
mathematical theory and allow us to make important
observations on the relation between predictability of
data values and information flow. Our analysis and
empirical results show that the values of a set of
selected variables can be predicted with a very high
accuracy, up to 100\%. Such prediction is based on the
previous history and/or the values of one or more other
source variables that have strong information flow into
the predicted variable. Using our selection criteria,
we show that a DIFA-directed predictor outperforms
hardware value prediction for all subject programs, and
sometimes by a significant margin. This was observed
even when using an ideal tagged hardware value
prediction table that does not suffer from aliasing or
capacity misses.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lee:2012:WPW,
author = "Jaekyu Lee and Hyesoon Kim and Richard Vuduc",
title = "When Prefetching Works, When It Doesn't, and Why",
journal = j-TACO,
volume = "9",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2133382.2133384",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 30 17:45:35 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In emerging and future high-end processor systems,
tolerating increasing cache miss latency and properly
managing memory bandwidth will be critical to achieving
high performance. Prefetching, in both hardware and
software, is among our most important available
techniques for doing so; yet, we claim that prefetching
is perhaps also the least well-understood. Thus, the
goal of this study is to develop a novel, foundational
understanding of both the benefits and limitations of
hardware and software prefetching. Our study includes:
source code-level analysis, to help in understanding
the practical strengths and weaknesses of compiler- and
software-based prefetching; a study of the synergistic
and antagonistic effects between software and hardware
prefetching; and an evaluation of hardware prefetching
training policies in the presence of software
prefetching requests. We use both simulation and
measurement on real systems. We find, for instance,
that although there are many opportunities for
compilers to prefetch much more aggressively than they
currently do, there is also a tangible risk of
interference with training existing hardware
prefetching mechanisms. Taken together, our
observations suggest new research directions for
cooperative hardware/software prefetching.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mazloom:2012:DTI,
author = "Bita Mazloom and Shashidhar Mysore and Mohit Tiwari
and Banit Agrawal and Tim Sherwood",
title = "Dataflow Tomography: Information Flow Tracking For
Understanding and Visualizing Full Systems",
journal = j-TACO,
volume = "9",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2133382.2133385",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 30 17:45:35 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "It is not uncommon for modern systems to be composed
of a variety of interacting services, running across
multiple machines in such a way that most developers do
not really understand the whole system. As abstraction
is layered atop abstraction, developers gain the
ability to compose systems of extraordinary complexity
with relative ease. However, many software properties,
especially those that cut across abstraction layers,
become very difficult to understand in such
compositions. The communication patterns involved, the
privacy of critical data, and the provenance of
information, can be difficult to find and understand,
even with access to all of the source code. The goal of
Dataflow Tomography is to use the inherent information
flow of such systems to help visualize the interactions
between complex and interwoven components across
multiple layers of abstraction. In the same way that
the injection of short-lived radioactive isotopes helps
doctors trace problems in the cardiovascular system,
the use of ``data tagging'' can help developers slice
through the extraneous layers of software and pinpoint
those portions of the system interacting with the data
of interest. To demonstrate the feasibility of this
approach we have developed a prototype system in which
tags are tracked both through the machine and in
between machines over the network, and from which novel
visualizations of the whole system can be derived. We
describe the system-level challenges in creating a
working system tomography tool and we qualitatively
evaluate our system by examining several example real
world scenarios.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ahn:2012:ISE,
author = "Jung Ho Ahn and Norman P. Jouppi and Christos
Kozyrakis and Jacob Leverich and Robert S. Schreiber",
title = "Improving System Energy Efficiency with Memory Rank
Subsetting",
journal = j-TACO,
volume = "9",
number = "1",
pages = "4:1--4:??",
month = mar,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2133382.2133386",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 30 17:45:35 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "VLSI process technology scaling has enabled dramatic
improvements in the capacity and peak bandwidth of DRAM
devices. However, current standard DDR x DIMM memory
interfaces are not well tailored to achieve high energy
efficiency and performance in modern
chip-multiprocessor-based computer systems. Their
suboptimal performance and energy inefficiency can have
a significant impact on system-wide efficiency since
much of the system power dissipation is due to memory
power. New memory interfaces, better suited for future
many-core systems, are needed. In response, there are
recent proposals to enhance the energy efficiency of
main-memory systems by dividing a memory rank into
subsets, and making a subset rather than a whole rank
serve a memory request. We holistically assess the
effectiveness of rank subsetting from system-wide
performance, energy-efficiency, and reliability
perspectives. We identify the impact of rank subsetting
on memory power and processor performance analytically,
compare two promising rank-subsetting proposals,
Multicore DIMM and mini-rank, and verify our analysis
by simulating a chip-multiprocessor system using
multithreaded and consolidated workloads. We extend the
design of Multicore DIMM for high-reliability systems
and show that compared with conventional chipkill
approaches, rank subsetting can lead to much higher
system-level energy efficiency and performance at the
cost of additional DRAM devices. This holistic
assessment shows that rank subsetting offers compelling
alternatives to existing processor-memory interfaces
for future DDR systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yang:2012:CGC,
author = "Xuejun Yang and Li Wang and Jingling Xue and Qingbo
Wu",
title = "Comparability Graph Coloring for Optimizing
Utilization of Software-Managed Stream Register Files
for Stream Processors",
journal = j-TACO,
volume = "9",
number = "1",
pages = "5:1--5:??",
month = mar,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2133382.2133387",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 30 17:45:35 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The stream processors represent a promising
alternative to traditional cache-based general-purpose
processors in achieving high performance in stream
applications (media and some scientific applications).
In a stream programming model for stream processors, an
application is decomposed into a sequence of kernels
operating on streams of data. During the execution of a
kernel on a stream processor, all streams accessed must
be communicated through a nonbypassing software-managed
on-chip memory, the SRF (Stream Register File).
Optimizing utilization of the scarce on-chip memory is
crucial for good performance. The key insight is that
the interference graphs (IGs) formed by the streams in
stream applications tend to be comparability graphs or
decomposable into a set of comparability graphs. We
present a compiler algorithm for finding optimal or
near-optimal colorings, that is, SRF allocations in
stream IGs, by computing a maximum spanning forest of
the sub-IG formed by long live ranges, if necessary.
Our experimental results validate the optimality and
near-optimality of our algorithm by comparing it with
an ILP solver, and show that our algorithm yields
improved SRF utilization over the First-Fit bin-packing
algorithm, the best in the literature.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Majumdar:2012:MPE,
author = "Abhinandan Majumdar and Srihari Cadambi and Michela
Becchi and Srimat T. Chakradhar and Hans Peter Graf",
title = "A Massively Parallel, Energy Efficient Programmable
Accelerator for Learning and Classification",
journal = j-TACO,
volume = "9",
number = "1",
pages = "6:1--6:??",
month = mar,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2133382.2133388",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 30 17:45:35 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Applications that use learning and classification
algorithms operate on large amounts of unstructured
data, and have stringent performance constraints. For
such applications, the performance of general purpose
processors scales poorly with data size because of
their limited support for fine-grained parallelism and
absence of software-managed caches. The large
intermediate data in these applications also limits
achievable performance on many-core processors such as
GPUs. To accelerate such learning applications, we
present a programmable accelerator that can execute
multiple learning and classification algorithms. To
architect such an accelerator, we profile five
representative workloads, and find that their
computationally intensive portions can be formulated as
matrix or vector operations generating large amounts of
intermediate data, which are then reduced by a
secondary operation such as array ranking, finding
max/min and aggregation. Our proposed accelerator,
called MAPLE, has hundreds of simple processing
elements (PEs) laid out in a two-dimensional grid, with
two key features. First, it uses dynamic in-memory
processing where on-chip memory blocks perform the
secondary reduction operations. Second, MAPLE uses
banked off-chip memory, and organizes its PEs into
independent groups each with its own off-chip memory
bank. These two features allow MAPLE to scale its
performance with data size. We also present an Atom
based energy-efficient heterogeneous system with MAPLE
as the accelerator that satisfies the application's
performance requirements at a lower system power. This
article describes the MAPLE architecture, explores its
design space with a simulator, illustrates how to
automatically map application kernels to the hardware,
and presents its performance improvement and energy
benefits over classic server-based implementations. We
implement a 512-PE FPGA prototype of MAPLE and find
that it is 1.5-10x faster than a 2.5 GHz quad-core Xeon
processor despite running at a modest 125 MHz clock
rate. With MAPLE connected to a 1.6GHz dual-core Atom,
we show an energy improvement of 38--84\% over the Xeon
server coupled to a 1.3 GHz 240 core Tesla GPU.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Eyerman:2012:PMJ,
author = "Stijn Eyerman and Lieven Eeckhout",
title = "Probabilistic modeling for job symbiosis scheduling on
{SMT} processors",
journal = j-TACO,
volume = "9",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2207222.2207223",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Jun 13 17:20:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Symbiotic job scheduling improves simultaneous
multithreading (SMT) processor performance by
coscheduling jobs that have ``compatible'' demands on
the processor's shared resources. Existing approaches
however require a sampling phase, evaluate a limited
number of possible coschedules, use heuristics to gauge
symbiosis, are rigid in their optimization target, and
do not preserve system-level priorities/shares. This
article proposes probabilistic job symbiosis modeling,
which predicts whether jobs will create positive or
negative symbiosis when coscheduled without requiring
the coschedule to be evaluated. The model, which uses
per-thread cycle stacks computed through a previously
proposed cycle accounting architecture, is simple
enough to be used in system software. Probabilistic job
symbiosis modeling provides six key innovations over
prior work in symbiotic job scheduling: (i) it does not
require a sampling phase, (ii) it readjusts the job
coschedule continuously, (iii) it evaluates a large
number of possible coschedules at very low overhead,
(iv) it is not driven by heuristics, (v) it can
optimize a performance target of interest (e.g., system
throughput or job turnaround time), and (vi) it
preserves system-level priorities/shares. These
innovations make symbiotic job scheduling both
practical and effective. Our experimental evaluation,
which assumes a realistic scenario in which jobs come
and go, reports an average 16\% (and up to 35\%)
reduction in job turnaround time compared to the
previously proposed SOS (sample, optimize, symbios)
approach for a two-thread SMT processor, and an average
19\% (and up to 45\%) reduction in job turnaround time
for a four-thread SMT processor.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Seghir:2012:IAT,
author = "Rachid Seghir and Vincent Loechner and Beno{\^\i}t
Meister",
title = "Integer affine transformations of parametric
{$Z$}-polytopes and applications to loop nest
optimization",
journal = j-TACO,
volume = "9",
number = "2",
pages = "8:1--8:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2207222.2207224",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Jun 13 17:20:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The polyhedral model is a well-known compiler
optimization framework for the analysis and
transformation of affine loop nests. We present a new
method to solve a difficult geometric operation that is
raised by this model: the integer affine transformation
of parametric $Z$-polytopes. The result of such a
transformation is given by a worst-case exponential
union of $Z$-polytopes. We also propose a polynomial
algorithm (for fixed dimension), to count points in
arbitrary unions of a fixed number of parametric
$Z$-polytopes. We implemented these algorithms and
compared them to other existing algorithms, for a set
of applications to loop nest analysis and
optimization.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yang:2012:UOC,
author = "Yi Yang and Ping Xiang and Jingfei Kong and Mike
Mantor and Huiyang Zhou",
title = "A unified optimizing compiler framework for different
{GPGPU} architectures",
journal = j-TACO,
volume = "9",
number = "2",
pages = "9:1--9:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2207222.2207225",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Jun 13 17:20:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article presents a novel optimizing compiler for
general purpose computation on graphics processing
units (GPGPU). It addresses two major challenges of
developing high performance GPGPU programs: effective
utilization of GPU memory hierarchy and judicious
management of parallelism. The input to our compiler is
a na{\"\i}ve GPU kernel function, which is functionally
correct but without any consideration for performance
optimization. The compiler generates two kernels, one
optimized for global memories and the other for texture
memories. The proposed compilation process is effective
for both AMD/ATI and NVIDIA GPUs. The experiments show
that our optimized code achieves very high performance,
either superior or very close to highly fine-tuned
libraries.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jang:2012:ACO,
author = "Choonki Jang and Jaejin Lee and Bernhard Egger and
Soojung Ryu",
title = "Automatic code overlay generation and partially
redundant code fetch elimination",
journal = j-TACO,
volume = "9",
number = "2",
pages = "10:1--10:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2207222.2207226",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Jun 13 17:20:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "There is an increasing interest in explicitly managed
memory hierarchies, where a hierarchy of distinct
memories is exposed to the programmer and managed
explicitly in software. These hierarchies can be found
in typical embedded systems and an emerging class of
multicore architectures. To run an application that
requires more code memory than the available
higher-level memory, typically an overlay structure is
needed. The overlay structure is generated manually by
the programmer or automatically by a specialized
linker. Manual code overlaying requires the programmer
to deeply understand the program structure for maximum
memory savings as well as minimum performance
degradation. Although the linker can automatically
generate the code overlay structure, its memory savings
are limited and it even brings significant performance
degradation because traditional techniques do not
consider the program context. In this article, we
propose an automatic code overlay generation technique
that overcomes the limitations of traditional automatic
code overlaying techniques. We are dealing with a
system context that imposes two distinct constraints:
(1) no hardware support for address translation and (2)
a spatially and temporally coarse grained faulting
mechanism at the function level. Our approach addresses
those two constraints as efficiently as possible. Our
technique statically computes the Worst-Case Number of
Conflict misses (WCNC) between two different code
segments using path expressions. Then, it constructs a
static temporal relationship graph with the WCNCs and
emits an overlay structure for a given higher-level
memory size. We also propose an inter-procedural
partial redundancy elimination technique that minimizes
redundant code copying caused by the generated overlay
structure. Experimental results show that our approach
is promising.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Abbasi:2012:TSW,
author = "Zahra Abbasi and Georgios Varsamopoulos and Sandeep K.
S. Gupta",
title = "{TACOMA}: Server and workload management in {Internet}
data centers considering cooling-computing power
trade-off and energy proportionality",
journal = j-TACO,
volume = "9",
number = "2",
pages = "11:1--11:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2207222.2207227",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Jun 13 17:20:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A two-tier Internet data center management scheme,
TACOMA, with thermal-aware server provisioning (TASP)
in one tier, and thermal-aware workload distribution
(TAWD) in the other is proposed. TASP and TAWD
coordinate to maximize the energy savings by leveraging
the workload dynamics, at coarse and fine time scale,
respectively. TACOMA is aware of the QoS constraints,
the energy proportionality of servers, and the
potential trade-off between cooling and computing
power. The obtained energy savings are a combination of
suspending idle servers, using servers at their peak
efficiency, and avoiding heat recirculation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lankes:2012:BSP,
author = "Andreas Lankes and Thomas Wild and Stefan Wallentowitz
and Andreas Herkersdorf",
title = "Benefits of selective packet discard in
networks-on-chip",
journal = j-TACO,
volume = "9",
number = "2",
pages = "12:1--12:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2207222.2207228",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Jun 13 17:20:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Today, Network on Chip concepts principally assume
inherent lossless operation. Considering that future
nanometer CMOS technologies will witness increased
sensitivity to all forms of manufacturing and
environmental variations (e.g., IR drop, soft errors
due to radiation, transient temperature induced timing
problems, device aging), efforts to cope with data
corruption or packet loss will be unavoidable. Possible
counter measures against packet loss are the extension
of flits with ECC or the introduction of error
detection with retransmission. We propose to make use
of the perceived deficiency of packet loss as a
feature. By selectively discarding stuck packets in the
NoC, a proven practice in computer networks, all types
of deadlocks can be resolved. This is especially
advantageous for solving the problem of
message-dependent deadlocks, which otherwise leads to
high costs either in terms of throughput or chip area.
Strict ordering, the most popular approach to this
problem, results in a significant buffer overhead and a
more complex router architecture. In addition, we will
show that eliminating local network congestions by
selectively discarding individual packets also can
improve the effective throughput of the network. The
end-to-end retransmission mechanism required for the
reliable communication, then also provides lossless
communication for the cores.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Luo:2012:DDS,
author = "Yangchun Luo and Antonia Zhai",
title = "Dynamically dispatching speculative threads to improve
sequential execution",
journal = j-TACO,
volume = "9",
number = "3",
pages = "13:1--13:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355586",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Efficiently utilizing multicore processors to improve
their performance potentials demands extracting
thread-level parallelism from the applications. Various
novel and sophisticated execution models have been
proposed to extract thread-level parallelism from
sequential programs. One such execution model,
Thread-Level Speculation (TLS), allows potentially
dependent threads to execute speculatively in parallel.
However, TLS execution is inherently unpredictable, and
consequently incorrect speculation could degrade
performance for the multicore systems. Existing
approaches have focused on using the compilers to
select sequential program regions to apply TLS. Our
research shows that even the state-of-the-art compiler
makes suboptimal decisions, due to the unpredictability
of TLS execution. Thus, we propose to dynamically
optimize TLS performance. This article describes the
design, implementation, and evaluation of a runtime
thread dispatching mechanism that adjusts the behaviors
of speculative threads based on their efficiency. In
the proposed system, speculative threads are monitored
by hardware-based performance counters and their
performance impact is evaluated with a novel
methodology that takes into account various unique TLS
characteristics. Thread dispatching policies are
devised to adjust the behaviors of speculative threads
accordingly. With the help of the runtime evaluation,
where and how to create speculative threads is better
determined. Evaluated with all the SPEC CPU2000
benchmark programs written in C, the dynamic
dispatching system outperforms the state-of-the-art
compiler-based thread management techniques by 9.4\% on
average. Comparing to sequential execution, we achieve
1.37X performance improvement on a four-core CMP-based
system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cui:2012:EPO,
author = "Huimin Cui and Jingling Xue and Lei Wang and Yang Yang
and Xiaobing Feng and Dongrui Fan",
title = "Extendable pattern-oriented optimization directives",
journal = j-TACO,
volume = "9",
number = "3",
pages = "14:1--14:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355587",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Algorithm-specific, that is, semantic-specific
optimizations have been observed to bring significant
performance gains, especially for a diverse set of
multi/many-core architectures. However, current
programming models and compiler technologies for the
state-of-the-art architectures do not exploit well
these performance opportunities. In this article, we
propose a pattern-making methodology that enables
algorithm-specific optimizations to be encapsulated
into ``optimization patterns''. Such optimization
patterns are expressed in terms of preprocessor
directives so that simple annotations can result in
significant performance improvements. To validate this
new methodology, a framework, named EPOD, is developed
to map these directives into the underlying
optimization schemes for a particular architecture. It
is difficult to create an exact performance model to
determine an optimal or near-optimal optimization
scheme (including which optimizations to apply and in
which order) for a specific application, due to the
complexity of applications and architectures. However,
it is trackable to build individual optimization
components and let compiler developers synthesize an
optimization scheme from these components. Therefore,
our EPOD framework provides an Optimization Programming
Interface (OPI) for compiler developers to define new
optimization schemes. Thus, new patterns can be
integrated into EPOD in a flexible manner. We have
identified and implemented a number of optimization
patterns for three representative computer platforms.
Our experimental results show that a pattern-guided
compiler can outperform the state-of-the-art compilers
and even achieve performance as competitive as
hand-tuned code. Therefore, such a pattern-making
methodology represents an encouraging direction for
domain experts' experience and knowledge to be
integrated into general-purpose compilers.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lewis:2012:REC,
author = "Adam Wade Lewis and Nian-Feng Tzeng and Soumik Ghosh",
title = "Runtime energy consumption estimation for server
workloads based on chaotic time-series approximation",
journal = j-TACO,
volume = "9",
number = "3",
pages = "15:1--15:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355588",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article proposes a runtime model that relates
server energy consumption to its overall thermal
envelope, using hardware performance counters and
experimental measurements. While previous studies have
attempted system-wide modeling of server power
consumption through subsystem models, our approach is
different in that it links system energy input to
subsystem energy consumption based on a small set of
tightly correlated parameters. The proposed model takes
into account processor power, bus activities, and
system ambient temperature for real-time prediction on
the power consumption of long running jobs. Using the
HyperTransport and QuickPath Link structures as case
studies and through electrical measurements on example
server subsystems, we develop a chaotic time-series
approximation for runtime power consumption, arriving
at the Chaotic Attractor Predictor (CAP). With
polynomial time complexity, CAP exhibits high
prediction accuracy, having the prediction errors
within 1.6\% (or 3.3\%) for servers based on the
HyperTransport bus (or the QuickPath Links), as
verified by a set of common processor benchmarks. Our
CAP is a superior predictive mechanism over existing
linear auto-regressive methods, which require expensive
and complex corrective steps to address the nonlinear
and chaotic aspects of the underlying physical
system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Valero:2012:CRI,
author = "Alejandro Valero and Julio Sahuquillo and Salvador
Petit and Pedro L{\'o}pez and Jos{\'e} Duato",
title = "Combining recency of information with selective random
and a victim cache in last-level caches",
journal = j-TACO,
volume = "9",
number = "3",
pages = "16:1--16:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355589",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Memory latency has become an important performance
bottleneck in current microprocessors. This problem
aggravates as the number of cores sharing the same
memory controller increases. To palliate this problem,
a common solution is to implement cache hierarchies
with large or huge Last-Level Cache (LLC)
organizations. LLC memories are implemented with a high
number of ways (e.g., 16) to reduce conflict misses.
Typically, caches have implemented the LRU algorithm to
exploit temporal locality, but its performance goes
away from the optimal as the number of ways increases.
In addition, the implementation of a strict LRU
algorithm is costly in terms of area and power. This
article focuses on a family of low-cost replacement
strategies, whose implementation scales with the number
of ways while maintaining the performance. The proposed
strategies track the accessing order for just a few
blocks, which cannot be replaced. The victim is
randomly selected among those blocks exhibiting poor
locality. Although, in general, the random policy helps
improving the performance, in some applications the
scheme fails with respect to the LRU policy leading to
performance degradation. This drawback can be overcome
by the addition of a small victim cache of the large
LLC. Experimental results show that, using the best
version of the family without victim cache, MPKI
reduction falls in between 10\% and 11\% compared to a
set of the most representative state-of-the-art
algorithms, whereas the reduction grows up to 22\% with
respect to LRU. The proposal with victim cache achieves
speedup improvements, on average, by 4\% compared to
LRU. In addition, it reduces dynamic energy, on
average, up to 8\%. Finally, compared to the studied
algorithms, hardware complexity is largely reduced by
the baseline algorithm of the family.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2012:DQM,
author = "Bin Li and Li-Shiuan Peh and Li Zhao and Ravi Iyer",
title = "Dynamic {QoS} management for chip multiprocessors",
journal = j-TACO,
volume = "9",
number = "3",
pages = "17:1--17:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355590",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With the continuing scaling of semiconductor
technologies, chip multiprocessor (CMP) has become the
de facto design for modern high performance computer
architectures. It is expected that more and more
applications with diverse requirements will run
simultaneously on the CMP platform. However, this will
exert contention on shared resources such as the last
level cache, network-on-chip bandwidth and off-chip
memory bandwidth, thus affecting the performance and
quality-of-service (QoS) significantly. In this
environment, efficient resource sharing and a guarantee
of a certain level of performance is highly desirable.
Researchers have proposed different frameworks for
providing QoS. Most of these frameworks focus on
individual resource for QoS management. Coordinated
management of multiple QoS-aware shared resources at
runtime remains an open problem. Recently, there has
been work that proposed a class-of-serviced based
framework to jointly managing cache, NoC and memory
resources simultaneously. However, the work allocates
shared resources statically at the beginning of
application runtime, and do not dynamically track,
manage and share shared resources across applications.
In this article, we address this limitation by
proposing dynamic resource management policies that
monitor the resource usage of applications at runtime,
then steals resources from the high-priority
applications for lower-priority ones. The goal is to
maintain the targeted level of performance for
high-priority applications while improving the
performance of lower-priority applications. We use a PI
(Proportional-Integral gain) feedback controller based
technique to maintain stability in our framework. Our
evaluation results show that our policy can improve
performance for lower-priority applications
significantly while maintaining the performance for
high-priority application, thus demonstrating the
effectiveness of our dynamic QoS resource management
policy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xekalakis:2012:MSM,
author = "Polychronis Xekalakis and Nikolas Ioannou and Marcelo
Cintra",
title = "Mixed speculative multithreaded execution models",
journal = j-TACO,
volume = "9",
number = "3",
pages = "18:1--18:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355591",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The current trend toward multicore architectures has
placed great pressure on programmers and compilers to
generate thread-parallel programs. Improved execution
performance can no longer be obtained via traditional
single-thread instruction level parallelism (ILP), but,
instead, via multithreaded execution. One notable
technique that facilitates the extraction of parallel
threads from sequential applications is thread-level
speculation (TLS). This technique allows
programmers/compilers to generate threads without
checking for inter-thread data and control dependences,
which are then transparently enforced by the hardware.
Most prior work on TLS has concentrated on thread
selection and mechanisms to efficiently support the
main TLS operations, such as squashes, data versioning,
and commits. This article seeks to enhance TLS
functionality by combining it with other speculative
multithreaded execution models. The main idea is that
TLS already requires extensive hardware support, which
when slightly augmented can accommodate other
speculative multithreaded techniques. Recognizing that
for different applications, or even program phases, the
application bottlenecks may be different, it is
reasonable to assume that the more versatile a system
is, the more efficiently it will be able to execute the
given program. Toward this direction, we first show
that mixed execution models that combine TLS with
Helper Threads (HT), RunAhead execution (RA) and
MultiPath execution (MP) perform better than any of the
models alone. Based on a simple model that we propose,
we show that benefits come from being able to extract
additional ILP without harming the TLP extracted by
TLS. We then show that by combining all the execution
models in a unified one that combines all these
speculative multithreaded models, ILP can be further
enhanced with only minimal additional cost in
hardware.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sharafeddine:2012:DOE,
author = "Mageda Sharafeddine and Komal Jothi and Haitham
Akkary",
title = "Disjoint out-of-order execution processor",
journal = j-TACO,
volume = "9",
number = "3",
pages = "19:1--19:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355592",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "High-performance superscalar architectures used to
exploit instruction level parallelism in single-thread
applications have become too complex and power hungry
for the multicore processors era. We propose a new
architecture that uses multiple small latency-tolerant
out-of-order cores to improve single-thread
performance. Improving single-thread performance with
multiple small out-of-order cores allows designers to
place more of these cores on the same die.
Consequently, emerging highly parallel applications can
take full advantage of the multicore parallel hardware
without sacrificing performance of inherently serial
and hard to parallelize applications. Our architecture
combines speculative multithreading (SpMT) with
checkpoint recovery and continual flow pipeline
architectures. It splits single-thread program
execution into disjoint control and data threads that
execute concurrently on multiple cooperating small and
latency-tolerant out-of-order cores. Hence we call this
style of execution Disjoint Out-of-Order Execution
(DOE). DOE uses latency tolerance to overcome
performance issues of SpMT caused by interthread data
dependences. To evaluate this architecture, we have
developed a microarchitecture performance model of DOE
based on PTLSim, a simulation infrastructure of the x86
instruction set architecture. We evaluate the potential
performance of DOE processor architecture using a
simple heuristic to fork control independent threads in
hardware at the target addresses of future procedure
return instructions. Using applications from SpecInt
2000, we study DOE under ideal as well as realistic
architectural constraints. We discuss the performance
impact of key DOE architecture and application
variables such as number of cores, interthread data
dependences, intercore data communication delay,
buffers capacity, and branch mispredictions. Without
any DOE specific compiler optimizations, our results
show that DOE outperforms conventional SpMT
architectures by 15\%, on average. We also show that
DOE with four small cores can perform on average
equally well to a large superscalar core, consuming
about the same power. Most importantly, DOE improves
throughput performance by a significant amount over a
large superscalar core, up to 2.5 times, when running
multitasking applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Andrade:2012:SAW,
author = "Diego Andrade and Basilio B. Fraguela and Ram{\'o}n
Doallo",
title = "Static analysis of the worst-case memory performance
for irregular codes with indirections",
journal = j-TACO,
volume = "9",
number = "3",
pages = "20:1--20:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355593",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Real-time systems are subject to timing constraints,
whose upper bound is given by the Worst-Case Execution
Time (WCET). Cache memory behavior is difficult to
predict analytically and estimating a safe and precise
worst-case value is even more challenging. The
worst-case memory performance (WCMP) component of the
WCET can only be estimated with the precise knowledge
of the stream of data addresses accessed by the code,
which is determined by the access patterns and the base
addresses of the data structures accessed. The
regularity of strided access patterns simplifies their
analysis, as they are characterized by relatively few
parameters, which are often available at compile time.
Unfortunately codes may exhibit irregular access
patterns, which are much more difficult to statically
analyze. As for the base addresses of the data
structures, they are not always available at
compile-time for many reasons: stack variables,
dynamically allocated memory, modules compiled
separately, etc. This article addresses these problems
by presenting a model that predicts an \%safe and upper
bound of the data cache performance for codes both with
regular and irregular access patterns, which is valid
for any possible base addresses of the data structures.
The model analyzes irregular access patterns due to the
presence of indirections in the code and it can provide
two kinds of predictions: a safe hard boundary that is
suitable for hard real-time systems and a soft boundary
whose safeness is not guaranteed but which is valid
most of the times. In fact, in all our experiments the
number of misses was below the soft boundary predicted
by the model. This turns this soft boundary prediction
into a valuable tool, particularly for non and soft
real-time systems, which tolerate a percentage of the
runs exceeding their deadlines.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2012:DIO,
author = "Yang Chen and Shuangde Fang and Yuanjie Huang and
Lieven Eeckhout and Grigori Fursin and Olivier Temam
and Chengyong Wu",
title = "Deconstructing iterative optimization",
journal = j-TACO,
volume = "9",
number = "3",
pages = "21:1--21:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355594",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Iterative optimization is a popular compiler
optimization approach that has been studied extensively
over the past decade. In this article, we deconstruct
iterative optimization by evaluating whether it works
across datasets and by analyzing why it works. Up to
now, most iterative optimization studies are based on a
premise which was never truly evaluated: that it is
possible to learn the best compiler optimizations
across datasets. In this article, we evaluate this
question for the first time with a very large number of
datasets. We therefore compose KDataSets, a dataset
suite with 1000 datasets for 32 programs, which we
release to the public. We characterize the diversity of
KDataSets, and subsequently use it to evaluate
iterative optimization. For all 32 programs, we find
that there exists at least one combination of compiler
optimizations that achieves at least 83\% or more of
the best possible speedup across all datasets on two
widely used compilers (Intel's ICC and GNU's GCC). This
optimal combination is program-specific and yields
speedups up to 3.75$ \times $ (averaged across datasets
of a program) over the highest optimization level of
the compilers (-O3 for GCC and -fast for ICC). This
finding suggests that optimizing programs across
datasets might be much easier than previously
anticipated. In addition, we evaluate the idea of
introducing compiler choice as part of iterative
optimization. We find that it can further improve the
performance of iterative optimization because different
programs favor different compilers. We also investigate
why iterative optimization works by analyzing the
optimal combinations. We find that only a handful
optimizations yield most of the speedup. Finally, we
show that optimizations interact in a complex and
sometimes counterintuitive way through two case
studies, which confirms that iterative optimization is
an irreplaceable and important compiler strategy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Guha:2012:MOD,
author = "Apala Guha and Kim Hazelwood and Mary Lou Soffa",
title = "Memory optimization of dynamic binary translators for
embedded systems",
journal = j-TACO,
volume = "9",
number = "3",
pages = "22:1--22:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355595",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Dynamic binary translators (DBTs) are becoming
increasingly important because of their power and
flexibility. DBT-based services are valuable for all
types of platforms. However, the high memory demands of
DBTs present an obstacle for embedded systems. Most
research on DBT design has a performance focus, which
often drives up the DBT memory demand. In this article,
we present a memory-oriented approach to DBT design. We
consider the class of translation-based DBTs and their
sources of memory demand; cached translated code,
cached auxiliary code and DBT data structures. We
explore aspects of DBT design that impact these memory
demand sources and present strategies to mitigate
memory demand. We also explore performance
optimizations for DBTs that handle memory demand by
placing a limit on it, and repeatedly flush
translations to stay within the limit, thereby
replacing the memory demand problem with a performance
degradation problem. Our optimizations that mitigate
memory demand improve performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "22",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Geraci:2012:TFP,
author = "James R. Geraci and Sharon M. Sacco",
title = "A transpose-free in-place {SIMD} optimized {FFT}",
journal = j-TACO,
volume = "9",
number = "3",
pages = "23:1--23:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355596",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A transpose-free in-place SIMD optimized algorithm for
the computation of large FFTs is introduced and
implemented on the Cell Broadband Engine. Six different
FFT implementations of the algorithm using six
different data movement methods are described. Their
relative performance is compared for input sizes from $
2^{17} $ to $ 2^{21} $ complex floating point samples.
Large differences in performance are observed among
even theoretically equivalent data movement patterns.
All six implementations compare favorably with FFTW and
other previous FFT implementations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Coppens:2013:FDB,
author = "Bart Coppens and Bjorn {De Sutter} and Jonas Maebe",
title = "Feedback-driven binary code diversification to the
special issue on high-performance embedded
architectures and compilers",
journal = j-TACO,
volume = "9",
number = "4",
pages = "24:1--24:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400683",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As described in many blog posts and in the scientific
literature, exploits for software vulnerabilities are
often engineered on the basis of patches. For example,
``Microsoft Patch Tuesday'' is often followed by
``Exploit Wednesday'' during which yet unpatched
systems become vulnerable to patch-based exploits. Part
of the patch engineering includes the identification of
the vulnerable binary code by means of
reverse-engineering tools and diffing add-ons. In this
article we present a feedback-driven compiler tool flow
that iteratively transforms code until diffing tools
become ineffective enough to close the ``Exploit
Wednesday'' window of opportunity. We demonstrate the
tool's effectiveness on a set of real-world patches and
against the latest version of BinDiff.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fowers:2013:PEC,
author = "Jeremy Fowers and Greg Brown and John Wernsing and
Greg Stitt",
title = "A performance and energy comparison of convolution on
{GPUs}, {FPGAs}, and multicore processors",
journal = j-TACO,
volume = "9",
number = "4",
pages = "25:1--25:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400684",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Recent architectural trends have focused on increased
parallelism via multicore processors and increased
heterogeneity via accelerator devices (e.g.,
graphics-processing units, field-programmable gate
arrays). Although these architectures have significant
performance and energy potential, application designers
face many device-specific challenges when choosing an
appropriate accelerator or when customizing an
algorithm for an accelerator. To help address this
problem, in this article we thoroughly evaluate
convolution, one of the most common operations in
digital-signal processing, on multicores,
graphics-processing units, and field-programmable gate
arrays. Whereas many previous application studies
evaluate a specific usage of an application, this
article assists designers with design space exploration
for numerous use cases by analyzing effects of
different input sizes, different algorithms, and
different devices, while also determining
Pareto-optimal trade-offs between performance and
energy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "25",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Rohou:2013:VTI,
author = "Erven Rohou and Kevin Williams and David Yuste",
title = "Vectorization technology to improve interpreter
performance",
journal = j-TACO,
volume = "9",
number = "4",
pages = "26:1--26:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400685",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In the present computing landscape, interpreters are
in use in a wide range of systems. Recent trends in
consumer electronics have created a new category of
portable, lightweight software applications. Typically,
these applications have fast development cycles and
short life spans. They run on a wide range of systems
and are deployed in a target independent bytecode
format over Internet and cellular networks. Their
authors are untrusted third-party vendors, and they are
executed in secure managed runtimes or virtual
machines. Furthermore, due to security policies or
development time constraints, these virtual machines
often lack just-in-time compilers and rely on
interpreted execution. At the other end of the
spectrum, interpreters are also a reality in the field
of high-performance computations because of the
flexibility they provide. The main performance penalty
in interpreters arises from instruction dispatch. Each
bytecode requires a minimum number of machine
instructions to be executed. In this work, we introduce
a novel approach for interpreter optimization that
reduces instruction dispatch thanks to vectorization
technology. We extend the split compilation paradigm to
interpreters, thus guaranteeing that our approach
exhibits almost no overhead at runtime. We take
advantage of the vast research in vectorization and its
presence in modern compilers. Complex analyses are
performed ahead of time, and their results are conveyed
to the executable bytecode. At runtime, the interpreter
retrieves this additional information to build the SIMD
IR (intermediate representation) instructions that
carry the vector semantics. The bytecode language
remains unmodified, making this representation
compatible with legacy interpreters and previously
proposed JIT compilers. We show that this approach
drastically reduces the number of instructions to
interpret and decreases execution time of vectorizable
applications. Moreover, we map SIMD IR instructions to
hardware SIMD instructions when available, with a
substantial additional improvement. Finally, we finely
analyze the impact of our extension on the behavior of
the caches and branch predictors.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "26",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cleary:2013:FAT,
author = "Jimmy Cleary and Owen Callanan and Mark Purcell and
David Gregg",
title = "Fast asymmetric thread synchronization",
journal = j-TACO,
volume = "9",
number = "4",
pages = "27:1--27:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400686",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "For most multi-threaded applications, data structures
must be shared between threads. Ensuring thread safety
on these data structures incurs overhead in the form of
locking and other synchronization mechanisms. Where
data is shared among multiple threads these costs are
unavoidable. However, a common access pattern is that
data is accessed primarily by one dominant thread, and
only very rarely by the other, non-dominant threads.
Previous research has proposed biased locks, which are
optimized for a single dominant thread, at the cost of
greater overheads for non-dominant threads. In this
article we propose a new family of biased
synchronization mechanisms that, using a modified
interface, push accesses to shared data from the
non-dominant threads to the dominant one, via a novel
set of message passing mechanisms. We present
mechanisms for protecting critical sections, for
queueing work, for caching shared data in registers
where it is safe to do so, and for asynchronous
critical section accesses. We present results for the
conventional Intel\reg{} Sandy Bridge processor and for
the emerging network-optimized many-core IBM\reg{}
PowerENTM processor. We find that our algorithms
compete well with existing biased locking algorithms,
and, in particular, perform better than existing
algorithms as accesses from non-dominant threads
increase.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2013:PTL,
author = "Yong Li and Rami Melhem and Alex K. Jones",
title = "{PS-TLB}: Leveraging page classification information
for fast, scalable and efficient translation for future
{CMPs}",
journal = j-TACO,
volume = "9",
number = "4",
pages = "28:1--28:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400687",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Traversing the page table during virtual to physical
address translation causes pipeline stalls when misses
occur in the translation-lookaside buffer (TLB).
State-of-the-art translation proposals typically
optimize a single aspect of translation performance
(e.g., translation sharing, context switch performance,
etc.) with potential trade-offs of additional hardware
complexity, increased translation latency, or reduced
scalability. In this article, we propose the partial
sharing TLB (PS-TLB), a fast and scalable solution that
reduces off-chip translation misses without sacrificing
the timing-critical requirement of on-chip translation.
We introduce the partial sharing buffer (PSB) which
leverages application page sharing characteristics
using minimal additional hardware resources. Compared
to the leading TLB proposal that leverages sharing,
PS-TLB provides a more than 45\% improvement in
translation latency with a 9\% application speedup
while using fewer storage resources. In addition, the
page classification and PS-TLB architecture provide
further optimizations including an over 30\% reduction
of interprocessor interrupts for coherence, and reduced
context switch misses with fewer resources compared
with existing methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "28",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{DuBois:2013:PTC,
author = "Kristof {Du Bois} and Stijn Eyerman and Lieven
Eeckhout",
title = "Per-thread cycle accounting in multicore processors",
journal = j-TACO,
volume = "9",
number = "4",
pages = "29:1--29:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400688",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "While multicore processors improve overall chip
throughput and hardware utilization, resource sharing
among the cores leads to unpredictable performance for
the individual threads running on a multicore
processor. Unpredictable per-thread performance becomes
a problem when considered in the context of multicore
scheduling: system software assumes that all threads
make equal progress, however, this is not what the
hardware provides. This may lead to problems at the
system level such as missed deadlines, reduced
quality-of-service, non-satisfied service-level
agreements, unbalanced parallel performance, priority
inversion, unpredictable interactive performance, etc.
This article proposes a hardware-efficient per-thread
cycle accounting architecture for multicore processors.
The counter architecture tracks per-thread progress in
a multicore processor, detects how inter-thread
interference affects per-thread performance, and
predicts the execution time for each thread if run in
isolation. The counter architecture captures the
effects of additional conflict misses due to cache
sharing as well as increased latency for other memory
accesses due to resource and bandwidth contention in
the memory subsystem. The proposed method accounts for
74.3\% of the interference cycles, and estimates
per-thread progress within 14.2\% on average across a
large set of multi-program workloads. Hardware cost is
limited to 7.44KB for an 8-core processor, a reduction
by almost $ 10 \times $ compared to prior work while
being 63.8\% more accurate. Making system software
progress aware improves fairness by 22.5\% on average
over progress-agnostic scheduling.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "29",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wimmer:2013:MAV,
author = "Christian Wimmer and Michael Haupt and Michael L. {Van
De Vanter} and Mick Jordan and Laurent Dayn{\`e}s and
Douglas Simon",
title = "{Maxine}: an approachable virtual machine for, and in,
{Java}",
journal = j-TACO,
volume = "9",
number = "4",
pages = "30:1--30:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400689",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A highly productive platform accelerates the
production of research results. The design of a Virtual
Machine (VM) written in the Java{\TM} programming
language can be simplified through exploitation of
interfaces, type and memory safety, automated memory
management (garbage collection), exception handling,
and reflection. Moreover, modern Java IDEs offer
time-saving features such as refactoring,
auto-completion, and code navigation. Finally, Java
annotations enable compiler extensions for low-level
``systems programming'' while retaining IDE
compatibility. These techniques collectively make
complex system software more ``approachable'' than has
been typical in the past. The Maxine VM, a metacircular
Java VM implementation, has aggressively used these
features since its inception. A co-designed companion
tool, the Maxine Inspector, offers integrated debugging
and visualization of all aspects of the VM's runtime
state. The Inspector's implementation exploits advanced
Java language features, embodies intimate knowledge of
the VM's design, and even reuses a significant amount
of VM code directly. These characteristics make Maxine
a highly approachable VM research platform and a
productive basis for research and teaching.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "30",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Khan:2013:SBA,
author = "Malik Khan and Protonu Basu and Gabe Rudy and Mary
Hall and Chun Chen and Jacqueline Chame",
title = "A script-based autotuning compiler system to generate
high-performance {CUDA} code",
journal = j-TACO,
volume = "9",
number = "4",
pages = "31:1--31:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400690",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article presents a novel compiler framework for
CUDA code generation. The compiler structure is
designed to support autotuning, which employs empirical
techniques to evaluate a set of alternative mappings of
computation kernels and select the mapping that obtains
the best performance. This article introduces a
Transformation Strategy Generator, a meta-optimizer
that generates a set of transformation recipes, which
are descriptions of the mapping of the sequential code
to parallel CUDA code. These recipes comprise a search
space of possible implementations. This system achieves
performance comparable and sometimes better than
manually tuned libraries and exceeds the performance of
a state-of-the-art GPU compiler.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "31",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{VanCraeynest:2013:UFD,
author = "Kenzo {Van Craeynest} and Lieven Eeckhout",
title = "Understanding fundamental design choices in
single-{ISA} heterogeneous multicore architectures",
journal = j-TACO,
volume = "9",
number = "4",
pages = "32:1--32:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400691",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Single-ISA heterogeneous multicore processors have
gained substantial interest over the past few years
because of their power efficiency, as they offer the
potential for high overall chip throughput within a
given power budget. Prior work in heterogeneous
architectures has mainly focused on how heterogeneity
can improve overall system throughput. To what extent
heterogeneity affects per-program performance has
remained largely unanswered. In this article, we aim at
understanding how heterogeneity affects both chip
throughput and per-program performance; how
heterogeneous architectures compare to homogeneous
architectures under both performance metrics; and how
fundamental design choices, such as core type, cache
size, and off-chip bandwidth, affect performance. We
use analytical modeling to explore a large space of
single-ISA heterogeneous architectures. The analytical
model has linear-time complexity in the number of core
types and programs of interest, and offers a unique
opportunity for exploring the large space of both
homogeneous and heterogeneous multicore processors in
limited time. Our analysis provides several interesting
insights: While it is true that heterogeneity can
improve system throughput, it fundamentally trades
per-program performance for chip throughput; although
some heterogeneous configurations yield better
throughput and per-program performance than homogeneous
designs, some homogeneous configurations are optimal
for particular throughput versus per-program
performance trade-offs. Two core types provide most of
the benefits from heterogeneity and a larger number of
core types does not contribute much; job-to-core
mapping is both important and challenging for
heterogeneous multicore processors to achieve optimum
performance. Limited off-chip bandwidth does alter some
of the fundamental design choices in heterogeneous
multicore architectures, such as the need for large
on-chip caches for achieving high throughput, and
per-program performance degrading more relative to
throughput under constrained off-chip bandwidth.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "32",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Antao:2013:CFA,
author = "Samuel Ant{\~a}o and Leonel Sousa",
title = "The {CRNS} framework and its application to
programmable and reconfigurable cryptography",
journal = j-TACO,
volume = "9",
number = "4",
pages = "33:1--33:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400692",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article proposes the Computing with the
ResidueNumber System (CRNS) framework, which aims at
the design automation of accelerators for Modular
Arithmetic (MA). The framework provides a comprehensive
set of tools ranging from a programming language and
respective compiler to back-ends targeting parallel
computation platforms such as Graphical Processing
Units (GPUs) and reconfigurable hardware. Given an
input algorithm described with a high-level programming
language, the CRNS can be used to obtain in a few
seconds the corresponding optimized Parallel Thread
Execution (PTX) program ready to be run on GPUs or the
Hardware Description Language (HDL) specification of a
fully functional accelerator suitable for
reconfigurable hardware and embedded systems. The
resulting framework's implementations benefit from the
Residue Number System (RNS) arithmetic's
parallelization properties in a fully automated way.
Designers do not need to be familiar with the
mathematical details concerning the employed
arithmetic, namely the RNS representation. In order to
thoroughly describe and evaluate the proposed
framework, experimental results obtained for the
supported back-ends (GPU and HDL) are presented
targeting the implementation of the modular
exponentiation used in the Rivest-Shamir-Adleman (RSA)
algorithm and Elliptic Curve (EC) point multiplication.
Results suggest competitive latency and throughput with
minimum design effort and overcoming all the
development issues that arise in the specification and
verification of dedicated solutions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "33",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Diouf:2013:DLM,
author = "Boubacar Diouf and Can Hantas and Albert Cohen and
{\"O}zcan {\"O}zturk and Jens Palsberg",
title = "A decoupled local memory allocator",
journal = j-TACO,
volume = "9",
number = "4",
pages = "34:1--34:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400693",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Compilers use software-controlled local memories to
provide fast, predictable, and power-efficient access
to critical data. We show that the local memory
allocation for straight-line, or linearized programs is
equivalent to a weighted interval-graph coloring
problem. This problem is new when allowing a color
interval to ``wrap around,'' and we call it the
submarine-building problem. This graph-theoretical
decision problem differs slightly from the classical
ship-building problem, and exhibits very interesting
and unusual complexity properties. We demonstrate that
the submarine-building problem is NP-complete, while it
is solvable in linear time for not-so-proper interval
graphs, an extension of the class of proper interval
graphs. We propose a clustering heuristic to
approximate any interval graph into a not-so-proper
interval graph, decoupling spill code generation from
local memory assignment. We apply this heuristic to a
large number of randomly generated interval graphs
reproducing the statistical features of standard local
memory allocation benchmarks, comparing with
state-of-the-art heuristics.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "34",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cui:2013:LOC,
author = "Huimin Cui and Qing Yi and Jingling Xue and Xiaobing
Feng",
title = "Layout-oblivious compiler optimization for matrix
computations",
journal = j-TACO,
volume = "9",
number = "4",
pages = "35:1--35:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400694",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Most scientific computations serve to apply
mathematical operations to a set of preconceived data
structures, e.g., matrices, vectors, and grids. In this
article, we use a number of widely used matrix
computations from the LINPACK library to demonstrate
that complex internal organizations of data structures
can severely degrade the effectiveness of compiler
optimizations. We then present a data-layout-oblivious
optimization methodology, where by isolating an
abstract representation of the computations from
complex implementation details of their data, we enable
these computations to be much more accurately analyzed
and optimized through varying state-of-the-art compiler
technologies. We evaluated our approach on an Intel
8-core platform using two source-to-source compiler
infrastructures, Pluto and EPOD. Our results show that
while the efficiency of a computational kernel differs
when using different data layouts, the alternative
implementations typically benefit from a common set of
optimizations on the operations. Therefore separately
optimizing the operations and the data layout of a
computation could dramatically enhance the
effectiveness of compiler optimizations compared with
the conventional approaches of using a unified
representation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "35",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Dolan:2013:CSL,
author = "Stephen Dolan and Servesh Muralidharan and David
Gregg",
title = "Compiler support for lightweight context switching",
journal = j-TACO,
volume = "9",
number = "4",
pages = "36:1--36:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400695",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We propose a new language-neutral primitive for the
LLVM compiler, which provides efficient context
switching and message passing between lightweight
threads of control. The primitive, called Swapstack,
can be used by any language implementation based on
LLVM to build higher-level language structures such as
continuations, coroutines, and lightweight threads. As
part of adding the primitives to LLVM, we have also
added compiler support for passing parameters across
context switches. Our modified LLVM compiler produces
highly efficient code through a combination of exposing
the context switching code to existing compiler
optimizations, and adding novel compiler optimizations
to further reduce the cost of context switches. To
demonstrate the generality and efficiency of our
primitives, we add one-shot continuations to C++, and
provide a simple fiber library that allows millions of
fibers to run on multiple cores, with a work-stealing
scheduler and fast inter-fiber sychronization. We argue
that compiler-supported lightweight context switching
can be significantly faster than using a library to
switch between contexts, and provide experimental
evidence to support the position.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "36",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Abad:2013:LLE,
author = "Pablo Abad and Valentin Puente and Jose-Angel
Gregorio",
title = "{LIGERO}: a light but efficient router conceived for
cache-coherent chip multiprocessors",
journal = j-TACO,
volume = "9",
number = "4",
pages = "37:1--37:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400696",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Although abstraction is the best approach to deal with
computing system complexity, sometimes implementation
details should be considered. Considering on-chip
interconnection networks in particular, underestimating
the underlying system specificity could have
nonnegligible impact on performance, cost, or
correctness. This article presents a very efficient
router that has been devised to deal with
cache-coherent chip multiprocessor particularities in a
balanced way. Employing the same principles of packet
rotation structures as in the rotary router, we present
a router configuration with the following novel
features: (1) reduced buffering requirements, (2)
optimized pipeline under contentionless conditions, (3)
more efficient deadlock avoidance mechanism, and (4)
optimized in-order delivery guarantee. Putting it all
together, our proposal provides a set of features that
no other router, to the best of our knowledge, has
achieved previously. These are: (1') low implementation
cost, (2') low pass-through latency under low load,
(3') improved resource utilization through adaptive
routing and a buffering scheme free of head-of-line
blocking, (4') guarantee of coherence protocol
correctness via end-to-end deadlock avoidance and
in-order delivery, and (5') improvement of coherence
protocol responsiveness through adaptive in-network
multicast support. We conduct a thorough evaluation
that includes hardware cost estimation and performance
evaluation under a wide spectrum of realistic workloads
and coherence protocols. Comparing our proposal with
VCTM, an optimized state-of-the-art wormhole router, it
requires 50\% less area, reduces on-chip cache
hierarchy energy delay product on average by 20\%, and
improves the cache-coherency chip multiprocessor
performance under realistic working conditions by up to
20\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "37",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Albericio:2013:ERL,
author = "Jorge Albericio and Pablo Ib{\'a}{\~n}ez and
V{\'\i}ctor Vi{\~n}als and Jose Mar{\'\i}a
Llaber{\'\i}a",
title = "Exploiting reuse locality on inclusive shared
last-level caches",
journal = j-TACO,
volume = "9",
number = "4",
pages = "38:1--38:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400697",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Optimization of the replacement policy used for Shared
Last-Level Cache (SLLC) management in a
Chip-MultiProcessor (CMP) is critical for avoiding
off-chip accesses. Temporal locality, while being
exploited by first levels of private cache memories, is
only slightly exhibited by the stream of references
arriving at the SLLC. Thus, traditional replacement
algorithms based on recency are bad choices for
governing SLLC replacement. Recent proposals involve
SLLC replacement policies that attempt to exploit reuse
either by segmenting the replacement list or improving
the rereference interval prediction. On the other hand,
inclusive SLLCs are commonplace in the CMP market, but
the interaction between replacement policy and the
enforcement of inclusion has barely been discussed.
After analyzing that interaction, this article
introduces two simple replacement policies exploiting
reuse locality and targeting inclusive SLLCs: Least
Recently Reused (LRR) and Not Recently Reused (NRR).
NRR has the same implementation cost as NRU, and LRR
only adds one bit per line to the LRU cost. After
considering reuse locality and its interaction with the
invalidations induced by inclusion, the proposals are
evaluated by simulating multiprogrammed workloads in an
8-core system with two private cache levels and an
SLLC. LRR outperforms LRU by 4.5\% (performing better
in 97 out of 100 mixes) and NRR outperforms NRU by
4.2\% (performing better in 99 out of 100 mixes). We
also show that our mechanisms outperform rereference
interval prediction, a recently proposed SLLC
replacement policy and that similar conclusions can be
drawn by varying the associativity or the SLLC size.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "38",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yiapanis:2013:OSR,
author = "Paraskevas Yiapanis and Demian Rosas-Ham and Gavin
Brown and Mikel Luj{\'a}n",
title = "Optimizing software runtime systems for speculative
parallelization",
journal = j-TACO,
volume = "9",
number = "4",
pages = "39:1--39:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400698",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Thread-Level Speculation (TLS) overcomes limitations
intrinsic with conservative compile-time
auto-parallelizing tools by extracting parallel threads
optimistically and only ensuring absence of data
dependence violations at runtime. A significant barrier
for adopting TLS (implemented in software) is the
overheads associated with maintaining speculative
state. Based on previous TLS limit studies, we observe
that on future multicore systems we will likely have
more cores idle than those which traditional TLS would
be able to harness. This implies that a TLS system
should focus on optimizing for small number of cores
and find efficient ways to take advantage of the idle
cores. Furthermore, research on optimistic systems has
covered two important implementation design points:
eager vs. lazy version management. With this knowledge,
we propose new simple and effective techniques to
reduce the execution time overheads for both of these
design points. This article describes a novel compact
version management data structure optimized for space
overhead when using a small number of TLS threads.
Furthermore, we describe two novel software runtime
parallelization systems that utilize this compact data
structure. The first software TLS system, MiniTLS,
relies on eager memory data management (in-place
updates) and, thus, when a misspeculation occurs a
rollback process is required. MiniTLS takes advantage
of the novel compact version management representation
to parallelize the rollback process and is able to
recover from misspeculation faster than existing
software eager TLS systems. The second one, Lector
(Lazy inspECTOR) is based on lazy version management.
Since we have idle cores, the question is whether we
can create ``helper'' tasks to determine whether
speculation is actually needed without stopping or
damaging the speculative execution. In Lector, for each
conventional TLS thread running speculatively with lazy
version management, there is associated with it a
lightweight inspector. The inspector threads execute
alongside to verify quickly whether data dependencies
will occur. Inspector threads are generated by standard
techniques for inspector/executor parallelization. We
have applied both TLS systems to seven Java sequential
benchmarks, including three benchmarks from
SPECjvm2008. Two out of the seven benchmarks exhibit
misspeculations. MiniTLS experiments report average
speedups of 1.8x for 4 threads increasing close to 7x
speedups with 32 threads. Facilitated by our novel
compact representation, MiniTLS reduces the space
overhead over state-of-the-art software TLS systems
between 96\% on 2 threads and 40\% on 32 threads. The
experiments for Lector, report average speedups of 1.7x
for 2 threads (that is 1 TLS + 1 Inspector threads)
increasing close to 8.2x speedups with 32 threads (16 +
16 threads). Compared to a well established software
TLS baseline, Lector performs on average 1.7x faster
for 32 threads and in no case ( x TLS + x Inspector
threads) Lector delivers worse performance than the
baseline TLS with the equivalent number of TLS threads
(i.e. x TLS threads) nor doubling the equivalent number
of TLS threads (i.e., x + x TLS threads).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Nugteren:2013:ASC,
author = "Cedric Nugteren and Pieter Custers and Henk
Corporaal",
title = "Algorithmic species: a classification of affine loop
nests for parallel programming",
journal = j-TACO,
volume = "9",
number = "4",
pages = "40:1--40:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400699",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Code generation and programming have become ever more
challenging over the last decade due to the shift
towards parallel processing. Emerging processor
architectures such as multi-cores and GPUs exploit
increasingly parallelism, requiring programmers and
compilers to deal with aspects such as threading,
concurrency, synchronization, and complex memory
partitioning. We advocate that programmers and
compilers can greatly benefit from a structured
classification of program code. Such a classification
can help programmers to find opportunities for
parallelization, reason about their code, and interact
with other programmers. Similarly, parallelising
compilers and source-to-source compilers can take
threading and optimization decisions based on the same
classification. In this work, we introduce algorithmic
species, a classification of affine loop nests based on
the polyhedral model and targeted for both automatic
and manual use. Individual classes capture information
such as the structure of parallelism and the data
reuse. To make the classification applicable for manual
use, a basic vocabulary forms the base for the creation
of a set of intuitive classes. To demonstrate the use
of algorithmic species, we identify 115 classes in a
benchmark set. Additionally, we demonstrate the
suitability of algorithmic species for automated uses
by showing a tool to automatically extract species from
program code, a species-based source-to-source
compiler, and a species-based performance prediction
model.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "40",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gerards:2013:ODD,
author = "Marco E. T. Gerards and Jan Kuper",
title = "Optimal {DPM} and {DVFS} for frame-based real-time
systems",
journal = j-TACO,
volume = "9",
number = "4",
pages = "41:1--41:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400700",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Dynamic Power Management (DPM) and Dynamic Voltage and
Frequency Scaling (DVFS) are popular techniques for
reducing energy consumption. Algorithms for optimal
DVFS exist, but optimal DPM and the optimal combination
of DVFS and DPM are not yet solved. In this article we
use well-established models of DPM and DVFS for
frame-based systems. We show that it is not
sufficient-as some authors argue-to consider only
individual invocations of a task. We define a schedule
that also takes interactions between invocations into
account and prove-in a theoretical fashion-that this
schedule is optimal.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "41",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yan:2013:IPA,
author = "Zhichao Yan and Hong Jiang and Yujuan Tan and Dan
Feng",
title = "An integrated pseudo-associativity and relaxed-order
approach to hardware transactional memory",
journal = j-TACO,
volume = "9",
number = "4",
pages = "42:1--42:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400701",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Our experimental study and analysis reveal that the
bottlenecks of existing hardware transactional memory
systems are largely rooted in the extra data movements
in version management and in the inefficient scheduling
of conflicting transactions in conflict management,
particularly in the presence of high-contention and
coarse-grained applications. In order to address this
problem, we propose an integrated Pseudo-Associativity
and Relaxed-Order approach to hardware Transactional
Memory, called PARO-TM. It exploits the extra
pseudo-associative space in the data cache to hold the
new value of each transactional modification, and
maintains the mappings between the old and new versions
via an implicit pseudo-associative hash algorithm
(i.e., by inverting the specific bit of the SET index).
PARO-TM can branch out the speculative version from the
old version upon each transactional modification on
demand without a dedicated hardware component to hold
the uncommitted data. This means that it is able to
automatically access the proper version upon the
transaction's commit or abort. Moreover, PARO-TM
augments multi-version support in a chained directory
to schedule conflicting transactions in a relaxed-order
manner to further reduce their overheads. We compare
PARO-TM with the state-of-the-art LogTM-SE, TCC, DynTM,
and SUV-TM systems and find that PARO-TM consistently
outperforms these four representative HTMs. This
performance advantage of PARO-TM is far more pronounced
under the high-contention and coarse-grained
applications in the STAMP benchmark suite, for which
PARO-TM is motivated and designed.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "42",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2013:PGF,
author = "Doris Chen and Deshanand Singh",
title = "Profile-guided floating- to fixed-point conversion for
hybrid {FPGA}-processor applications",
journal = j-TACO,
volume = "9",
number = "4",
pages = "43:1--43:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400702",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The key to enabling widespread use of FPGAs for
algorithm acceleration is to allow programmers to
create efficient designs without the time-consuming
hardware design process. Programmers are used to
developing scientific and mathematical algorithms in
high-level languages (C/C++) using floating point data
types. Although easy to implement, the dynamic range
provided by floating point is not necessary in many
applications; more efficient implementations can be
realized using fixed point arithmetic. While this topic
has been studied previously [Han et al. 2006; Olson et
al. 1999; Gaffar et al. 2004; Aamodt and Chow 1999],
the degree of full automation has always been lacking.
We present a novel design flow for cases where FPGAs
are used to offload computations from a microprocessor.
Our LLVM-based algorithm inserts value profiling code
into an unmodified C/C++ application to guide its
automatic conversion to fixed point. This allows for
fast and accurate design space exploration on a host
microprocessor before any accelerators are mapped to
the FPGA. Through experimental results, we demonstrate
that fixed-point conversion can yield resource savings
of up to 2x--3x reductions. Embedded RAM usage is
minimized, and 13\%--22\% higher $ F_{\rm max} $ than
the original floating-point implementation is observed.
In a case study, we show that 17\% reduction in logic
and 24\% reduction in register usage can be realized by
using our algorithm in conjunction with a High-Level
Synthesis (HLS) tool.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "43",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cui:2013:LCA,
author = "Yan Cui and Yingxin Wang and Yu Chen and Yuanchun
Shi",
title = "Lock-contention-aware scheduler: a scalable and
energy-efficient method for addressing scalability
collapse on multicore systems",
journal = j-TACO,
volume = "9",
number = "4",
pages = "44:1--44:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400703",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In response to the increasing ubiquity of multicore
processors, there has been widespread development of
multithreaded applications that strive to realize their
full potential. Unfortunately, lock contention within
operating systems can limit the scalability of
multicore systems so severely that an increase in the
number of cores can actually lead to reduced
performance (i.e., scalability collapse). Existing
efforts of solving scalability collapse mainly focus on
making critical sections of kernel code fine-grained or
designing new synchronization primitives. However,
these methods have disadvantages in scalability or
energy efficiency. In this article, we observe that the
percentage of lock-waiting time over the total
execution time for a lock intensive task has a
significant correlation with the occurrence of
scalability collapse. Based on this observation, a
lock-contention-aware scheduler is proposed.
Specifically, each task in the scheduler monitors its
percentage of lock waiting time continuously. If the
percentage exceeds a predefined threshold, this task is
considered as lock intensive and migrated to a Special
Set of Cores (i.e., SSC). In this way, the number of
concurrently running lock-intensive tasks is limited to
the number of cores in the SSC, and therefore, the
degree of lock contention is controlled. A central
challenge of using this scheme is how many cores should
be allocated in the SSC to handle lock-intensive tasks.
In our scheduler, the optimal number of cores is
determined online by the model-driven search. The
proposed scheduler is implemented in the recent Linux
kernel and evaluated using micro- and macrobenchmarks
on AMD and Intel 32-core systems. Experimental results
suggest that our proposal is able to remove scalability
collapse completely and sustains the maximal throughput
of the spin-lock-based system for most applications.
Furthermore, the percentage of lock-waiting time can be
reduced by up to 84\%. When compared with scalability
collapse reduction methods such as requester-based
locking scheme and sleeping-based synchronization
primitives, our scheme exhibits significant advantages
in scalability, power consumption, and energy
efficiency.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "44",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pusukuri:2013:AFC,
author = "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N.
Bhuyan",
title = "{ADAPT}: a framework for coscheduling multithreaded
programs",
journal = j-TACO,
volume = "9",
number = "4",
pages = "45:1--45:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400704",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Since multicore systems offer greater performance via
parallelism, future computing is progressing towards
use of multicore machines with large number of cores.
However, the performance of emerging multithreaded
programs often does not scale to fully utilize the
available cores. Therefore, simultaneously running
multiple multithreaded applications becomes inevitable
to fully exploit the computing potential of such
machines. However, maximizing the performance and
throughput on multicore machines in the presence of
multiple multithreaded programs is a challenge for the
OS. We have observed that the state-of-the-art
contention management algorithms fail to effectively
coschedule multithreaded programs on multicore
machines. To address the above challenge, we present
ADAPT, a scheduling framework that continuously
monitors the resource usage of multithreaded programs
and adaptively coschedules them such that they
interfere with each other's performance as little as
possible. In addition, ADAPT selects appropriate memory
allocation and scheduling policies according to the
workload characteristics. We have implemented ADAPT on
a 64-core Supermicro server running Solaris 11 and
evaluated it using 26 multithreaded programs including
the TATP database application, SPECjbb2005, and
programs from Phoenix, PARSEC, and SPEC OMP suites. The
experimental results show that ADAPT substantially
improves total turnaround time and system utilization
relative to the default Solaris 11 scheduler.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "45",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tartara:2013:CLC,
author = "Michele Tartara and Stefano Crespi Reghizzi",
title = "Continuous learning of compiler heuristics",
journal = j-TACO,
volume = "9",
number = "4",
pages = "46:1--46:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400705",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Optimizing programs to exploit the underlying hardware
architecture is an important task. Much research has
been done on enabling compilers to find the best set of
code optimizations that can build the fastest and less
resource-hungry executable for a given program. A
common approach is iterative compilation, sometimes
enriched by machine learning techniques. This provides
good results, but requires extremely long compilation
times and an initial training phase lasting even for
days or weeks. We present long-term learning, a new
algorithm that allows the compiler user to improve the
performance of compiled programs with reduced
compilation times with respect to iterative
compilation, and without an initial training phase. Our
algorithm does not just build good programs: it
acquires knowledge every time a program is compiled and
it uses such knowledge to learn compiler heuristics,
without the need for an expert to manually define them.
The heuristics are evolved during every compilation, by
evaluating their effect on the generated programs. We
present implementations of long-term learning on top of
two different compilers, and experimental data gathered
on multiple hardware configurations showing its
effectiveness.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "46",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chrysos:2013:HCP,
author = "Grigorios Chrysos and Panagiotis Dagritzikos and
Ioannis Papaefstathiou and Apostolos Dollas",
title = "{HC-CART}: a parallel system implementation of data
mining classification and regression tree {(CART)}
algorithm on a multi-{FPGA} system",
journal = j-TACO,
volume = "9",
number = "4",
pages = "47:1--47:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400706",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Data mining is a new field of computer science with a
wide range of applications. Its goal is to extract
knowledge from massive datasets in a
human-understandable structure, for example, the
decision trees. In this article we present an
innovative, high-performance, system-level architecture
for the Classification And Regression Tree (CART)
algorithm, one of the most important and widely used
algorithms in the data mining area. Our proposed
architecture exploits parallelism at the decision
variable level, and was fully implemented and evaluated
on a modern high-performance reconfigurable platform,
the Convey HC-1 server, that features four FPGAs and a
multicore processor. Our FPGA-based implementation was
integrated with the widely used ``rpart'' software
library of the R project in order to provide the first
fully functional reconfigurable system that can handle
real-world large databases. The proposed system, named
HC-CART system, achieves a performance speedup of up to
two orders of magnitude compared to well-known
single-threaded data mining software platforms, such as
WEKA and the R platform. It also outperforms similar
hardware systems which implement parts of the complete
application by an order of magnitude. Finally, we show
that the HC-CART system offers higher performance
speedup than some other proposed parallel software
implementations of decision tree construction
algorithms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "47",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lee:2013:DCD,
author = "Jongwon Lee and Yohan Ko and Kyoungwoo Lee and Jonghee
M. Youn and Yunheung Paek",
title = "Dynamic code duplication with vulnerability awareness
for soft error detection on {VLIW} architectures",
journal = j-TACO,
volume = "9",
number = "4",
pages = "48:1--48:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400707",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Soft errors are becoming a critical concern in
embedded system designs. Code duplication techniques
have been proposed to increase the reliability in
multi-issue embedded systems such as VLIW by exploiting
empty slots for duplicated instructions. However, they
increase code size, another important concern, and
ignore vulnerability differences in instructions,
causing unnecessary or inefficient protection when
selecting instructions to be duplicated under
constraints. In this article, we propose a
compiler-assisted dynamic code duplication method to
minimize the code size overhead, and present
vulnerability-aware duplication algorithms to maximize
the effectiveness of instruction duplication with least
overheads for VLIW architecture. Our experimental
results with SoarGen and Synopsys simulation
environments demonstrate that our proposals can reduce
the code size by up to 40\% and detect more soft errors
by up to 82\% via fault injection experiments over
benchmarks from DSPstone and Livermore Loops as
compared to the previously proposed instruction
duplication technique.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "48",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Coelho:2013:ACI,
author = "Fabien Coelho and Fran{\c{c}}ois Irigoin",
title = "{API} compilation for image hardware accelerators",
journal = j-TACO,
volume = "9",
number = "4",
pages = "49:1--49:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400708",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We present an API-based compilation strategy to
optimize image applications, developed using a
high-level image processing library, onto three
different image processing hardware accelerators. We
demonstrate that such a strategy is profitable for both
development cost and overall performance, especially as
it takes advantage of optimization opportunities across
library calls otherwise beyond reach. The library API
provides the semantics of the image computations. The
three image accelerator targets are quite distinct: the
first one uses a vector architecture; the second one
presents an SIMD architecture; the last one runs both
on GPGPU and multicores through OpenCL. We have adapted
standard compilation techniques to perform these
compilation and code generation tasks automatically.
Our strategy is implemented in PIPS, a source-to-source
compiler which greatly reduces the development cost as
standard phases are reused and parameterized. We
carried out experiments with applications on hardware
functional simulators and GPUs. Our contributions
include: (1) a general low-cost compilation strategy
for image processing applications, based on the
semantics provided by library calls, which improves
locality by an order of magnitude; (2) specific
heuristics to minimize execution time on the target
accelerators; (3) numerous experiments that show the
effectiveness of our strategies. We also discuss the
conditions required to extend this approach to other
application domains.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "49",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Luque:2013:FCT,
author = "Carlos Luque and Miquel Moreto and Francisco J.
Cazorla and Mateo Valero",
title = "Fair {CPU} time accounting in {CMP+SMT} processors",
journal = j-TACO,
volume = "9",
number = "4",
pages = "50:1--50:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400709",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Processor architectures combining several paradigms of
Thread-Level Parallelism (TLP), such as CMP processors
in which each core is SMT, are becoming more and more
popular as a way to improve performance at a moderate
cost. However, the complex interaction between running
tasks in hardware shared resources in multi-TLP
architectures introduces complexities when accounting
CPU time (or CPU utilization) to tasks. The CPU
utilization accounted to a task depends on both the
time it runs in the processor and the amount of
processor hardware resources it receives. Deploying
systems with accurate CPU accounting mechanisms is
necessary to increase fairness. Moreover, it will allow
users to be fairly charged on a shared data center,
facilitating server consolidation in future systems. In
this article we analyze the accuracy and hardware cost
of previous CPU accounting mechanisms for pure-CMP and
pure-SMT processors and we show that they are not
adequate for CMP+SMT processors. Consequently, we
propose a new accounting mechanism for CMP+SMT
processors which: (1) increases the accuracy of
accounted CPU utilization; (2) provides much more
stable results over a wide range of processor setups;
and (3) does not require tracking all hardware shared
resources, significantly reducing its implementation
cost. In particular, previous proposals lead to
inaccuracies between 21\% and 79\% when measuring CPU
utilization in an 8-core 2-way SMT processor, while our
proposal reduces this inaccuracy to less than 5.0\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "50",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mattheakis:2013:SRM,
author = "Pavlos M. Mattheakis and Ioannis Papaefstathiou",
title = "Significantly reducing {MPI} intercommunication
latency and power overhead in both embedded and {HPC}
systems",
journal = j-TACO,
volume = "9",
number = "4",
pages = "51:1--51:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400710",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Highly parallel systems are becoming mainstream in a
wide range of sectors ranging from their traditional
stronghold high-performance computing, to data centers
and even embedded systems. However, despite the quantum
leaps of improvements in cost and performance of
individual components over the last decade (e.g.,
processor speeds, memory/interconnection bandwidth,
etc.), system manufacturers are still struggling to
deliver low-latency, highly scalable solutions. One of
the main reasons is that the intercommunication latency
grows significantly with the number of processor nodes.
This article presents a novel way to reduce this
intercommunication delay by implementing, in custom
hardware, certain communication tasks. In particular,
the proposed novel device implements the two most
widely used procedures of the most popular
communication protocol in parallel systems the Message
Passing Interface (MPI). Our novel approach has
initially been simulated within a pioneering parallel
systems simulation framework and then synthesized
directly from a high-level description language (i.e.,
SystemC) using a state-of-the-art synthesis tool. To
the best of our knowledge, this is the first article
presenting the complete hardware implementation of such
a system. The proposed novel approach triggers a
speedup from one to four orders of magnitude when
compared with conventional software-based solutions and
from one to three orders of magnitude when compared
with a sophisticated software-based approach. Moreover,
the performance of our system is from one to two orders
of magnitude higher than the simulated performance of a
similar but, relatively simpler hardware architecture;
at the same time the power consumption of our device is
about two orders of magnitude lower than that of a
low-power CPU when executing the exact same
intercommunication tasks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "51",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Baghdadi:2013:ILT,
author = "Riyadh Baghdadi and Albert Cohen and Sven Verdoolaege
and Konrad Trifunovi{\'c}",
title = "Improved loop tiling based on the removal of spurious
false dependences",
journal = j-TACO,
volume = "9",
number = "4",
pages = "52:1--52:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400711",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "To preserve the validity of loop nest transformations
and parallelization, data dependences need to be
analyzed. Memory dependences come in two varieties:
true dependences or false dependences. While true
dependences must be satisfied in order to preserve the
correct order of computations, false dependences are
induced by the reuse of a single memory location to
store multiple values. False dependences reduce the
degrees of freedom for loop transformations. In
particular, loop tiling is severely limited in the
presence of these dependences. While array expansion
removes all false dependences, the overhead on memory
and the detrimental impact on register-level reuse can
be catastrophic. We propose and evaluate a compilation
technique to safely ignore a large number of false
dependences in order to enable loop nest tiling in the
polyhedral model. It is based on the precise
characterization of interferences between live range
intervals, and it does not incur any scalar or array
expansion. Our algorithms have been implemented in the
Pluto polyhedral compiler, and evaluated on the
PolyBench suite.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pop:2013:OED,
author = "Antoniu Pop and Albert Cohen",
title = "{OpenStream}: Expressiveness and data-flow compilation
of {OpenMP} streaming programs",
journal = j-TACO,
volume = "9",
number = "4",
pages = "53:1--53:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400712",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We present OpenStream, a data-flow extension of OpenMP
to express dynamic dependent tasks. The language
supports nested task creation, modular composition,
variable and unbounded sets of producers/consumers, and
first-class streams. These features, enabled by our
original compilation flow, allow translating high-level
parallel programming patterns, like dependences arising
from StarSs' array regions, or universal low-level
primitives like futures. In particular, these dynamic
features can be embedded efficiently and naturally into
an unmanaged imperative language, avoiding the
complexity and overhead of a concurrent garbage
collector. We demonstrate the performance advantages of
a data-flow execution model compared to more restricted
task and barrier models. We also demonstrate the
efficiency of our compilation and runtime algorithms
for the support of complex dependence patterns arising
from StarSs benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "53",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Verdoolaege:2013:PPC,
author = "Sven Verdoolaege and Juan Carlos Juega and Albert
Cohen and Jos{\'e} Ignacio G{\'o}mez and Christian
Tenllado and Francky Catthoor",
title = "Polyhedral parallel code generation for {CUDA}",
journal = j-TACO,
volume = "9",
number = "4",
pages = "54:1--54:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400713",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article addresses the compilation of a sequential
program for parallel execution on a modern GPU. To this
end, we present a novel source-to-source compiler
called PPCG. PPCG singles out for its ability to
accelerate computations from any static control loop
nest, generating multiple CUDA kernels when necessary.
We introduce a multilevel tiling strategy and a code
generation scheme for the parallelization and locality
optimization of imperfectly nested loops, managing
memory and exposing concurrency according to the
constraints of modern GPUs. We evaluate our algorithms
and tool on the entire PolyBench suite.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "54",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Du:2013:DCC,
author = "Yu Du and Miao Zhou and Bruce Childers and Rami Melhem
and Daniel Moss{\'e}",
title = "Delta-compressed caching for overcoming the write
bandwidth limitation of hybrid main memory",
journal = j-TACO,
volume = "9",
number = "4",
pages = "55:1--55:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400714",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Limited PCM write bandwidth is a critical obstacle to
achieving good performance from hybrid DRAM/PCM memory
systems. The write bandwidth is severely restricted in
PCM devices, which harms application performance.
Indeed, as we show, it is more important to reduce PCM
write traffic than to reduce PCM read latency for
application performance. To reduce the number of PCM
writes, we propose a DRAM cache organization that
employs compression. A new delta compression technique
for modified data is used to achieve a large
compression ratio. Our approach can selectively and
predictively apply compression to improve its
efficiency and performance. Our approach is designed to
facilitate adoption in existing main memory compression
frameworks. We describe an instance of how to
incorporate delta compression in IBM's MXT memory
compression architecture when used as a DRAM cache in a
hybrid main memory. For fourteen representative
memory-intensive workloads, on average, our delta
compression technique reduces the number of PCM writes
by 54.3\%, and improves IPC performance by 24.4\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Purini:2013:FGO,
author = "Suresh Purini and Lakshya Jain",
title = "Finding good optimization sequences covering program
space",
journal = j-TACO,
volume = "9",
number = "4",
pages = "56:1--56:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400715",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The compiler optimizations we enable and the order in
which we apply them on a program have a substantial
impact on the program execution time. Compilers provide
default optimization sequences which can give good
program speedup. As the default sequences have to
optimize programs with different characteristics, they
embed in them multiple subsequences which can optimize
different classes of programs. These multiple
subsequences may interfere with one another and limit
the achievable program speedup.
Instead of searching for a single universally optimal
sequence, we can construct a small set of good
sequences such that for every program class there
exists a near-optimal optimization sequence in the good
sequences set. If we can construct such a good
sequences set which covers all the program classes in
the program space, then we can choose the best sequence
for a program by trying all the sequences in the good
sequences set. This approach completely circumvents the
need to solve the program classification problem. Using
a sequence set size of around 10, we obtained an
average speedup of up to 14\% on PolyBench programs and
up to 12\% on MiBench programs. Our approach differs
from both the iterative compilation and the
machine-learning-based prediction modeling techniques
proposed in the literature so far. For
cross-validation, we use separate training and test
datasets rather than the Leave-One-Out technique.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "56",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Belviranli:2013:DSS,
author = "Mehmet E. Belviranli and Laxmi N. Bhuyan and Rajiv
Gupta",
title = "A dynamic self-scheduling scheme for heterogeneous
multiprocessor architectures",
journal = j-TACO,
volume = "9",
number = "4",
pages = "57:1--57:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400716",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Today's heterogeneous architectures bring together
multiple general-purpose CPUs and multiple
domain-specific GPUs and FPGAs to provide dramatic
speedup for many applications. However, the challenge
lies in utilizing these heterogeneous processors to
optimize overall application performance by minimizing
workload completion time. Operating system and
application development for these systems are in their
infancy. In this article, we propose a new scheduling
and workload balancing scheme, HDSS, for execution of
loops having dependent or independent iterations on
heterogeneous multiprocessor systems. The new algorithm
dynamically learns the computational power of each
processor during an adaptive phase and then schedules
the remainder of the workload using a weighted
self-scheduling scheme during the completion phase.
Unlike previous studies, our scheme uniquely
considers the runtime effects of block sizes on the
performance for heterogeneous multiprocessors. It finds
the right trade-off between large and small block sizes
to maintain balanced workload while keeping the
accelerator utilization at maximum. Our algorithm does
not require offline training or architecture-specific
parameters. We have evaluated our scheme on two
different heterogeneous architectures: a 64-core AMD
Bulldozer system with an NVIDIA Fermi C2050 GPU, and a
32-core Intel Xeon SGI Altix 4700 supercomputer with
Xilinx Virtex 4 FPGAs. The experimental results show
that our new scheduling algorithm can achieve
performance improvements of more than 200\% when
compared to the closest existing load-balancing scheme.
Our algorithm also achieves full processor utilization,
with all processors completing at nearly the same time,
which is significantly better than current alternative
approaches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "57",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Negi:2013:SCF,
author = "Anurag Negi and Ruben Titos-Gil",
title = "{SCIN-cache}: Fast speculative versioning in
multithreaded cores",
journal = j-TACO,
volume = "9",
number = "4",
pages = "58:1--58:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400717",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article describes cache designs for efficiently
supporting speculative techniques like transactional
memory on chip multiprocessors with multithreaded
cores. On-demand allocation and prompt freeing of
speculative cache space in the design reduces the
burden on nonspeculative execution. Quick access to
both clean and speculative versions of data for
multiple contexts provides flexibility and greater
design freedom to HTM architects. Performance analysis
shows the designs stand up well against other HTM
design proposals, with potential performance gains in
high contention applications with small transactions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "58",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lutz:2013:PAF,
author = "Thibaut Lutz and Christian Fensch and Murray Cole",
title = "{PARTANS}: an autotuning framework for stencil
computation on multi-{GPU} systems",
journal = j-TACO,
volume = "9",
number = "4",
pages = "59:1--59:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400718",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "GPGPUs are a powerful and energy-efficient solution
for many problems. For higher performance or larger
problems, it is necessary to distribute the problem
across multiple GPUs, increasing the already high
programming complexity. In this article, we focus on
abstracting the complexity of multi-GPU programming for
stencil computation. We show that the best strategy
depends not only on the stencil operator, problem size,
and GPU, but also on the PCI Express layout. This adds
nonuniform characteristics to a seemingly homogeneous
setup, causing up to 23\% performance loss. We address
this issue with an autotuner that optimizes the
distribution across multiple GPUs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "59",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xiao:2013:SAT,
author = "Chunhua Xiao and M-C. Frank Chang and Jason Cong and
Michael Gill and Zhangqin Huang and Chunyue Liu and
Glenn Reinman and Hao Wu",
title = "Stream arbitration: Towards efficient bandwidth
utilization for emerging on-chip interconnects",
journal = j-TACO,
volume = "9",
number = "4",
pages = "60:1--60:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400719",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Alternative interconnects are attractive for scaling
on-chip communication bandwidth in a power-efficient
manner. However, efficient utilization of the bandwidth
provided by these emerging interconnects still remains
an open problem due to the spatial and temporal
communication heterogeneity. In this article, a Stream
Arbitration scheme is proposed, where at runtime any
source can compete for any communication channel of the
interconnect to talk to any destination. We apply
stream arbitration to radio frequency interconnect
(RF-I). Experimental results show that compared to the
representative token arbitration scheme, stream
arbitration can provide an average 20\% performance
improvement and 12\% power reduction.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "60",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2013:DRU,
author = "Yunji Chen and Tianshi Chen and Ling Li and Ruiyang Wu
and Daofu Liu and Weiwu Hu",
title = "Deterministic Replay Using Global Clock",
journal = j-TACO,
volume = "10",
number = "1",
pages = "1:1--1:??",
month = apr,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2445572.2445573",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Apr 5 18:36:16 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Debugging parallel programs is a well-known difficult
problem. A promising method to facilitate debugging
parallel programs is using hardware support to achieve
deterministic replay on a Chip Multi-Processor (CMP).
As a Design-For-Debug (DFD) feature, a practical
hardware-assisted deterministic replay scheme should
have low design and verification costs, as well as a
small log size. To achieve these goals, we propose a
novel and succinct hardware-assisted deterministic
replay scheme named LReplay. The key innovation of
LReplay is that instead of recording the logical time
orders between instructions or instruction blocks as
in previous investigations, LReplay is built upon
recording the pending period information infused by the
global clock. With the recorded pending period
information, about 99\% of execution orders are
inferrable, implying that LReplay only needs to
directly record the residual 1\% of noninferrable
execution orders in a production run. The 1\% of
noninferrable orders
can be addressed by a simple yet cost-effective
direction prediction technique, which further reduces
the log size of LReplay. Benefiting from the preceding
innovations, the overall log size of LReplay over
SPLASH-2 benchmarks is about 0.17B/K-Inst (byte per
k-instruction) for the sequential consistency, and
0.57B/K-Inst for the Godson-3 consistency. Such log
sizes are an order of magnitude smaller than those of
previous deterministic replay schemes that incur no
performance loss. Furthermore, LReplay only consumes
about 0.5\% of the area of the Godson-3 CMP, since it
requires
only trivial modifications to existing components of
Godson-3. The features of LReplay demonstrate the
potential of integrating hardware support for
deterministic replay into future industrial
processors.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lustig:2013:TIC,
author = "Daniel Lustig and Abhishek Bhattacharjee and Margaret
Martonosi",
title = "{TLB} Improvements for Chip Multiprocessors:
Inter-Core Cooperative Prefetchers and Shared
Last-Level {TLBs}",
journal = j-TACO,
volume = "10",
number = "1",
pages = "2:1--2:??",
month = apr,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2445572.2445574",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Apr 5 18:36:16 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Translation Lookaside Buffers (TLBs) are critical to
overall system performance. Much past research has
addressed uniprocessor TLBs, lowering access times and
miss rates. However, as Chip MultiProcessors (CMPs)
become ubiquitous, TLB design and performance must be
reevaluated. Our article begins by performing a
thorough TLB performance evaluation of sequential and
parallel benchmarks running on a real-world, modern CMP
system using hardware performance counters. This
analysis demonstrates the need for further improvement
of TLB hit rates for both classes of application, and
it also points out that the data TLB has a
significantly higher miss rate than the instruction TLB
in both cases. In response to the characterization
data, we propose and evaluate both Inter-Core
Cooperative (ICC) TLB prefetchers and Shared Last-Level
(SLL) TLBs as alternatives to the commercial norm of
private, per-core L2 TLBs. ICC prefetchers eliminate
19\% to 90\% of Data TLB (D-TLB) misses across parallel
workloads while requiring only modest changes in
hardware. SLL TLBs eliminate 7\% to 79\% of D-TLB
misses for parallel workloads and 35\% to 95\% of D-TLB
misses for multiprogrammed sequential workloads. This
corresponds to 27\% and 21\% increases in hit rates as
compared to private, per-core L2 TLBs, respectively,
and this is achieved using even more modest hardware
requirements. Because of their benefits for parallel
applications, their applicability to sequential
workloads, and their readily implementable hardware,
SLL TLBs and ICC TLB prefetchers hold great promise for
CMPs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2013:TME,
author = "Rong Chen and Haibo Chen",
title = "{Tiled-MapReduce}: Efficient and Flexible {MapReduce}
Processing on Multicore with Tiling",
journal = j-TACO,
volume = "10",
number = "1",
pages = "3:1--3:??",
month = apr,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2445572.2445575",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Apr 5 18:36:16 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The prevalence of chip multiprocessors opens
opportunities for running data-parallel applications,
originally designed for clusters, on a single machine
with many cores. MapReduce, a simple and elegant
programming model for large-scale clusters, has
recently been shown to be a promising alternative for
harnessing the multicore platform. Differences such as
memory
hierarchy and communication patterns between clusters
and multicore platforms raise new challenges to design
and implement an efficient MapReduce system on
multicore. This article argues that it is more
efficient for MapReduce to iteratively process small
chunks of data in turn than to process a large chunk of
data at a time on shared-memory multicore platforms.
Based on this argument, we extend the general MapReduce
programming model with a ``tiling strategy'', called
Tiled-MapReduce (TMR). TMR partitions a large
MapReduce job into a number of small subjobs and
iteratively processes one subjob at a time with
efficient use of resources; TMR finally merges the
results of all subjobs for output. Based on
Tiled-MapReduce, we design and implement several
optimizing techniques targeting multicore, including
the reuse of the input buffer among subjobs, a
NUCA/NUMA-aware scheduler, and pipelining a subjob's
reduce phase with the successive subjob's map phase, to
optimize the memory, cache, and CPU resources
accordingly. Further, we demonstrate that
Tiled-MapReduce supports fine-grained fault tolerance
and enables several usage scenarios such as online and
incremental computing on multicore machines.
Performance evaluation with our prototype system called
Ostrich on a 48-core machine shows that Ostrich saves
up to 87.6\% memory, causes less cache misses, and
makes more efficient use of CPU cores, resulting in a
speedup ranging from 1.86x to 3.07x over Phoenix.
Ostrich also efficiently supports fine-grained fault
tolerance and online and incremental computing with a
small performance penalty.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Becchi:2013:DTS,
author = "Michela Becchi and Patrick Crowley",
title = "{A-DFA}: a Time- and Space-Efficient {DFA} Compression
Algorithm for Fast Regular Expression Evaluation",
journal = j-TACO,
volume = "10",
number = "1",
pages = "4:1--4:26",
month = apr,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2445572.2445576",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Apr 5 18:36:16 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Modern network intrusion detection systems need to
perform regular expression matching at line rate in
order to detect the occurrence of critical patterns in
packet payloads. While Deterministic Finite Automata
(DFAs) allow this operation to be performed in linear
time, they may exhibit prohibitive memory requirements.
Kumar et al. [2006a] have proposed Delayed Input DFAs
(D2FAs), which provide a trade-off between the memory
requirements of the compressed DFA and the number of
states visited for each character processed, which in
turn affects the memory bandwidth required to evaluate
regular expressions. In this article we introduce
Amortized time-bandwidth overhead DFAs (A-DFAs), a
general compression technique that results in at most
$N(k + 1)/k$ state traversals when processing a string
of length $N$, $k$ being a positive integer. In
comparison to the D2FA approach, our technique achieves
comparable levels of compression with lower provable
bounds on memory bandwidth (or greater compression for
a given bandwidth bound). Moreover, the A-DFA algorithm
has lower complexity, can be applied during DFA
creation, and is suitable for scenarios where a
compressed DFA needs to be dynamically built or
updated. Finally, we show how to combine A-DFA with
alphabet reduction and multistride DFAs, two techniques
aimed at reducing the memory space and bandwidth
requirement of DFAs, and discuss memory encoding
schemes suitable for A-DFAs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2013:MFM,
author = "Sheng Li and Jung Ho Ahn and Richard D. Strong and Jay
B. Brockman and Dean M. Tullsen and Norman P. Jouppi",
title = "The {McPAT} Framework for Multicore and Manycore
Architectures: Simultaneously Modeling Power, Area, and
Timing",
journal = j-TACO,
volume = "10",
number = "1",
pages = "5:1--5:??",
month = apr,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2445572.2445577",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Apr 5 18:36:16 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article introduces McPAT, an integrated power,
area, and timing modeling framework that supports
comprehensive design space exploration for multicore
and manycore processor configurations ranging from 90nm
to 22nm and beyond. At the microarchitectural level,
McPAT
includes models for the fundamental components of a
complete chip multiprocessor, including in-order and
out-of-order processor cores, networks-on-chip, shared
caches, and integrated system components such as memory
controllers and Ethernet controllers. At the circuit
level, McPAT supports detailed modeling of
critical-path timing, area, and power. At the
technology level, McPAT
models timing, area, and power for the device types
forecast in the ITRS roadmap. McPAT has a flexible XML
interface to facilitate its use with many performance
simulators. Combined with a performance simulator,
McPAT enables architects to accurately quantify the
cost of new ideas and assess trade-offs of different
architectures using new metrics such as
Energy-Delay-Area$^2$ Product (EDA$^2$P) and
Energy-Delay-Area Product (EDAP). This article explores
the interconnect options of future manycore processors
by varying the degree of clustering over generations of
process technologies. Clustering will bring interesting
trade-offs between area and performance because the
interconnects needed to group cores into clusters incur
area overhead, but many applications can make good use
of them due to synergies from cache sharing. Combining
power, area, and timing results of McPAT with
performance simulation of PARSEC benchmarks for
manycore designs at the 22nm technology shows that
8-core clustering gives the best energy-delay product,
whereas when die area is taken into account, 4-core
clustering gives the best EDA$^2$P and EDAP.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kritikakou:2013:NOM,
author = "Angeliki Kritikakou and Francky Catthoor and George S.
Athanasiou and Vasilios Kelefouras and Costas Goutis",
title = "Near-Optimal Microprocessor and Accelerators Codesign
with Latency and Throughput Constraints",
journal = j-TACO,
volume = "10",
number = "2",
pages = "6:1--6:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2459316.2459317",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 1 16:38:16 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A systematic methodology for near-optimal
software/hardware codesign mapping onto an FPGA
platform with a microprocessor and HW accelerators is
proposed. The mapping steps deal with the
inter-organization, the foreground memory management,
and the datapath mapping. A step is described by
parameters and equations combined in a scalable
template. Mapping decisions are propagated as design
constraints to prune suboptimal options in subsequent
steps.
Several performance-area Pareto points are produced by
instantiating the parameters. To evaluate our
methodology we map a real-time bio-imaging application
and loop-dominated benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jiang:2013:HAC,
author = "Lei Jiang and Yu Du and Bo Zhao and Youtao Zhang and
Bruce R. Childers and Jun Yang",
title = "Hardware-Assisted Cooperative Integration of
Wear-Leveling and Salvaging for Phase Change Memory",
journal = j-TACO,
volume = "10",
number = "2",
pages = "7:1--7:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2459316.2459318",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 1 16:38:16 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Phase Change Memory (PCM) has recently emerged as a
promising memory technology. However, PCM's limited
write endurance restricts its immediate use as a
replacement for DRAM. To extend the lifetime of PCM
chips, wear-leveling and salvaging techniques have been
proposed. Wear-leveling balances write operations
across different PCM regions while salvaging extends
the duty cycle and provides graceful degradation for a
nonnegligible number of failures. Current wear-leveling
and salvaging schemes have not been designed and
integrated to work cooperatively to achieve the best
PCM device lifetime. In particular, a noncontiguous PCM
space generated from salvaging complicates
wear-leveling and incurs large overhead. In this
article, we propose LLS, a Line-Level mapping and
Salvaging design. By allocating a dynamic portion of
total space in a PCM device as backup space, and
mapping failed lines to backup PCM, LLS constructs a
contiguous PCM space and masks lower-level failures
from the OS and applications. LLS integrates
wear-leveling and salvaging and copes well with modern
OSes. Our experimental results show that LLS achieves
a 31\% longer lifetime than the state of the art. It has
negligible hardware cost and performance overhead.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Han:2013:PEP,
author = "Kyuseung Han and Junwhan Ahn and Kiyoung Choi",
title = "Power-Efficient Predication Techniques for
Acceleration of Control Flow Execution on {CGRA}",
journal = j-TACO,
volume = "10",
number = "2",
pages = "8:1--8:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2459316.2459319",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 1 16:38:16 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Coarse-grained reconfigurable architecture typically
has an array of processing elements (PEs) which are
controlled by a centralized unit. This makes it
difficult to execute programs having control divergence
among PEs without predication. However, conventional
predication techniques have a negative impact on both
performance and power consumption due to longer
instruction words and unnecessary instruction fetching,
decoding, and nullifying steps. This article reveals
performance and power issues in predicated execution
that have not yet been well addressed. Furthermore, it
proposes fast and power-efficient predication
mechanisms. Experiments conducted through gate-level
simulation show that our mechanism improves
energy-delay product by 11.9\% to 23.8\% on average.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2013:MTD,
author = "Chao Wang and Xi Li and Junneng Zhang and Xuehai Zhou
and Xiaoning Nie",
title = "{MP-Tomasulo}: a Dependency-Aware Automatic Parallel
Execution Engine for Sequential Programs",
journal = j-TACO,
volume = "10",
number = "2",
pages = "9:1--9:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2459316.2459320",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 1 16:38:16 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article presents MP-Tomasulo, a dependency-aware
automatic parallel task execution engine for sequential
programs. Applying the instruction-level Tomasulo
algorithm to MPSoC environments, MP-Tomasulo detects
and eliminates Write-After-Write (WAW) and
Write-After-Read (WAR) inter-task dependencies in the
dataflow execution, thereby enabling out-of-order
task execution on heterogeneous units. We implemented
the prototype system within a single FPGA. Experimental
results on EEMBC applications demonstrate that
MP-Tomasulo can execute the tasks out-of-order to
achieve as high as 93.6\% to 97.6\% of ideal peak
speedup. A comparative study against a state-of-the-art
dataflow execution scheme is illustrated with a classic
JPEG application. The promising results show that
MP-Tomasulo enables programmers to uncover more
task-level parallelism on heterogeneous systems while
easing their programming burden.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Anonymous:2013:TR,
author = "Anonymous",
title = "{TACO} Reviewers 2012",
journal = j-TACO,
volume = "10",
number = "3",
pages = "9:1--9:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2509420.2509421",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Shifer:2013:LLA,
author = "Eran Shifer and Shlomo Weiss",
title = "Low-latency adaptive mode transitions and hierarchical
power management in asymmetric clustered cores",
journal = j-TACO,
volume = "10",
number = "3",
pages = "10:1--10:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2499901",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Recently, engineering solutions that include
asymmetric multicores have been fabricated for low
form-factor computing devices, indicating a potential
direction for future evolution of processors. In this
article we propose an asymmetric clustered core
architecture, exhibiting low-latency switching between
modes relative to asymmetric multicores, and, like the
asymmetric multicore architecture, providing a wider
dynamic range of the processor power-performance
characteristic.
Asymmetric clustered cores incur additional
microarchitectural complexity and area cost inside a
core but exhibit better chip-level integration
characteristics compared to asymmetric multicores.
Focusing on power efficiency of asymmetric clustered
cores, we describe: (1) a hierarchical power management
partitioning between the operating system and on-die
firmware for coarse-grain switch policies, and (2)
core-internal tracking hardware for fine-grain
switching. The mode switch policies of the core's
tracking hardware are dependent on higher-level
directives and hints from the operating system, on-die
firmware, and compiler or profiling software. We
further explore the potential power management benefits
of asymmetric clustered cores relative to asymmetric
multicores, demonstrating that the ability of
asymmetric clustered cores to use tight training
periods for adaptive behavior, with low overhead
switching between modes, results in a more efficient
utilization of power management directives.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{BenAsher:2013:HTL,
author = "Yosi {Ben Asher} and Nadav Rotem",
title = "Hybrid type legalization for a sparse {SIMD}
instruction set",
journal = j-TACO,
volume = "10",
number = "3",
pages = "11:1--11:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2509420.2509422",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "SIMD vector units implement only a subset of the
operations used by vectorizing compilers, and there are
multiple conflicting techniques to legalize arbitrary
vector types into register-sized data types.
Traditionally, type legalization is performed using a
set of predefined rules, regardless of the operations
used in the program. This method is not suitable for
sparse SIMD instruction sets and often prevents the
vectorization of programs. In this work we introduce a
new technique for type legalization, namely vector
element promotion, as well as a hybrid method for
combining multiple techniques of type legalization. Our
hybrid type legalization method makes decisions based
on the knowledge of the available instruction set as
well as the operations used in the program. Our
experimental results demonstrate that program-dependent
hybrid type legalization improves the execution time of
vector programs, outperforms the existing legalization
method, and allows the vectorization of workloads which
were not vectorized before.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lei:2013:VCI,
author = "Yuanwu Lei and Yong Dou and Lei Guo and Jinbo Xu and
Jie Zhou and Yazhuo Dong and Hongjian Li",
title = "{VLIW} coprocessor for {IEEE-754} quadruple-precision
elementary functions",
journal = j-TACO,
volume = "10",
number = "3",
pages = "12:1--12:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2512430",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this article, a unified VLIW coprocessor, based on
a common group of atomic operation units, for Quad
arithmetic and elementary functions (QP\_VELP) is
presented. The explicitly parallel scheme of VLIW
instructions and Estrin's evaluation scheme for
polynomials are used to improve performance. A
two-level VLIW instruction RAM scheme is introduced to
achieve high scalability and customizability, even for
more complex key program kernels. Finally, the Quad
arithmetic accelerator (QAA) with the QP\_VELP array is
implemented as an ASIC. Compared with a hyper-threaded
software implementation on an Intel Xeon E5620, QAA
with 8 QP\_VELP units achieves an improvement by a
factor of 18X.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kawahito:2013:IRF,
author = "Motohiro Kawahito and Hideaki Komatsu and Takao
Moriyama and Hiroshi Inoue and Toshio Nakatani",
title = "Idiom recognition framework using topological
embedding",
journal = j-TACO,
volume = "10",
number = "3",
pages = "13:1--13:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2512431",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "Modern processors support hardware-assist instructions
(such as TRT and TROT instructions on the IBM System z)
to accelerate certain functions such as delimiter
search and character conversion. Such special
instructions are often used in high-performance
libraries, but their exploitation in optimizing
compilers has been limited. We devised a new idiom
recognition technique based on a topological embedding
algorithm to detect idiom patterns in the input
programs more aggressively than in previous approaches
using exact pattern matching. Our approach can detect a
pattern even if the code segment does not exactly match
the idiom. For example, we can detect a code segment
that includes additional code within the idiom pattern.
We also propose an instruction simplification for the
idiom recognition. This optimization analyzes all of
the usages of the output of the optimized code for a
specific idiom. If we find that we do not need an
actual value for the output but only a value in a
subrange, then we can assign a value in that subrange
as the output. The code generator can then emit faster
code with this optimization. We implemented our new
idiom recognition approach based on the Java
Just-In-Time (JIT) compiler that is part of the J9 Java
Virtual Machine, and we supported several important
idioms for the special hardware-assist instructions on
the IBM System z and on some models of the IBM System
p. To demonstrate the effectiveness of our technique,
we performed two experiments. The first experiment was
to see how many more patterns we can detect compared to
the previous approach. The second experiment measured
the performance improvements over the previous
approaches. For the first experiment, we used the Java
Compatibility Kit (JCK) API tests. For the second
experiment we used the IBM XML parser, SPECjvm98, and
SPECjbb2000. In summary, relative to a baseline
implementation using exact pattern matching, our
algorithm converted 76\% more loops in JCK tests. On a
z9, we also observed significant average performance
improvements: 54\% for the XML parser, 1.9\% for
SPECjvm98, and 4.4\% for SPECjbb2000. Finally, we
observed that the JIT compilation time increased by
only 0.32\% to 0.44\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Shobaki:2013:PIS,
author = "Ghassan Shobaki and Maxim Shawabkeh and Najm Eldeen
Abu Rmaileh",
title = "Preallocation instruction scheduling with register
pressure minimization using a combinatorial
optimization approach",
journal = j-TACO,
volume = "10",
number = "3",
pages = "14:1--14:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2512432",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Balancing Instruction-Level Parallelism (ILP) and
register pressure during preallocation instruction
scheduling is a fundamentally important problem in code
generation and optimization. The problem is known to be
NP-complete. Many heuristic techniques have been
proposed to solve this problem. However, due to the
inherently conflicting requirements of maximizing ILP
and minimizing register pressure, heuristic techniques
may produce poor schedules in many cases. If such cases
occur in hot code, significant performance degradation
may result. A few combinatorial optimization approaches
have also been proposed, but none of them has been
shown to solve large real-world instances within
reasonable time. This article presents the first
combinatorial algorithm that is efficient enough to
optimally solve large instances of this problem (basic
blocks with hundreds of instructions) within a few
seconds per instance. The proposed algorithm uses
branch-and-bound enumeration with a number of powerful
pruning techniques to efficiently search the solution
space. The search is based on a cost function that
incorporates schedule length and register pressure. An
implementation of the proposed scheduling algorithm has
been integrated into the LLVM Compiler and evaluated
using SPEC CPU 2006. On x86-64, with a time limit of
10ms per instruction, it optimally schedules 79\% of
the hot basic blocks in FP2006. Another 19\% of the
blocks are not optimally scheduled but are improved in
cost relative to LLVM's heuristic. This improves the
execution time of some benchmarks by up to 21\%, with a
geometric-mean improvement of 2.4\% across the entire
benchmark suite. With the use of precise latency
information, the geometric-mean improvement is
increased to 2.8\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{She:2013:EEM,
author = "Dongrui She and Yifan He and Henk Corporaal",
title = "An energy-efficient method of supporting flexible
special instructions in an embedded processor with
compact {ISA}",
journal = j-TACO,
volume = "10",
number = "3",
pages = "15:1--15:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2509420.2509426",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In application-specific processor design, a common
approach to improve performance and efficiency is to
use special instructions that execute complex operation
patterns. However, in a generic embedded processor with
compact Instruction Set Architecture (ISA), these
special instructions may lead to large overhead such
as: ( i ) more bits are needed to encode the extra
opcodes and operands, resulting in wider instructions;
( ii ) more Register File (RF) ports are required to
provide the extra operands to the function units. Such
overhead may increase energy consumption considerably.
In this article, we propose to support flexible
operation pair patterns in a processor with a compact
24-bit RISC-like ISA using: (i) a partially
reconfigurable decoder that exploits pattern
locality to reduce the opcode space requirement; (ii) a
software-controlled bypass network to reduce operand
encoding bits and RF port requirements. An energy-aware
compiler backend is designed for the proposed
architecture that performs pattern selection and
bypass-aware scheduling to generate energy-efficient
codes. Though the proposed design imposes extra
constraints on the operation patterns, the experimental
results show that for benchmark applications from
different domains, the average dynamic instruction
count is reduced by over 25\%, which is only about 2\%
less than the architecture without such constraints.
The proposed architecture reduces total energy by an
average of 15.8\% compared to the RISC baseline, while
the one without constraints achieves almost no
improvement due to its high overhead. When high
performance is required, the proposed architecture is
able to achieve a speedup of 13.8\% with 13.1\% energy
reduction compared to the baseline by introducing
multicycle SFU operations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Nandivada:2013:IBA,
author = "V. Krishna Nandivada and Rajkishore Barik",
title = "Improved bitwidth-aware variable packing",
journal = j-TACO,
volume = "10",
number = "3",
pages = "16:1--16:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2509420.2509427",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Bitwidth-aware register allocation has caught the
attention of researchers aiming to effectively reduce
the number of variables spilled into memory. For
general-purpose processors, this improves the execution
time performance and reduces runtime memory
requirements (which in turn helps in the compilation of
programs targeted to systems with constrained memory).
Additionally, bitwidth-aware register allocation has
been effective in reducing power consumption in
embedded processors. One of the key components of
bitwidth-aware register allocation is the variable
packing algorithm that packs multiple narrow-width
variables into one physical register. Tallam and Gupta
[2003] have proved that optimal variable packing is an
NP-complete problem for arbitrary-width variables and
have proposed an approximate solution. In this article,
we analyze the complexity of the variable packing
problem and present three enhancements that improve the
overall packing of variables. In particular, the
improvements we describe are: (a) Width Static Single
Assignment (W-SSA) form representation that splits the
live range of a variable into several fixed-width live
ranges (W-SSA) variables; (b) PoTR Representation ---
use of powers-of-two representation for bitwidth
information for W-SSA variables. Our empirical results
have shown that the associated bit wastage resulting
from the overapproximation of the widths of variables
to the next power of two is a small fraction
compared to the total number of bits in use ($ \approx
$ 13\%). The main advantage of this representation is
that it leads to optimal variable packing in polynomial
time; (c) Combined Packing and Coalescing --- we
discuss the importance of coalescing (combining
variables whose live ranges do not interfere) in the
context of variable packing and present an iterative
algorithm to perform coalescing and packing of W-SSA
variables represented in PoTR. Our experimental results
show up to a 76.00\% decrease in the number of
variables compared to the number of variables in the
input program in Static Single Assignment (SSA) form.
This
reduction in the number of variables led to a
significant reduction in dynamic spilling, packing, and
unpacking instructions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ahn:2013:SHR,
author = "Jung Ho Ahn and Young Hoon Son and John Kim",
title = "Scalable high-radix router microarchitecture using a
network switch organization",
journal = j-TACO,
volume = "10",
number = "3",
pages = "17:1--17:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2512433",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As the system size of supercomputers and datacenters
increases, cost-efficient networks become critical in
achieving good scalability on those systems.
High-radix routers reduce network cost by lowering the
network diameter while providing a high bisection
bandwidth and path diversity. The building blocks of
these large-scale networks are the routers or the
switches, and they need to scale with the
increasing port count and increasing pin bandwidth.
However, as the port count increases, the high-radix
router microarchitecture itself needs to scale
efficiently. A hierarchical crossbar switch
organization has been proposed in which the single
large crossbar used for a router switch is partitioned
into many small crossbars, overcoming the limitations
of conventional router microarchitecture. Although this
organization provides high performance, it has limited
scalability due to excessive power and area overheads
from the wires
and intermediate buffers. In this article, we propose
scalable router microarchitectures that leverage a
network within the switch design of the high-radix
routers themselves. These alternative designs lower the
wiring complexity and buffer requirements. For example,
when a folded-Clos switch is used instead of the
hierarchical crossbar switch for a radix-64 router, it
provides up to 73\%, 58\%, and 87\% reduction in area,
energy-delay product, and energy-delay-area product,
respectively. We also explore more efficient switch
designs by exploiting the traffic-pattern
characteristics of the global network and its impact on
the local network design within the switch for both
folded-Clos and flattened butterfly networks. In
particular, we propose a bilateral butterfly switch
organization that has fewer crossbars and global wires
compared to the topology-agnostic folded-Clos switch
while achieving better low-load latency and equivalent
saturation throughput.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Huang:2013:ACM,
author = "Libo Huang and Zhiying Wang and Nong Xiao and Yongwen
Wang and Qiang Dou",
title = "Adaptive communication mechanism for accelerating
{MPI} functions in {NoC}-based multicore processors",
journal = j-TACO,
volume = "10",
number = "3",
pages = "18:1--18:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2512434",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Multicore designs have emerged as the dominant
organization for future high-performance
microprocessors. Communication in such designs is often
enabled by Networks-on-Chip (NoCs). A new trend in such
architectures is to fit a Message Passing Interface
(MPI) programming model on NoCs to achieve optimal
parallel application performance. A key issue in
designing MPI over NoCs is the communication protocol,
which has not been explored in previous research. This
article advocates a hardware-supported communication
mechanism using a protocol-adaptive approach to adjust
to varying NoC configurations (e.g., number of buffers)
and workload behavior (e.g., number of messages). We
propose the ADaptive Communication Mechanism (ADCM), a
hybrid protocol whose behavior ranges from that of
buffered communication, when sufficient buffer space is
available in the receiver, to that of a synchronous
protocol, when receiver buffers are limited. ADCM
adapts dynamically by deciding
communication protocol on a per-request basis using a
local estimate of recent buffer utilization. ADCM
attempts to combine both the advantages of buffered and
synchronous communication modes to achieve enhanced
throughput and performance. Simulations of various
workloads show that the proposed communication
mechanism can be effectively used in future NoC
designs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Malik:2013:OSG,
author = "Avinash Malik and David Gregg",
title = "Orchestrating stream graphs using model checking",
journal = j-TACO,
volume = "10",
number = "3",
pages = "19:1--19:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2512435",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this article we use model checking to statically
distribute and schedule Synchronous DataFlow (SDF)
graphs on heterogeneous execution architectures. We
show that model checking is capable of providing an
optimal solution and that it arrives at these solutions
faster (in terms of algorithm runtime) than equivalent
ILP formulations. Furthermore, we also show how
different types of optimizations such as task
parallelism, data parallelism, and state sharing can be
included within our framework. Finally, a comparison of
our approach with current state-of-the-art heuristic
techniques shows the pitfalls of these techniques and
gives a glimpse of how they can be improved.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2013:UML,
author = "Zheng Wang and Michael F. P. O'Boyle",
title = "Using machine learning to partition streaming
programs",
journal = j-TACO,
volume = "10",
number = "3",
pages = "20:1--20:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2512436",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Stream-based parallel languages are a popular way to
express parallelism in modern applications. The
efficient mapping of streaming parallelism to today's
multicore systems is, however, highly dependent on the
program and underlying architecture. We address this by
developing a portable and automatic compiler-based
approach to partitioning streaming programs using
machine learning. Our technique predicts the ideal
partition structure for a given streaming application
using prior knowledge learned offline. Using the
predictor we rapidly search the program space (without
executing any code) to generate and select a good
partition. We applied this technique to standard
StreamIt applications and compared against existing
approaches. On a 4-core platform, our approach achieves
60\% of the best performance found by iteratively
compiling and executing over 3000 different partitions
per program. We obtain, on average, a 1.90$ \times $
speedup over the already tuned partitioning scheme of
the StreamIt compiler. When compared against a
state-of-the-art analytical, model-based approach, we
achieve, on average, a 1.77$ \times $ performance
improvement. By porting our approach to an 8-core
platform, we are able to obtain a 1.8$ \times $
improvement over the StreamIt default scheme,
demonstrating the portability of our approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bakhoda:2013:DCN,
author = "Ali Bakhoda and John Kim and Tor M. Aamodt",
title = "Designing on-chip networks for throughput
accelerators",
journal = j-TACO,
volume = "10",
number = "3",
pages = "21:1--21:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2512429",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As the number of cores and threads in throughput
accelerators such as Graphics Processing Units (GPU)
increases, so does the importance of on-chip
interconnection network design. This article explores
throughput-effective Network-on-Chips (NoC) for future
compute accelerators that employ Bulk-Synchronous
Parallel (BSP) programming models such as CUDA and
OpenCL. A hardware optimization is ``throughput
effective'' if it improves parallel application-level
performance per unit chip area. We evaluate performance
of future-looking workloads using detailed closed-loop
simulations modeling compute nodes, NoC, and the DRAM
memory system. We start from a mesh design with
bisection bandwidth balanced to off-chip demand.
Accelerator workloads tend to demand high off-chip
memory bandwidth which results in a many-to-few traffic
pattern when coupled with expected technology
constraints of slow growth in pins-per-chip. Leveraging
these observations we reduce NoC area by proposing a
``checkerboard'' NoC which alternates between
conventional full routers and half routers with limited
connectivity. Next, we show that increasing network
terminal bandwidth at the nodes connected to DRAM
controllers alleviates a significant fraction of the
remaining imbalance resulting from the many-to-few
traffic pattern. Furthermore, we propose a ``double
checkerboard inverted'' NoC organization which takes
advantage of channel slicing to reduce area while
maintaining the performance improvements of the
aforementioned techniques. This organization also has a
simpler routing mechanism and improves average
application throughput per unit area by 24.3\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jantz:2013:ESM,
author = "Michael R. Jantz and Prasad A. Kulkarni",
title = "Exploring single and multilevel {JIT} compilation
policy for modern machines 1",
journal = j-TACO,
volume = "10",
number = "4",
pages = "22:1--22:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541229",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Dynamic or Just-in-Time (JIT) compilation is essential
to achieve high-performance emulation for programs
written in managed languages, such as Java and C\#. It
has been observed that a conservative JIT compilation
policy is most effective to obtain good runtime
performance without impeding application progress on
single-core machines. At the same time, it is often
suggested that a more aggressive dynamic compilation
strategy may perform best on modern machines that
provide abundant computing resources, especially with
virtual machines (VMs) that are also capable of
spawning multiple concurrent compiler threads. However,
comprehensive research on the best JIT compilation
policy for such modern processors and VMs is currently
lacking. The goal of this work is to explore the
properties of single-tier and multitier JIT compilation
policies that can enable existing and future VMs to
realize the best program performance on modern
machines. In this work, we design novel experiments and
implement new VM configurations to effectively control
the compiler aggressiveness and optimization levels (
if and when methods are compiled) in the
industry-standard Oracle HotSpot Java VM to achieve
this goal. We find that the best JIT compilation policy
is determined by the nature of the application and the
speed and effectiveness of the dynamic compilers. We
extend earlier results showing the suitability of
conservative JIT compilation on single-core machines
for VMs with multiple concurrent compiler threads. We
show that employing the free compilation resources
(compiler threads and hardware cores) to aggressively
compile more program methods quickly reaches a point of
diminishing returns. At the same time, we also find
that using the free resources to reduce compiler queue
backup (compile selected hot methods early )
significantly benefits program performance, especially
for slower (highly optimizing) JIT compilers. For such
compilers, we observe that accurately prioritizing JIT
method compiles is crucial to realize the most
performance benefit with the smallest hardware budget.
Finally, we show that a tiered compilation policy,
although complex to implement, greatly alleviates the
impact of more and early JIT compilation of programs on
modern machines.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "22",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Dong:2013:CAC,
author = "Xiangyu Dong and Norman P. Jouppi and Yuan Xie",
title = "A circuit-architecture co-optimization framework for
exploring nonvolatile memory hierarchies",
journal = j-TACO,
volume = "10",
number = "4",
pages = "23:1--23:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541230",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Many new memory technologies are available for
building future energy-efficient memory hierarchies. It
is necessary to have a framework that can quickly find
the optimal memory technology at each hierarchy level.
In this work, we first build a circuit-architecture
joint design space exploration framework by combining
RC circuit analysis and Artificial Neural Network
(ANN)-based performance modeling. Then, we use this
framework to evaluate some emerging nonvolatile memory
hierarchies. We demonstrate that a Resistive RAM
(ReRAM)-based cache hierarchy on an 8-core
Chip-Multiprocessor (CMP) system can achieve a 24\%
Energy Delay Product (EDP) improvement and a 36\%
Energy Delay Area Product (EDAP) improvement compared
to a conventional hierarchy with SRAM on-chip caches
and DRAM main memory.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhao:2013:OGE,
author = "Jishen Zhao and Guangyu Sun and Gabriel H. Loh and
Yuan Xie",
title = "Optimizing {GPU} energy efficiency with {$3$D}
die-stacking graphics memory and reconfigurable memory
interface",
journal = j-TACO,
volume = "10",
number = "4",
pages = "24:1--24:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541231",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The performance of graphics processing unit (GPU)
systems is improving rapidly to accommodate the
increasing demands of graphics and high-performance
computing applications. With such a performance
improvement, however, power consumption of GPU systems
is dramatically increased. Up to 30\% of the total
power of a GPU system is consumed by the graphic memory
itself. Therefore, reducing graphics memory power
consumption is critical to mitigate the power
challenge. In this article, we propose an
energy-efficient reconfigurable 3D die-stacking
graphics memory design that integrates wide-interface
graphics DRAMs side-by-side with a GPU processor on a
silicon interposer. The proposed architecture is a
``3D+2.5D'' system, where the DRAM memory itself is 3D
stacked memory with through-silicon via (TSV), whereas
the integration of DRAM and the GPU processor is
through the interposer solution (2.5D). Since GPU
computing units, memory controllers, and memory are all
integrated in the same package, the number of memory
I/Os is no longer constrained by the package's pin
count. We can reduce the memory power consumption by
scaling down the supply voltage and frequency of memory
interface while maintaining the same or even higher
peak memory bandwidth. In addition, we design a
reconfigurable memory interface that can dynamically
adapt to the requirements of various applications. We
propose two reconfiguration mechanisms to optimize the
GPU system energy efficiency and throughput,
respectively, and thus benefit both memory-intensive
and compute-intensive applications. The experimental
results show that the proposed GPU memory architecture
can effectively improve GPU system energy efficiency by
21\%, without reconfiguration. The reconfigurable
memory interface can further improve the system energy
efficiency by 26\%, and system throughput by 31\% under
a capped system power budget of 240W.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2013:EMT,
author = "Chien-Chi Chen and Sheng-De Wang",
title = "An efficient multicharacter transition string-matching
engine based on the {Aho--Corasick} algorithm",
journal = j-TACO,
volume = "10",
number = "4",
pages = "25:1--25:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541232",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/string-matching.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A string-matching engine capable of inspecting
multiple characters in parallel can multiply the
throughput. However, the space required for
implementing a matching engine that can process
multiple characters in parallel generally grows
exponentially with respect to the characters to be
processed in parallel. Based on the Aho--Corasick
algorithm (AC-algorithm), this work presents a novel
multicharacter transition Nondeterministic Finite
Automaton (NFA) approach, called multicharacter AC-NFA,
to allow for the inspection of multiple characters in
parallel. This approach first converts an AC-trie to an
AC-NFA by allowing for the simultaneous activation of
multiple states and then converts the AC-NFA to a
$k$-character AC-NFA by an algorithm with concatenation
operations and assistant transitions. Additionally, the
alignment problem, which occurs while multiple
characters are being inspected in parallel, is solved
using assistant transitions. Moreover, a corresponding
output is provided for each inspected character by
introducing priority multiplexers to determine the
final matching outputs during implementation of the
multicharacter AC-NFA. Consequently, the number of
derived $k$-character transitions grows linearly with
respect to the number $k$. Furthermore, the derived
multicharacter AC-NFA is implemented on FPGAs for
evaluation. The resulting throughput grows
approximately 14 times and the hardware cost grows
about 18 times for 16-character AC-NFA implementation,
as compared with that for 1-character AC-NFA
implementation. The achievable throughput is 21.4Gbps
for the 16-character AC-NFA implementation operating at
a 167.36MHz clock.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "25",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Luo:2013:DIH,
author = "Yangchun Luo and Wei-Chung Hsu and Antonia Zhai",
title = "The design and implementation of heterogeneous
multicore systems for energy-efficient speculative
thread execution",
journal = j-TACO,
volume = "10",
number = "4",
pages = "26:1--26:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541233",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With the emergence of multicore processors, various
aggressive execution models have been proposed to
exploit fine-grained thread-level parallelism, taking
advantage of the fast on-chip interconnection
communication. However, the aggressive nature of these
execution models often leads to excessive energy
consumption incommensurate to execution time reduction.
In the context of Thread-Level Speculation, we
demonstrated that on a same-ISA heterogeneous multicore
system, by dynamically deciding how on-chip resources
are utilized, speculative threads can achieve
performance gain in an energy-efficient way. Through a
systematic design space exploration, we built a
multicore architecture that integrates heterogeneous
components of processing cores and first-level caches.
To cope with processor reconfiguration overheads, we
introduced runtime mechanisms to mitigate their
impacts. To match program execution with the most
energy-efficient processor configuration, the system
was equipped with a dynamic resource allocation scheme
that characterizes program behaviors using novel
processor counters. We evaluated the proposed
heterogeneous system with a diverse set of benchmark
programs from SPEC CPU2000 and CPU20006 suites.
Compared to the most efficient homogeneous TLS
implementation, we achieved similar performance but
consumed 18\% less energy. Compared to the most
efficient homogeneous uniprocessor running sequential
programs, we improved performance by 29\% and reduced
energy consumption by 3.6\%, which is a 42\%
improvement in energy-delay-squared product.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "26",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Rolan:2013:VSC,
author = "Dyer Rol{\'a}n and Basilio B. Fraguela and Ram{\'o}n
Doallo",
title = "Virtually split cache: an efficient mechanism to
distribute instructions and data 1",
journal = j-TACO,
volume = "10",
number = "4",
pages = "27:1--27:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541234",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "First-level caches are usually split for both
instructions and data instead of unifying them in a
single cache. Although that approach eases the pipeline
design and provides a simple way to independently treat
data and instructions, its global hit rate is usually
smaller than that of a unified cache. Furthermore,
unified lower-level caches usually behave and process
memory requests disregarding whether they are data or
instruction requests. In this article, we propose a new
technique aimed to balance the amount of space devoted
to instructions and data for optimizing set-associative
caches: the Virtually Split Cache or VSC. Our technique
combines the sharing of resources from unified
approaches with the bandwidth and parallelism that
split configurations provide, thus reducing power
consumption while not degrading performance. Our design
dynamically adjusts cache resources devoted to
instructions and data depending on their particular
demand. Two VSC designs are proposed in order to track
the instructions and data requirements. The Shadow Tag
VSC (ST-VSC) is based on shadow tags that store the
last evicted line related to data and instructions in
order to determine how well the cache would work with
one more way per set devoted to each kind. The Global
Selector VSC (GS-VSC) uses a saturation counter that is
updated every time a cache miss occurs either under an
instruction or data request applying a duel-like
mechanism. Experiments with a variable and a fixed
latency VSC show that ST-VSC and GS-VSC reduce on
average the cache hierarchy power consumption by 29\%
and 24\%, respectively, with respect to a standard
baseline. As for performance, while the fixed latency
designs virtually match the split baseline in a
single-core system, a variable latency ST-VSC and
GS-VSC increase the average IPC by 2.5\% and 2\%,
respectively. In multicore systems, even the slower
fixed latency ST-VSC and GS-VSC designs improve the
baseline IPC by 3.1\% and 2.5\%, respectively, in a
four-core system thanks to the reduction in the
bandwidth demanded from the lower cache levels. This is
in contrast with many techniques that trade performance
degradation for power consumption reduction. VSC
particularly benefits embedded processors with a single
level of cache, where up to an average 9.2\% IPC
improvement is achieved. Interestingly, we also find
that partitioning the LLC for instructions and data can
improve performance around 2\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Subramaniam:2013:UFC,
author = "Samantika Subramaniam and Simon C. Steely and Will
Hasenplaugh and Aamer Jaleel and Carl Beckmann and
Tryggve Fossum and Joel Emer",
title = "Using in-flight chains to build a scalable cache
coherence protocol",
journal = j-TACO,
volume = "10",
number = "4",
pages = "28:1--28:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541235",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As microprocessor designs integrate more cores,
scalability of cache coherence protocols becomes a
challenging problem. Most directory-based protocols
avoid races by using blocking tag directories that can
impact the performance of parallel applications. In
this article, we first quantitatively demonstrate that
state-of-the-art blocking protocols significantly
constrain throughput at large core counts for several
parallel applications. Nonblocking protocols address
this throughput concern at the expense of scalability
in the interconnection network or in the required
resource overheads. To address this concern, we enhance
nonblocking directory protocols by migrating the point
of service of responses. Our approach uses in-flight
chains of cores making parallel memory requests to
incorporate scalability while maintaining
high-throughput. The proposed cache coherence protocol
called chained cache coherence, can outperform blocking
protocols by up to 20\% on scientific and 12\% on
commercial applications. It also has low resource
overheads and simple address ordering requirements
making it both a high-performance and scalable
protocol. Furthermore, in-flight chains provide a
scalable solution to building hierarchical and
nonblocking tag directories as well as optimize
communication latencies.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "28",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sanchez:2013:MIP,
author = "Daniel S{\'a}nchez and Yiannakis Sazeides and Juan M.
Cebri{\'a}n and Jos{\'e} M. Garc{\'\i}a and Juan L.
Arag{\'o}n",
title = "Modeling the impact of permanent faults in caches",
journal = j-TACO,
volume = "10",
number = "4",
pages = "29:1--29:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541236",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The traditional performance cost benefits we have
enjoyed for decades from technology scaling are
challenged by several critical constraints including
reliability. Increases in static and dynamic variations
are leading to higher probability of parametric and
wear-out failures and are elevating reliability into a
prime design constraint. In particular, SRAM cells used
to build caches that dominate the processor area are
usually minimum sized and more prone to failure. It is
therefore of paramount importance to develop effective
methodologies that facilitate the exploration of
reliability techniques for caches. To this end, we
present an analytical model that can determine for a
given cache configuration, address trace, and random
probability of permanent cell failure the exact
expected miss rate and its standard deviation when
blocks with faulty bits are disabled. What
distinguishes our model is that it is fully analytical,
it avoids the use of fault maps, and yet, it is both
exact and simpler than previous approaches. The
analytical model is used to produce the miss-rate
trends ( expected miss-rate ) for future technology
nodes for both uncorrelated and clustered faults. Some
of the key findings based on the proposed model are (i)
block disabling has a negligible impact on the expected
miss-rate unless probability of failure is equal or
greater than 2.6e-4, (ii) the fault map methodology can
accurately calculate the expected miss-rate as long as
1,000 to 10,000 fault maps are used, and (iii) the
expected miss-rate for execution of parallel
applications increases with the number of threads and
is more pronounced for a given probability of failure
as compared to sequential execution.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "29",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lee:2013:APF,
author = "Sanghoon Lee and James Tuck",
title = "Automatic parallelization of fine-grained
metafunctions on a chip multiprocessor",
journal = j-TACO,
volume = "10",
number = "4",
pages = "30:1--30:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541237",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Due to the importance of reliability and security,
prior studies have proposed inlining metafunctions into
applications for detecting bugs and security
vulnerabilities. However, because these software
techniques add frequent, fine-grained instrumentation
to programs, they often incur large runtime overheads.
In this work, we consider an automatic thread
extraction technique for removing these fine-grained
checks from a main application and scheduling them on
helper threads. In this way, we can leverage the
resources available on a CMP to reduce the latency and
overhead of fine-grained checking codes. Our
parallelization strategy extracts metafunctions from a
single threaded application and executes them in
customized helper threads-threads constructed to mirror
relevant fragments of the main program's behavior in
order to keep communication and overhead low. To get
good performance, we consider optimizations that reduce
communication and balance work among many threads. We
evaluate our parallelization strategy on Mudflap, a
pointer-use checking tool in GCC. To show the benefits
of our technique, we compare it to a manually
parallelized version of Mudflap. We run our experiments
on an architectural simulator with support for fast
queueing operations. On a subset of SPECint 2000, our
automatically parallelized code using static load
balance is only 19\% slower, on average, than the
manually parallelized version on a simulated eight-core
system. In addition, our automatically parallelized
code using dynamic load balance is competitive, on
average, to the manually parallelized version on a
simulated eight-core system. Furthermore, all the
applications except parser achieve better speedups with
our automatic algorithms than with the manual approach.
Also, our approach introduces very little overhead in
the main program-it is kept under 100\%, which is more
than a 5.3$ \times $ reduction compared to serial
Mudflap.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "30",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Dubach:2013:DMA,
author = "Christophe Dubach and Timothy M. Jones and Edwin V.
Bonilla",
title = "Dynamic microarchitectural adaptation using machine
learning",
journal = j-TACO,
volume = "10",
number = "4",
pages = "31:1--31:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541238",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Adaptive microarchitectures are a promising solution
for designing high-performance, power-efficient
microprocessors. They offer the ability to tailor
computational resources to the specific requirements of
different programs or program phases. They have the
potential to adapt the hardware cost-effectively at
runtime to any application's needs. However, one of the
key challenges is how to dynamically determine the best
architecture configuration at any given time, for any
new workload. This article proposes a novel control
mechanism based on a predictive model for
microarchitectural adaptivity control. This model is
able to efficiently control adaptivity by monitoring
the behaviour of an application's different phases at
runtime. We show that by using this model on SPEC 2000,
we double the energy\slash performance efficiency of
the processor when compared to the best static
configuration tuned for the whole benchmark suite. This
represents 74\% of the improvement available if we know
the best microarchitecture for each program phase ahead
of time. In addition, we present an extended analysis
of the best configurations found and show that the
overheads associated with the implementation of our
scheme have a negligible impact on performance and
power.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "31",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2013:CME,
author = "Long Chen and Yanan Cao and Zhao Zhang",
title = "{E$^3$CC}: a memory error protection scheme with novel
address mapping for subranked and low-power memories",
journal = j-TACO,
volume = "10",
number = "4",
pages = "32:1--32:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541239",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This study presents and evaluates E$^3$ CC (Enhanced
Embedded ECC), a full design and implementation of a
generic embedded ECC scheme that enables
power-efficient error protection for subranked memory
systems. It incorporates a novel address mapping scheme
called Biased Chinese Remainder Mapping (BCRM) to
resolve the address mapping issue for memories of page
interleaving, plus a simple and effective cache design
to reduce extra ECC traffic. Our evaluation using SPEC
CPU2006 benchmarks confirms the performance and power
efficiency of the E$^3$ CC scheme for subranked
memories as well as conventional memories.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "32",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tian:2013:TBM,
author = "Yingying Tian and Samira M. Khan and Daniel A.
Jim{\'e}nez",
title = "Temporal-based multilevel correlating inclusive cache
replacement",
journal = j-TACO,
volume = "10",
number = "4",
pages = "33:1--33:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555290",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Inclusive caches have been widely used in Chip
Multiprocessors (CMPs) to simplify cache coherence.
However, they have poor performance compared with
noninclusive caches not only because of the limited
capacity of the entire cache hierarchy but also due to
ignorance of temporal locality of the Last-Level Cache
(LLC). Blocks that are highly referenced (referred to
as hot blocks ) are always hit in higher-level caches
(e.g., L1 cache) and are rarely referenced in the LLC.
Therefore, they become replacement victims in the LLC.
Due to the inclusion property, blocks evicted from the
LLC have to also be invalidated from higher-level
caches. Invalidation of hot blocks from the entire
cache hierarchy introduces costly off-chip misses that
makes the inclusive cache perform poorly. Neither
blocks that are highly referenced in the LLC nor blocks
that are highly referenced in higher-level caches
should be the LLC replacement victims. We propose
temporal-based multilevel correlating cache replacement
for inclusive caches to evict blocks in the LLC that
are also not hot in higher-level caches using
correlated temporal information acquired from all
levels of a cache hierarchy with minimal overhead.
Invalidation of these blocks does not hurt the
performance. By contrast, replacing them as early as
possible with useful blocks helps improve cache
performance. Based on our experiments, in a dual-core
CMP, an inclusive cache with temporal-based multilevel
correlating cache replacement significantly outperforms
an inclusive cache with traditional LRU replacement by
yielding an average speedup of 12.7\%, which is
comparable to an enhanced noninclusive cache, while
requiring less than 1\% of storage overhead.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "33",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Liu:2013:HSA,
author = "Qixiao Liu and Miquel Moreto and Victor Jimenez and
Jaume Abella and Francisco J. Cazorla and Mateo
Valero",
title = "Hardware support for accurate per-task energy metering
in multicore systems",
journal = j-TACO,
volume = "10",
number = "4",
pages = "34:1--34:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555291",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Accurately determining the energy consumed by each
task in a system will become of prominent importance in
future multicore-based systems because it offers
several benefits, including (i) better application
energy/performance optimizations, (ii) improved
energy-aware task scheduling, and (iii) energy-aware
billing in data centers. Unfortunately, existing
methods for energy metering in multicores fail to
provide accurate energy estimates for each task when
several tasks run simultaneously. This article makes a
case for accurate Per-Task Energy Metering (PTEM) based
on tracking the resource utilization and occupancy of
each task. Different hardware implementations with
different trade-offs between energy prediction accuracy
and hardware-implementation complexity are proposed.
Our evaluation shows that the energy consumed in a
multicore by each task can be accurately measured. For
a 32-core, 2-way, simultaneous multithreaded core
setup, PTEM reduces the average accuracy error from
more than 12\% when our hardware support is not used to
less than 4\% when it is used. The maximum observed
error for any task in the workload we used reduces from
58\% down to 9\% when our hardware support is used.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "34",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mehta:2013:TSS,
author = "Sanyam Mehta and Gautham Beeraka and Pen-Chung Yew",
title = "Tile size selection revisited",
journal = j-TACO,
volume = "10",
number = "4",
pages = "35:1--35:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555292",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Loop tiling is a widely used loop transformation to
enhance data locality and allow data reuse. In the
tiled code, however, tiles of different sizes can lead
to significant variation in performance. Thus,
selection of an optimal tile size is critical to
performance of tiled codes. In the past, tile size
selection has been attempted using both static
analytical and dynamic empirical (auto-tuning) models.
Past work using static models assumed a direct-mapped
cache for the purpose of analysis and thus proved to be
less robust. On the other hand, the auto-tuning models
involve an exhaustive search in a large space of tiled
codes. In this article, we propose a new analytical
model for tile size selection that leverages the high
set associativity in modern caches to minimize conflict
misses. Our tile size selection model targets data
reuse in multiple levels of cache. In addition, it
considers the interaction of tiling with the SIMD unit
in modern processors in estimating the optimal tile
size. We find that these factors, not considered in
previous models, are critical in developing a robust
model for tile size selection. We implement our tile
size selection model in a polyhedral compiler and test
it on 12 benchmark kernels using two different problem
sizes. Our model outperforms the previous analytical
models that are based on reusing data in a single level
of cache and achieves an average performance
improvement of 9.7\% and 20.4\%, respectively, over the
best square (cubic) tiles for the two problem sizes. In
addition, the tile size chosen by our tile size
selection algorithm is similar to the best performing
size obtained through an extensive search, validating
the analytical model underlying the algorithm.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "35",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Prisacari:2013:FPS,
author = "Bogdan Prisacari and German Rodriguez and Cyriel
Minkenberg and Torsten Hoefler",
title = "Fast pattern-specific routing for fat tree networks",
journal = j-TACO,
volume = "10",
number = "4",
pages = "36:1--36:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555293",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In the context of eXtended Generalized Fat Tree (XGFT)
topologies, widely used in HPC and datacenter network
designs, we propose a generic method, based on Integer
Linear Programming (ILP), to efficiently determine
optimal routes for arbitrary workloads. We propose a
novel approach that combines ILP with dynamic
programming, effectively reducing the time to solution.
Specifically, we divide the network into smaller
subdomains optimized using a custom ILP formulation
that ensures global optimality of local solutions.
Local solutions are then combined into an optimal
global solution using dynamic programming. Finally, we
demonstrate through a series of extensive benchmarks
that our approach scales in practice to networks
interconnecting several thousands of nodes, using a
single-threaded, freely available linear programming
solver on commodity hardware, with the potential for
higher scalability by means of commercial, parallel
solvers.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "36",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Breughe:2013:SRB,
author = "Maximilien B. Breughe and Lieven Eeckhout",
title = "Selecting representative benchmark inputs for
exploring microprocessor design spaces",
journal = j-TACO,
volume = "10",
number = "4",
pages = "37:1--37:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555294",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The design process of a microprocessor requires
representative workloads to steer the search process
toward an optimum design point for the target
application domain. However, considering a broad set of
workloads to cover the large space of potential
workloads is infeasible given how time-consuming design
space exploration typically is. Hence, it is crucial to
select a small yet representative set of workloads,
which leads to a shorter design cycle while yielding a
(near) optimal design. Prior work has mostly looked
into selecting representative benchmarks; however,
limited attention was given to the selection of
benchmark inputs and how this affects workload
representativeness during design space exploration.
Using a set of 1,000 inputs for a number of embedded
benchmarks and a design space with around 1,700 design
points, we find that selecting a single or three random
input(s) per benchmark potentially (in a worst-case
scenario) leads to a suboptimal design that is 56\% and
33\% off, on average, relative to the optimal design in
our design space in terms of Energy-Delay Product
(EDP). We then propose and evaluate a number of methods
for selecting representative inputs and show that we
can find the optimum design point with as few as three
inputs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "37",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kerschbaumer:2013:IFT,
author = "Christoph Kerschbaumer and Eric Hennigan and Per
Larsen and Stefan Brunthaler and Michael Franz",
title = "Information flow tracking meets just-in-time
compilation",
journal = j-TACO,
volume = "10",
number = "4",
pages = "38:1--38:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555295",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Web applications are vulnerable to cross-site
scripting attacks that enable data thefts. Information
flow tracking in web browsers can prevent communication
of sensitive data to unintended recipients and thereby
stop such data thefts. Unfortunately, existing
solutions have focused on incorporating information
flow into browsers' JavaScript interpreters, rather
than just-in-time compilers, rendering the resulting
performance noncompetitive. Few users will switch to a
safer browser if it comes at the cost of significantly
degrading web application performance. We present the
first information flow tracking JavaScript engine that
is based on a true just-in-time compiler, and that
thereby outperforms all previous interpreter-based
information flow tracking JavaScript engines by more
than a factor of two. Our JIT-based engine (i) has the
same coverage as previous interpreter- based solutions,
(ii) requires reasonable implementation effort, and
(iii) introduces new optimizations to achieve
acceptable performance. When evaluated against three
industry-standard JavaScript benchmark suites, there is
still an average slowdown of 73\% over engines that do
not support information flow, but this is now well
within the range that many users will find an
acceptable price for obtaining substantially increased
security.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "38",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Nasre:2013:TSE,
author = "Rupesh Nasre",
title = "Time- and space-efficient flow-sensitive points-to
analysis",
journal = j-TACO,
volume = "10",
number = "4",
pages = "39:1--39:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555296",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Compilation of real-world programs often requires
hours. The term nightly build known to industrial
researchers is an artifact of long compilation times.
Our goal is to reduce the absolute analysis times for
large C codes (of the order of millions of lines).
Pointer analysis is one of the key analyses performed
during compilation. Its scalability is paramount to
achieve the efficiency of the overall compilation
process and its precision directly affects that of the
client analyses. In this work, we design a time- and
space-efficient flow-sensitive pointer analysis and
parallelize it on graphics processing units. Our
analysis proposes to use an extended bloom filter,
called multibloom, to store points-to information in an
approximate manner and develops an analysis in terms of
the operations over the multibloom. Since bloom filter
is a probabilistic data structure, we develop ways to
gain back the analysis precision. We achieve effective
parallelization by achieving memory coalescing,
reducing thread divergence, and improving load balance
across GPU warps. Compared to a state-of-the-art
sequential solution, our parallel version achieves a
7.8 $ \times $ speedup with less than 5\% precision
loss on a suite of six large programs. Using two client
transformations, we show that this loss in precision
only minimally affects a client's precision.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ruan:2013:BTB,
author = "Wenjia Ruan and Yujie Liu and Michael Spear",
title = "Boosting timestamp-based transactional memory by
exploiting hardware cycle counters",
journal = j-TACO,
volume = "10",
number = "4",
pages = "40:1--40:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555297",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Time-based transactional memories typically rely on a
shared memory counter to ensure consistency.
Unfortunately, such a counter can become a bottleneck.
In this article, we identify properties of hardware
cycle counters that allow their use in place of a
shared memory counter. We then devise algorithms that
exploit the x86 cycle counter to enable bottleneck-free
transactional memory runtime systems. We also consider
the impact of privatization safety and hardware
ordering constraints on the correctness, performance,
and generality of our algorithms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "40",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Dey:2013:RMD,
author = "Tanima Dey and Wei Wang and Jack W. Davidson and Mary
Lou Soffa",
title = "{ReSense}: Mapping dynamic workloads of colocated
multithreaded applications using resource sensitivity",
journal = j-TACO,
volume = "10",
number = "4",
pages = "41:1--41:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555298",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "To utilize the full potential of modern chip
multiprocessors and obtain scalable performance
improvements, it is critical to mitigate resource
contention created by multithreaded workloads. In this
article, we describe ReSense, the first runtime system
that uses application characteristics to dynamically
map multithreaded applications from dynamic
workloads-workloads where multithreaded applications
arrive, execute, and terminate continuously in
unpredictable ways. ReSense mitigates contention for
the shared resources in the memory hierarchy by
applying a novel thread-mapping algorithm that
dynamically adjusts the mapping of threads from dynamic
workloads using a precalculated sensitivity score. The
sensitivity score quantifies an application's
sensitivity to sharing a particular memory resource and
is calculated by an efficient characterization process
that involves running the multithreaded application by
itself on the target platform. To measure ReSense's
effectiveness, sensitivity scores were determined for
21 benchmarks from PARSEC-2.1 and NPB-OMP-3.3 for the
shared resources in the memory hierarchy on four
different platforms. Using three different-sized
dynamic workloads composed of randomly selected two,
four, and eight corunning benchmarks with randomly
selected start times, ReSense was able to improve the
average response time of the three workloads by up to
27.03\%, 20.89\%, and 29.34\% and throughput by up to
19.97\%, 46.56\%, and 29.86\%, respectively, over the
native OS on real hardware. By estimating and comparing
ReSense's effectiveness with the optimal thread mapping
for two different workloads, we found that the maximum
average difference with the experimentally determined
optimal performance was 1.49\% for average response
time and 2.08\% for throughput.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "41",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Armejach:2013:TIP,
author = "Adri{\`a} Armejach and Ruben Titos-Gil and Anurag Negi
and Osman S. Unsal and Adri{\'a}n Cristal",
title = "Techniques to improve performance in requester-wins
hardware transactional memory",
journal = j-TACO,
volume = "10",
number = "4",
pages = "42:1--42:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555299",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The simplicity of requester-wins Hardware
Transactional Memory (HTM) makes it easy to incorporate
in existing chip multiprocessors. Hence, such systems
are expected to be widely available in the near future.
Unfortunately, these implementations are prone to
suffer severe performance degradation due to transient
and persistent livelock conditions. This article shows
that existing techniques are unable to mitigate this
degradation effectively. It then proposes and evaluates
four novel techniques-two software-based that employ
information provided by the hardware and two that
require simple core-local hardware additions-which have
the potential to boost the performance of
requester-wins HTM designs substantially.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "42",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jeon:2013:RDR,
author = "Myeongjae Jeon and Conglong Li and Alan L. Cox and
Scott Rixner",
title = "Reducing {DRAM} row activations with eager read\slash
write clustering",
journal = j-TACO,
volume = "10",
number = "4",
pages = "43:1--43:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555300",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article describes and evaluates a new approach to
optimizing DRAM performance and energy consumption that
is based on eagerly writing dirty cache lines to DRAM.
Under this approach, many dirty cache lines are written
to DRAM before they are evicted. In particular, dirty
cache lines that have not been recently accessed are
eagerly written to DRAM when the corresponding row has
been activated by an ordinary, noneager access, such as
a read. This approach enables clustering of reads and
writes that target the same row, resulting in a
significant reduction in row activations. Specifically,
for a variety of applications, it reduces the number of
DRAM row activations by an average of 42\% and a
maximum of 82\%. Moreover, the results from a
full-system simulator show compelling performance
improvements and energy consumption reductions. Out of
23 applications, 6 have overall performance
improvements between 10\% and 20\%, and 3 have
improvements in excess of 20\%. Furthermore, 12 consume
between 10\% and 20\% less DRAM energy, and 7 have
energy consumption reductions in excess of 20\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "43",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhao:2013:HPP,
author = "Zhijia Zhao and Michael Bebenita and Dave Herman and
Jianhua Sun and Xipeng Shen",
title = "{HPar}: a practical parallel parser for {HTML} ---
taming {HTML} complexities for parallel parsing",
journal = j-TACO,
volume = "10",
number = "4",
pages = "44:1--44:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555301",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Parallelizing HTML parsing is challenging due to the
complexities of HTML documents and the inherent
dependencies in its parsing algorithm. As a result,
despite numerous studies in parallel parsing, HTML
parsing remains sequential today. It forms one of the
final barriers for fully parallelizing browser
operations to minimize the browser's response time-an
important variable for user experiences, especially on
portable devices. This article provides a comprehensive
analysis on the special complexities of parallel HTML
parsing and presents a systematic exploration in
overcoming those difficulties through specially
designed speculative parallelizations. This work
develops, to the best of our knowledge, the first
pipelining and data-level parallel HTML parsers. The
data-level parallel parser, named HPar, achieves up to
2.4$ \times $ speedup on quadcore devices. This work
demonstrates the feasibility of efficient, parallel
HTML parsing for the first time and offers a set of
novel insights for parallel HTML parsing",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "44",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Totoni:2013:EFE,
author = "Ehsan Totoni and Mert Dikmen and Mar{\'\i}a Jes{\'u}s
Garzar{\'a}n",
title = "Easy, fast, and energy-efficient object detection on
heterogeneous on-chip architectures",
journal = j-TACO,
volume = "10",
number = "4",
pages = "45:1--45:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555302",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We optimize a visual object detection application
(that uses Vision Video Library kernels) and show that
OpenCL is a unified programming paradigm that can
provide high performance when running on the Ivy Bridge
heterogeneous on-chip architecture. We evaluate
different mapping techniques and show that running each
kernel where it fits the best and using software
pipelining can provide 1.91 times higher performance
and 42\% better energy efficiency. We also show how to
trade accuracy for energy at runtime. Overall, our
application can perform accurate object detection at 40
frames per second (fps) in an energy-efficient
manner.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "45",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fedorov:2013:AAL,
author = "Viacheslav V. Fedorov and Sheng Qiu and A. L.
Narasimha Reddy and Paul V. Gratz",
title = "{ARI}: Adaptive {LLC}-memory traffic management",
journal = j-TACO,
volume = "10",
number = "4",
pages = "46:1--46:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2543697",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Decreasing the traffic from the CPU LLC to main memory
is a very important issue in modern systems. Recent
work focuses on cache misses, overlooking the impact of
writebacks on the total memory traffic, energy
consumption, IPC, and so forth. Policies that foster a
balanced approach, between reducing write traffic to
memory and improving miss rates, can increase overall
performance and improve energy efficiency and memory
system lifetime for NVM memory technology, such as
phase-change memory (PCM). We propose Adaptive
Replacement and Insertion (ARI), an adaptive approach
to last-level CPU cache management, optimizing the two
parameters (miss rate and writeback rate)
simultaneously. Our specific focus is to reduce
writebacks as much as possible while maintaining or
improving the miss rate relative to conventional LRU
replacement policy. ARI reduces LLC writebacks by 33\%,
on average, while also decreasing misses by 4.7\%, on
average. In a typical system, this boosts IPC by 4.9\%,
on average, while decreasing energy consumption by
8.9\%. These results are achieved with minimal hardware
overheads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "46",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gonzalez-Alvarez:2013:AAD,
author = "Cecilia Gonz{\'a}lez-{\'A}lvarez and Jennifer B.
Sartor and Carlos {\'A}lvarez and Daniel
Jim{\'e}nez-Gonz{\'a}lez and Lieven Eeckhout",
title = "Accelerating an application domain with specialized
functional units",
journal = j-TACO,
volume = "10",
number = "4",
pages = "47:1--47:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555303",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Hardware specialization has received renewed interest
recently as chips are hitting power limits. Chip
designers of traditional processor architectures have
primarily focused on general-purpose computing,
partially due to time-to-market pressure and simpler
design processes. But new power limits require some
chip specialization. Although hardware configured for a
specific application yields large speedups for
low-power dissipation, its design is more complex and
less reusable. We instead explore domain-based
specialization, a scalable approach that balances
hardware's reusability and performance efficiency. We
focus on specialization using customized compute units
that accelerate particular operations. In this article,
we develop automatic techniques to identify code
sequences from different applications within a domain
that can be targeted to a new custom instruction that
will be run inside a configurable specialized
functional unit (SFU). We demonstrate that using a
canonical representation of computations finds more
common code sequences among applications that can be
mapped to the same custom instruction, leading to
larger speedups while specializing a smaller core area
than previous pattern-matching techniques. We also
propose new heuristics to narrow the search space of
domain-specific custom instructions, finding those that
achieve the best performance across applications. We
estimate the overall performance achieved with our
automatic techniques using hardware models on a set of
nine media benchmarks, showing that when limiting the
core area devoted to specialization, the SFU
customization with the largest speedups includes both
application- and domain-specific custom instructions.
We demonstrate that exploring domain-specific hardware
acceleration is key to continued computing system
performance improvements.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "47",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2013:RMM,
author = "Xiaolin Wang and Lingmei Weng and Zhenlin Wang and
Yingwei Luo",
title = "Revisiting memory management on virtualized
environments",
journal = j-TACO,
volume = "10",
number = "4",
pages = "48:1--48:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555304",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With the evolvement of hardware, 64-bit Central
Processing Units (CPUs) and 64-bit Operating Systems
(OSs) have dominated the market. This article
investigates the performance of virtual memory
management of Virtual Machines (VMs) with a large
virtual address space in 64-bit OSs, which imposes
different pressure on memory virtualization than 32-bit
systems. Each of the two conventional memory
virtualization approaches, Shadowing Paging (SP) and
Hardware-Assisted Paging (HAP), causes different
overhead for different applications. Our experiments
show that 64-bit applications prefer to run in a VM
using SP, while 32-bit applications do not have a
uniform preference between SP and HAP. In this article,
we trace this inconsistency between 32-bit applications
and 64-bit applications to its root cause through a
systematic empirical study in Linux systems and
discover that the major overhead of SP results from
memory management in the 32-bit GNU C library ( glibc
). We propose enhancements to the existing memory
management algorithms, which substantially reduce the
overhead of SP. Based on the evaluations using SPEC
CPU2006, Parsec 2.1, and cloud benchmarks, our results
show that SP, with the improved memory allocators, can
compete with HAP in almost all cases, in both 64-bit
and 32-bit systems. We conclude that without a
significant breakthrough in HAP, researchers should pay
more attention to SP, which is more flexible and cost
effective.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "48",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jiang:2013:PAP,
author = "Chuntao Jiang and Zhibin Yu and Hai Jin and Chengzhong
Xu and Lieven Eeckhout and Wim Heirman and Trevor E.
Carlson and Xiaofei Liao",
title = "{PCantorSim}: Accelerating parallel architecture
simulation through fractal-based sampling",
journal = j-TACO,
volume = "10",
number = "4",
pages = "49:1--49:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555305",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Computer architects rely heavily on microarchitecture
simulation to evaluate design alternatives.
Unfortunately, cycle-accurate simulation is extremely
slow, being at least 4 to 6 orders of magnitude slower
than real hardware. This longstanding problem is
further exacerbated in the multi-/many-core era,
because single-threaded simulation performance has not
improved much, while the design space has expanded
substantially. Parallel simulation is a promising
approach, yet does not completely solve the simulation
challenge. Furthermore, existing sampling techniques,
which are widely used for single-threaded applications,
do not readily apply to multithreaded applications as
thread interaction and synchronization must now be
taken into account. This work presents PCantorSim, a
novel Cantor set (a classic fractal)--based sampling
scheme to accelerate parallel simulation of
multithreaded applications. Through the use of the
proposed methodology, only less than 5\% of an
application's execution time is simulated in detail. We
have implemented our approach in Sniper (a parallel
multicore simulator) and evaluated it by running the
PARSEC benchmarks on a simulated 8-core system. The
results show that PCantorSim increases simulation speed
over detailed parallel simulation by a factor of 20$
\times $, on average, with an average absolute
execution time prediction error of 5.3\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "49",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Stipic:2013:PGT,
author = "Srdan Stipi{\'c} and Vesna Smiljkovi{\'c} and Osman
Unsal and Adri{\'a}n Cristal and Mateo Valero",
title = "Profile-guided transaction coalescing-lowering
transactional overheads by merging transactions",
journal = j-TACO,
volume = "10",
number = "4",
pages = "50:1--50:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555306",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Previous studies in software transactional memory
mostly focused on reducing the overhead of
transactional read and write operations. In this
article, we introduce transaction coalescing, a
profile-guided compiler optimization technique that
attempts to reduce the overheads of starting and
committing a transaction by merging two or more small
transactions into one large transaction. We develop a
profiling tool and a transaction coalescing heuristic
to identify candidate transactions suitable for
coalescing. We implement a compiler extension to
automatically merge the candidate transactions at the
compile time. We evaluate the effectiveness of our
technique using the hash table micro-benchmark and the
STAMP benchmark suite. Transaction coalescing improves
the performance of the hash table significantly and the
performance of Vacation and SSCA2 benchmarks by 19.4\%
and 36.4\%, respectively, when running with 12
threads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "50",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2013:WWA,
author = "Zhe Wang and Shuchang Shan and Ting Cao and Junli Gu
and Yi Xu and Shuai Mu and Yuan Xie and Daniel A.
Jim{\'e}nez",
title = "{WADE}: Writeback-aware dynamic cache management for
{NVM}-based main memory system",
journal = j-TACO,
volume = "10",
number = "4",
pages = "51:1--51:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555307",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Emerging Non-Volatile Memory (NVM) technologies are
explored as potential alternatives to traditional
SRAM/DRAM-based memory architecture in future
microprocessor design. One of the major disadvantages
for NVM is the latency and energy overhead associated
with write operations. Mitigation techniques to
minimize the write overhead for NVM-based main memory
architecture have been studied extensively. However,
most prior work focuses on optimization techniques for
NVM-based main memory itself, with little attention
paid to cache management policies for the Last-Level
Cache (LLC). In this article, we propose a
Writeback-Aware Dynamic CachE (WADE) management
technique to help mitigate the write overhead in
NVM-based memory. The proposal is based on the
observation that, when dirty cache blocks are evicted
from the LLC and written into NVM-based memory (with
PCM as an example), the long latency and high energy
associated with write operations to NVM-based memory
can cause system performance/power degradation. Thus,
reducing the number of writeback requests from the LLC
is critical. The proposed WADE cache management
technique tries to keep highly reused dirty cache
blocks in the LLC. The technique predicts blocks that
are frequently written back in the LLC. The LLC sets
are dynamically partitioned into a frequent writeback
list and a nonfrequent writeback list. It keeps a best
size of each list in the LLC. Our evaluation shows that
the technique can reduce the number of writeback
requests by 16.5\% for memory-intensive single-threaded
benchmarks and 10.8\% for multicore workloads. It
yields a geometric mean speedup of 5.1\% for
single-thread applications and 7.6\% for multicore
workloads. Due to the reduced number of writeback
requests to main memory, the technique reduces the
energy consumption by 8.1\% for single-thread
applications and 7.6\% for multicore workloads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "51",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2013:CCC,
author = "Yong Li and Yaojun Zhang and Hai LI and Yiran Chen and
Alex K. Jones",
title = "{C1C}: a configurable, compiler-guided {STT-RAM L1}
cache",
journal = j-TACO,
volume = "10",
number = "4",
pages = "52:1--52:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555308",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Spin-Transfer Torque RAM (STT-RAM), a promising
alternative to SRAM for reducing leakage power
consumption, has been widely studied to mitigate the
impact of its asymmetrically long write latency.
Recently, STT-RAM has been proposed for L1 caches by
relaxing the data retention time to improve write
performance and dynamic energy. However, as the
technology scales down from 65nm to 22nm, the
performance of the read operation scales poorly due to
reduced sense margins and sense amplifier delays. In
this article, we leverage a dual-mode STT memory cell
to design a configurable L1 cache architecture termed
C1C to mitigate read performance barriers with
technology scaling. Guided by application access
characteristics discovered through novel compiler
analyses, the proposed cache adaptively switches
between a high performance and a low-power access mode.
Our evaluation demonstrates that the proposed cache
with compiler guidance outperforms a state-of-the-art
STT-RAM cache design by 9\% with high dynamic energy
efficiency, leading to significant performance/watt
improvements over several competing approaches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fauzia:2013:BRD,
author = "Naznin Fauzia and Venmugil Elango and Mahesh
Ravishankar and J. Ramanujam and Fabrice Rastello and
Atanas Rountev and Louis-No{\"e}l Pouchet and P.
Sadayappan",
title = "Beyond reuse distance analysis: Dynamic analysis for
characterization of data locality potential",
journal = j-TACO,
volume = "10",
number = "4",
pages = "53:1--53:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555309",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Emerging computer architectures will feature
drastically decreased flops/byte (ratio of peak
processing rate to memory bandwidth) as highlighted by
recent studies on Exascale architectural trends.
Further, flops are getting cheaper, while the energy
cost of data movement is increasingly dominant. The
understanding and characterization of data locality
properties of computations is critical in order to
guide efforts to enhance data locality. Reuse distance
analysis of memory address traces is a valuable tool to
perform data locality characterization of programs. A
single reuse distance analysis can be used to estimate
the number of cache misses in a fully associative LRU
cache of any size, thereby providing estimates on the
minimum bandwidth requirements at different levels of
the memory hierarchy to avoid being bandwidth bound.
However, such an analysis only holds for the particular
execution order that produced the trace. It cannot
estimate potential improvement in data locality through
dependence-preserving transformations that change the
execution schedule of the operations in the
computation. In this article, we develop a novel
dynamic analysis approach to characterize the inherent
locality properties of a computation and thereby assess
the potential for data locality enhancement via
dependence-preserving transformations. The execution
trace of a code is analyzed to extract a
Computational-Directed Acyclic Graph (CDAG) of the data
dependences. The CDAG is then partitioned into convex
subsets, and the convex partitioning is used to reorder
the operations in the execution trace to enhance data
locality. The approach enables us to go beyond reuse
distance analysis of a single specific order of
execution of the operations of a computation in
characterization of its data locality properties. It
can serve a valuable role in identifying promising code
regions for manual transformation, as well as assessing
the effectiveness of compiler transformations for data
locality enhancement. We demonstrate the effectiveness
of the approach using a number of benchmarks, including
case studies where the potential shown by the analysis
is exploited to achieve lower data movement costs and
better performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "53",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bardizbanyan:2013:DPD,
author = "Alen Bardizbanyan and Magnus Sj{\"a}lander and David
Whalley and Per Larsson-Edefors",
title = "Designing a practical data filter cache to improve
both energy efficiency and performance",
journal = j-TACO,
volume = "10",
number = "4",
pages = "54:1--54:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555310",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Conventional Data Filter Cache (DFC) designs improve
processor energy efficiency, but degrade performance.
Furthermore, the single-cycle line transfer suggested
in prior studies adversely affects Level-1 Data Cache
(L1 DC) area and energy efficiency. We propose a
practical DFC that is accessed early in the pipeline
and transfers a line over multiple cycles. Our DFC
design improves performance and eliminates a
substantial fraction of L1 DC accesses for loads, L1 DC
tag checks on stores, and data translation lookaside
buffer accesses for both loads and stores. Our
evaluation shows that the proposed DFC can reduce the
data access energy by 42.5\% and improve execution time
by 4.2\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "54",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hagiescu:2013:GCG,
author = "Andrei Hagiescu and Bing Liu and R. Ramanathan and
Sucheendra K. Palaniappan and Zheng Cui and Bipasa
Chattopadhyay and P. S. Thiagarajan and Weng-Fai Wong",
title = "{GPU} code generation for {ODE}-based applications
with phased shared-data access patterns",
journal = j-TACO,
volume = "10",
number = "4",
pages = "55:1--55:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555311",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We present a novel code generation scheme for GPUs.
Its key feature is the platform-aware generation of a
heterogeneous pool of threads. This exposes more
data-sharing opportunities among the concurrent threads
and reduces the memory requirements that would
otherwise exceed the capacity of the on-chip memory.
Instead of the conventional strategy of focusing on
exposing as much parallelism as possible, our scheme
leverages on the phased nature of memory access
patterns found in many applications that exhibit
massive parallelism. We demonstrate the effectiveness
of our code generation strategy on a computational
systems biology application. This application consists
of computing a Dynamic Bayesian Network (DBN)
approximation of the dynamics of signalling pathways
described as a system of Ordinary Differential
Equations (ODEs). The approximation algorithm involves
(i) sampling many (of the order of a few million) times
from the set of initial states, (ii) generating
trajectories through numerical integration, and (iii)
storing the statistical properties of this set of
trajectories in Conditional Probability Tables (CPTs)
of a DBN via a prespecified discretization of the time
and value domains. The trajectories can be computed in
parallel. However, the intermediate data needed for
computing them, as well as the entries for the CPTs,
are too large to be stored locally. Our experiments
show that the proposed code generation scheme scales
well, achieving significant performance improvements on
three realistic signalling pathways models. These
results suggest how our scheme could be extended to
deal with other applications involving systems of
ODEs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lee:2013:TLS,
author = "Junghee Lee and Chrysostomos Nicopoulos and Hyung Gyu
Lee and Jongman Kim",
title = "{TornadoNoC}: a lightweight and scalable on-chip
network architecture for the many-core era",
journal = j-TACO,
volume = "10",
number = "4",
pages = "56:1--56:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555312",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The rapid emergence of Chip Multi-Processors (CMP) as
the de facto microprocessor archetype has highlighted
the importance of scalable and efficient on-chip
networks. Packet-based Networks-on-Chip (NoC) are
gradually cementing themselves as the medium of choice
for the multi-/many-core systems of the near future,
due to their innate scalability. However, the
prominence of the debilitating power wall requires the
NoC to also be as energy efficient as possible. To
achieve these two antipodal requirements-scalability
and energy efficiency-we propose TornadoNoC, an
interconnect architecture that employs a novel flow
control mechanism. To prevent livelocks and deadlocks,
a sequence numbering scheme and a dynamic ring
inflation technique are proposed, and their correctness
formally proven. The primary objective of TornadoNoC is
to achieve substantial gains in (a) scalability to
many-core systems and (b) the area/power footprint, as
compared to current state-of-the-art router
implementations. The new router is demonstrated to
provide better scalability to hundreds of cores than an
ideal single-cycle wormhole implementation and other
scalability-enhanced low-cost routers. Extensive
simulations using both synthetic traffic patterns and
real applications running in a full-system simulator
corroborate the efficacy of the proposed design.
Finally, hardware synthesis analysis using commercial
65nm standard-cell libraries indicates that the area
and power budgets of the new router are reduced by up
to 53\% and 58\%, respectively, as compared to existing
state-of-the-art low-cost routers.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "56",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Strydis:2013:SAP,
author = "Christos Strydis and Robert M. Seepers and Pedro
Peris-Lopez and Dimitrios Siskos and Ioannis Sourdis",
title = "A system architecture, processor, and communication
protocol for secure implants",
journal = j-TACO,
volume = "10",
number = "4",
pages = "57:1--57:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555313",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Secure and energy-efficient communication between
Implantable Medical Devices (IMDs) and authorized
external users is attracting increasing attention these
days. However, there currently exists no systematic
approach to the problem, while solutions from
neighboring fields, such as wireless sensor networks,
are not directly transferable due to the peculiarities
of the IMD domain. This work describes an original,
efficient solution for secure IMD communication. A new
implant system architecture is proposed, where security
and main-implant functionality are made completely
decoupled by running the tasks onto two separate cores.
Wireless communication goes through a custom security
ASIP, called SISC (Smart-Implant Security Core), which
runs an energy-efficient security protocol. The
security core is powered by RF-harvested energy until
it performs external-reader authentication, providing
an elegant defense mechanism against battery
Denial-of-Service (DoS) and other, more common attacks.
The system has been evaluated based on a realistic case
study involving an artificial pancreas implant. When
synthesized for a UMC 90nm CMOS ASIC technology, our
system architecture achieves defense against
unauthorized accesses having zero energy cost, running
entity authentication through harvesting only 7.45 $
\mu $J of RF energy from the requesting entity. In all
other successfully authenticated accesses, our
architecture achieves secure data exchange without
affecting the performance of the main IMD
functionality, adding less than 1o/oo (1.3 mJ ) to the
daily energy consumption of a typical implant. Compared
to a singe-core, secure reference IMD, which would
still be more vulnerable to some types of attacks, our
secure system on chip (SoC) achieves high security
levels at 56\% energy savings and at an area overhead
of less than 15\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "57",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kim:2013:FMS,
author = "Wonsub Kim and Yoonseo Choi and Haewoo Park",
title = "Fast modulo scheduler utilizing patternized routes for
coarse-grained reconfigurable architectures",
journal = j-TACO,
volume = "10",
number = "4",
pages = "58:1--58:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555314",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Coarse-Grained Reconfigurable Architectures (CGRAs)
present a potential of high compute throughput with
energy efficiency. A CGRA consists of an array of
Functional Units (FUs), which communicate with each
other through an interconnect network containing
transmission nodes and register files. To achieve high
performance from the software solutions mapped onto
CGRAs, modulo scheduling of loops is generally
employed. One of the key challenges in modulo
scheduling for CGRAs is to explicitly handle routings
of operands from a source to a destination operations
through various routing resources. Existing modulo
schedulers for CGRAs are slow because finding a valid
routing is generally a searching problem over a large
space, even with the guidance of well-defined cost
metrics. Applications in traditional embedded
multimedia domains are regarded as relatively tolerant
to a slow compile time in exchange for a high-quality
solution. However, many rapidly growing domains of
applications, such as 3D graphics, require a fast
compilation. Entrances of CGRAs to these domains have
been blocked mainly due to their long compile time. We
attack this problem by utilizing patternized routes,
for which resources and time slots for a success can be
estimated in advance when a source operation is placed.
By conservatively reserving predefined resources at
predefined time slots, future routings originating from
the source operation are guaranteed. Experiments on a
real-world 3D graphics benchmark suite show that our
scheduler improves the compile time up to 6,000 times
while achieving an average 70\% throughputs of the
state-of-the-art CGRA modulo scheduler, the
Edge-centric Modulo Scheduler (EMS).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "58",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Nuzman:2013:JTC,
author = "Dorit Nuzman and Revital Eres and Sergei Dyshel and
Marcel Zalmanovici and Jose Castanos",
title = "{JIT} technology with {C\slash C++}: Feedback-directed
dynamic recompilation for statically compiled
languages",
journal = j-TACO,
volume = "10",
number = "4",
pages = "59:1--59:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555315",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The growing gap between the advanced capabilities of
static compilers as reflected in benchmarking results
and the actual performance that users experience in
real-life scenarios makes client-side dynamic
optimization technologies imperative to the domain of
static languages. Dynamic optimization of software
distributed in the form of a platform-agnostic
Intermediate-Representation (IR) has been very
successful in the domain of managed languages, greatly
improving upon interpreted code, especially when online
profiling is used. However, can such feedback-directed
IR-based dynamic code generation be viable in the
domain of statically compiled, rather than interpreted,
languages? We show that fat binaries, which combine the
IR together with the statically compiled executable,
can provide a practical solution for software vendors,
allowing their software to be dynamically optimized
without the limitation of binary-level approaches,
which lack the high-level IR of the program, and
without the warm-up costs associated with the IR-only
software distribution approach. We describe and
evaluate the fat-binary-based runtime compilation
approach using SPECint2006, demonstrating that the
overheads it incurs are low enough to be successfully
surmounted by dynamic optimization. Building on Java
JIT technologies, our results already improve upon
common real-world usage scenarios, including very small
workloads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "59",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ramashekar:2013:ADA,
author = "Thejas Ramashekar and Uday Bondhugula",
title = "Automatic data allocation and buffer management for
multi-{GPU} machines",
journal = j-TACO,
volume = "10",
number = "4",
pages = "60:1--60:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2544100",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Multi-GPU machines are being increasingly used in
high-performance computing. Each GPU in such a machine
has its own memory and does not share the address space
either with the host CPU or other GPUs. Hence,
applications utilizing multiple GPUs have to manually
allocate and manage data on each GPU. Existing works
that propose to automate data allocations for GPUs have
limitations and inefficiencies in terms of allocation
sizes, exploiting reuse, transfer costs, and
scalability. We propose a scalable and fully automatic
data allocation and buffer management scheme for affine
loop nests on multi-GPU machines. We call it the
Bounding-Box-based Memory Manager (BBMM). BBMM can
perform at runtime, during standard set operations like
union, intersection, and difference, finding subset and
superset relations on hyperrectangular regions of array
data (bounding boxes). It uses these operations along
with some compiler assistance to identify, allocate,
and manage data required by applications in terms of
disjoint bounding boxes. This allows it to (1) allocate
exactly or nearly as much data as is required by
computations running on each GPU, (2) efficiently track
buffer allocations and hence maximize data reuse across
tiles and minimize data transfer overhead, and (3) and
as a result, maximize utilization of the combined
memory on multi-GPU machines. BBMM can work with any
choice of parallelizing transformations, computation
placement, and scheduling schemes, whether static or
dynamic. Experiments run on a four-GPU machine with
various scientific programs showed that BBMM reduces
data allocations on each GPU by up to 75\% compared to
current allocation schemes, yields performance of at
least 88\% of manually written code, and allows
excellent weak scaling.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "60",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Vandierendonck:2013:ADT,
author = "Hans Vandierendonck and George Tzenakis and Dimitrios
S. Nikolopoulos",
title = "Analysis of dependence tracking algorithms for task
dataflow execution",
journal = j-TACO,
volume = "10",
number = "4",
pages = "61:1--61:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555316",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Processor architectures has taken a turn toward
many-core processors, which integrate multiple
processing cores on a single chip to increase overall
performance, and there are no signs that this trend
will stop in the near future. Many-core processors are
harder to program than multicore and single-core
processors due to the need for writing parallel or
concurrent programs with high degrees of parallelism.
Moreover, many-cores have to operate in a mode of
strong scaling because of memory bandwidth constraints.
In strong scaling, increasingly finer-grain parallelism
must be extracted in order to keep all processing cores
busy. Task dataflow programming models have a high
potential to simplify parallel programming because they
alleviate the programmer from identifying precisely all
intertask dependences when writing programs. Instead,
the task dataflow runtime system detects and enforces
intertask dependences during execution based on the
description of memory accessed by each task. The
runtime constructs a task dataflow graph that captures
all tasks and their dependences. Tasks are scheduled to
execute in parallel, taking into account dependences
specified in the task graph. Several papers report
important overheads for task dataflow systems, which
severely limits the scalability and usability of such
systems. In this article, we study efficient schemes to
manage task graphs and analyze their scalability. We
assume a programming model that supports input, output,
and in/out annotations on task arguments, as well as
commutative in/out and reductions. We analyze the
structure of task graphs and identify versions and
generations as key concepts for efficient management of
task graphs. Then, we present three schemes to manage
task graphs building on graph representations,
hypergraphs, and lists. We also consider a fourth
edgeless scheme that synchronizes tasks using integers.
Analysis using microbenchmarks shows that the graph
representation is not always scalable and that the
edgeless scheme introduces least overhead in nearly all
situations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "61",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jeong:2013:EET,
author = "Yeonghun Jeong and Seongseok Seo and Jongeun Lee",
title = "Evaluator-executor transformation for efficient
pipelining of loops with conditionals",
journal = j-TACO,
volume = "10",
number = "4",
pages = "62:1--62:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555317",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Control divergence poses many problems in
parallelizing loops. While predicated execution is
commonly used to convert control dependence into data
dependence, it often incurs high overhead because it
allocates resources equally for both branches of a
conditional statement regardless of their execution
frequencies. For those loops with unbalanced
conditionals, we propose a software transformation that
divides a loop into two or three smaller loops so that
the condition is evaluated only in the first loop,
while the less frequent branch is executed in the
second loop in a way that is much more efficient than
in the original loop. To reduce the overhead of extra
data transfer caused by the loop fission, we also
present a hardware extension for a class of
Coarse-Grained Reconfigurable Architectures (CGRAs).
Our experiments using MiBench and computer vision
benchmarks on a CGRA demonstrate that our techniques
can improve the performance of loops over predicated
execution by up to 65\% (37.5\%, on average), when the
hardware extension is enabled. Without any hardware
modification, our software-only version can improve
performance by up to 64\% (33\%, on average), while
simultaneously reducing the energy consumption of the
entire CGRA including configuration and data memory by
22\%, on average.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "62",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Barik:2013:DNS,
author = "Rajkishore Barik and Jisheng Zhao and Vivek Sarkar",
title = "A decoupled non-{SSA} global register allocation using
bipartite liveness graphs",
journal = j-TACO,
volume = "10",
number = "4",
pages = "63:1--63:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2544101",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Register allocation is an essential optimization for
all compilers. A number of sophisticated register
allocation algorithms have been developed over the
years. The two fundamental classes of register
allocation algorithms used in modern compilers are
based on Graph Coloring (GC) and Linear Scan (LS).
However, these two algorithms have fundamental
limitations in terms of precision. For example, the key
data structure used in GC-based algorithms, the
interference graph, lacks information on the program
points at which two variables may interfere. The
LS-based algorithms make local decisions regarding
spilling, and thereby trade off global optimization for
reduced compile-time and space overheads. Recently,
researchers have proposed Static Single Assignment
(SSA)-based decoupled register allocation algorithms
that exploit the live-range split points of the SSA
representation to optimally solve the spilling problem.
However, SSA-based register allocation often requires
extra complexity in repairing register assignments
during SSA elimination and in addressing architectural
constraints such as aliasing and ABI encoding; this
extra overhead can be prohibitively expensive in
dynamic compilation contexts. This article proposes a
decoupled non-SSA--based global register allocation
algorithm for dynamic compilation. It addresses the
limitations in current algorithms by introducing a
Bipartite Liveness Graph (BLG)-based register
allocation algorithm that models the spilling phase as
an optimization problem on the BLG itself and the
assignment phase as a separate optimization problem.
Advanced register allocation optimizations such as move
coalescing, live-range splitting, and register class
handling are also performed along with the spilling and
assignment phases. In the presence of register classes,
we propose a bucket-based greedy heuristic for
assignment that strikes a balance between spill-cost
and register class constraints. We present experimental
evaluation of our BLG-based register allocation
algorithm and compare it with production-quality
register allocators in Jikes RVM and LLVM.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "63",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gavin:2013:RIF,
author = "Peter Gavin and David Whalley and Magnus
Sj{\"a}lander",
title = "Reducing instruction fetch energy in multi-issue
processors",
journal = j-TACO,
volume = "10",
number = "4",
pages = "64:1--64:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555318",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The need to minimize power while maximizing
performance has led to recent developments of powerful
superscalar designs targeted at embedded and portable
use. Instruction fetch is responsible for a significant
fraction of microprocessor power and energy, and is
therefore an attractive target for architectural power
optimization. We present novel techniques that take
advantage of guarantees so that the instruction
translation lookaside buffer, branch target buffer, and
branch prediction buffer can frequently be disabled,
reducing their energy usage, while simultaneously
reducing branch predictor contention. These techniques
require no changes to the instruction set and can
easily be integrated into most single- and
multiple-issue processors.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "64",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Anonymous:2013:LDR,
author = "Anonymous",
title = "List of distinguished reviewers {ACM TACO}",
journal = j-TACO,
volume = "10",
number = "4",
pages = "65:1--65:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2560216",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:44 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "65",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Goel:2014:SPR,
author = "Neeraj Goel and Anshul Kumar and Preeti Ranjan Panda",
title = "Shared-port register file architecture for low-energy
{VLIW} processors",
journal = j-TACO,
volume = "11",
number = "1",
pages = "1:1--1:32",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2533397",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We propose a reduced-port Register File (RF)
architecture for reducing RF energy in a VLIW
processor. With port reduction, RF ports need to be
shared among Function Units (FUs), which may lead to
access conflicts, and thus, reduced performance. Our
solution includes (i) a carefully designed RF-FU
interconnection network that permits port sharing with
minimum conflicts and without any delay/energy
overheads, and (ii) a novel scheduling and binding
algorithm that reduces the performance penalty. With
our solution, we observed as much as 83\% RF energy
savings with no more than a 10\% loss in performance
for a set of Mediabench and Mibench benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2014:IPD,
author = "Zheng Wang and Georgios Tournavitis and Bj{\"o}rn
Franke and Michael F. P. O'Boyle",
title = "Integrating profile-driven parallelism detection and
machine-learning-based mapping",
journal = j-TACO,
volume = "11",
number = "1",
pages = "2:1--2:26",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579561",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Compiler-based auto-parallelization is a much-studied
area but has yet to find widespread application. This
is largely due to the poor identification and
exploitation of application parallelism, resulting in
disappointing performance far below that which a
skilled expert programmer could achieve. We have
identified two weaknesses in traditional parallelizing
compilers and propose a novel, integrated approach
resulting in significant performance improvements of
the generated parallel code. Using profile-driven
parallelism detection, we overcome the limitations of
static analysis, enabling the identification of more
application parallelism, and only rely on the user for
final approval. We then replace the traditional
target-specific and inflexible mapping heuristics with
a machine-learning-based prediction mechanism,
resulting in better mapping decisions while automating
adaptation to different target architectures. We have
evaluated our parallelization strategy on the NAS and
SPEC CPU2000 benchmarks and two different multicore
platforms (dual quad-core Intel Xeon SMP and
dual-socket QS20 Cell blade). We demonstrate that our
approach not only yields significant improvements when
compared with state-of-the-art parallelizing compilers
but also comes close to and sometimes exceeds the
performance of manually parallelized codes. On average,
our methodology achieves 96\% of the performance of the
hand-tuned OpenMP NAS and SPEC parallel benchmarks on
the Intel Xeon platform and gains a significant speedup
for the IBM Cell platform, demonstrating the potential
of profile-guided and machine-learning- based
parallelization for complex multicore platforms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Samadi:2014:LGU,
author = "Mehrzad Samadi and Amir Hormati and Janghaeng Lee and
Scott Mahlke",
title = "Leveraging {GPUs} using cooperative loop speculation",
journal = j-TACO,
volume = "11",
number = "1",
pages = "3:1--3:26",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579617",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Graphics processing units, or GPUs, provide TFLOPs of
additional performance potential in commodity computer
systems that frequently go unused by most applications.
Even with the emergence of languages such as CUDA and
OpenCL, programming GPUs remains a difficult challenge
for a variety of reasons, including the inherent
algorithmic characteristics and data structure choices
used by applications as well as the tedious performance
optimization cycle that is necessary to achieve high
performance. The goal of this work is to increase the
applicability of GPUs beyond CUDA/OpenCL to implicitly
data-parallel applications written in C/C++ using
speculative parallelization. To achieve this goal, we
propose Paragon: a static/dynamic compiler platform to
speculatively run possibly data-parallel portions of
sequential applications on the GPU while cooperating
with the system CPU. For such loops, Paragon utilizes
the GPU in an opportunistic way while orchestrating a
cooperative relation between the CPU and GPU to reduce
the overhead of miss-speculations. Paragon monitors the
dependencies for the loops running speculatively on the
GPU and nonspeculatively on the CPU using a lightweight
distributed conflict detection designed specifically
for GPUs, and transfers the execution to the CPU in
case a conflict is detected. Paragon resumes the
execution on the GPU after the CPU resolves the
dependency. Our experiments show that Paragon achieves
4x on average and up to 30x speedup compared to unsafe
CPU execution with four threads and 7x on average and
up to 64x speedup versus sequential execution across a
set of sequential but implicitly data-parallel
applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2014:EAC,
author = "Jue Wang and Xiangyu Dong and Yuan Xie and Norman P.
Jouppi",
title = "Endurance-aware cache line management for non-volatile
caches",
journal = j-TACO,
volume = "11",
number = "1",
pages = "4:1--4:24",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579671",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Nonvolatile memories (NVMs) have the potential to
replace low-level SRAM or eDRAM on-chip caches because
NVMs save standby power and provide large cache
capacity. However, limited write endurance is a common
problem for NVM technologies, and today's cache
management might result in unbalanced cache write
traffic, causing heavily written cache blocks to fail
much earlier than others. Although wear-leveling
techniques for NVM-based main memories exist, we cannot
simply apply them to NVM-based caches. This is because
cache writes have intraset variations as well as
interset variations, while writes to main memories only
have interset variations. To solve this problem, we
propose i$^2$ WAP, a new cache management policy that
can reduce both inter- and intraset write variations.
i$^2$ WAP has two features: Swap-Shift, an enhancement
based on existing main memory wear leveling to reduce
cache interset write variations, and Probabilistic Set
Line Flush, a novel technique to reduce cache intraset
write variations. Implementing i$^2$ WAP only needs two
global counters and two global registers. In one of our
studies, i$^2$ WAP can improve the NVM cache lifetime
by 75\% on average and up to 224\%. We also validate
that i$^2$ WAP is effective in systems with different
cache configurations and workloads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Liu:2014:BBS,
author = "Lei Liu and Zehan Cui and Yong Li and Yungang Bao and
Mingyu Chen and Chengyong Wu",
title = "{{BPM\slash BPM+}}: Software-based dynamic memory
partitioning mechanisms for mitigating {DRAM}
bank-\slash channel-level interferences in multicore
systems",
journal = j-TACO,
volume = "11",
number = "1",
pages = "5:1--5:28",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579672",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The main memory system is a shared resource in modern
multicore machines that can result in serious
interference leading to reduced throughput and
unfairness. Many new memory scheduling mechanisms have
been proposed to address the interference problem.
However, these mechanisms usually employ relative
complex scheduling logic and need modifications to
Memory Controllers (MCs), which incur expensive
hardware design and manufacturing overheads. This
article presents a practical software approach to
effectively eliminate the interference without any
hardware modifications. The key idea is to modify the
OS memory management system and adopt a
page-coloring-based Bank-level Partitioning Mechanism
(BPM) that allocates dedicated DRAM banks to each core
(or thread). By using BPM, memory requests from
distinct programs are segregated across multiple memory
banks to promote locality/fairness and reduce
interference. We further extend BPM to BPM+ by
incorporating channel-level partitioning, on which we
demonstrate additional gain over BPM in many cases. To
achieve benefits in the presence of diverse application
memory needs and avoid performance degradation due to
resource underutilization, we propose a dynamic
mechanism upon BPM/BPM+ that assigns appropriate
bank/channel resources based on application
memory/bandwidth demands monitored through PMU
(performance-monitoring unit) and a low-overhead OS
page table scanning process. We implement BPM/BPM+ in
Linux 2.6.32.15 kernel and evaluate the technique on
four-core and eight-core real machines by running a
large amount of randomly generated multiprogrammed and
multithreaded workloads. Experimental results show that
BPM/BPM+ can improve the overall system throughput by
4.7\%/5.9\%, on average, (up to 8.6\%/9.5\%) and reduce
the unfairness by an average of 4.2\%/6.1\% (up to
15.8\%/13.9\%).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Haubl:2014:TTE,
author = "Christian H{\"a}ubl and Christian Wimmer and Hanspeter
M{\"o}ssenb{\"o}ck",
title = "Trace transitioning and exception handling in a
trace-based {JIT} compiler for {Java}",
journal = j-TACO,
volume = "11",
number = "1",
pages = "6:1--6:26",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579673",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Trace-based Just-In-Time (JIT) compilation generates
machine code for frequently executed paths (so-called
traces) instead of whole methods. While this has
several advantages, it complicates invocation of
compiled traces as well as exception handling, so that
previous trace-based compilers limited the way in which
traces could be invoked. We present a significantly
enhanced trace-based compiler where arbitrary
transitions between interpreted and compiled traces are
possible. For that, we introduce suitable trace calling
conventions and extend exception handling to work both
within traces and across trace boundaries. Furthermore,
we use the recorded trace information for optimizations
and combine the tracing ideas with ideas from
partial-method compilation to avoid code bloat. An
extensive evaluation with the benchmark suites DaCapo
9.12 Bach and SPECjvm2008 shows that our trace-based
compiler achieves up to 59\% higher peak performance
than the method-based Java HotSpot client compiler. On
a few benchmarks, our fairly simple trace-based
compiler shows a higher peak performance than the Java
HotSpot server compiler, which is one of today's best
optimizing JIT compilers for Java.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Huang:2014:HHH,
author = "Yongbing Huang and Licheng Chen and Zehan Cui and Yuan
Ruan and Yungang Bao and Mingyu Chen and Ninghui Sun",
title = "{HMTT}: a hybrid hardware\slash software tracing
system for bridging the {DRAM} access trace's semantic
gap",
journal = j-TACO,
volume = "11",
number = "1",
pages = "7:1--7:25",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579668",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "DRAM access traces (i.e., off-chip memory references)
can be extremely valuable for the design of memory
subsystems and performance tuning of software. Hardware
snooping on the off-chip memory interface is an
effective and nonintrusive approach to monitoring and
collecting real-life DRAM accesses. However, compared
with software-based approaches, hardware snooping
approaches typically lack semantic information, such as
process/function/object identifiers, virtual addresses,
and lock contexts, that is essential to the complete
understanding of the systems and software under
investigation. In this article, we propose a hybrid
hardware/software mechanism that is able to collect
off-chip memory reference traces with semantic
information. We have designed and implemented a
prototype system called HMTT (Hybrid Memory Trace
Tool), which uses a custom-made DIMM connector to
collect off-chip memory references and a high-level
event-encoding scheme to correlate semantic information
with memory references. In addition to providing
complete, undistorted DRAM access traces, the proposed
system is also able to perform various types of
low-overhead profiling, such as object-relative
accesses and multithread lock accesses.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2014:AWA,
author = "Quan Chen and Minyi Guo",
title = "Adaptive workload-aware task scheduling for
single-{ISA} asymmetric multicore architectures",
journal = j-TACO,
volume = "11",
number = "1",
pages = "8:1--8:25",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579674",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Single-ISA Asymmetric Multicore (AMC) architectures
have shown high performance as well as power
efficiency. However, current parallel programming
environments do not perform well on AMC because they
are designed for symmetric multicore architectures in
which all cores provide equal performance. Their random
task scheduling policies can result in unbalanced
workloads in AMC and severely degrade the performance
of parallel applications. To balance the workloads of
parallel applications in AMC, this article proposes an
adaptive Workload-Aware Task Scheduler (WATS) that
consists of a history-based task allocator and a
preference-based task scheduler. The history-based task
allocator is based on a near-optimal, static task
allocation using the historical statistics collected
during the execution of a parallel application. The
preference-based task scheduler, which schedules tasks
based on a preference list, can dynamically adjust the
workloads in AMC if the task allocation is less optimal
due to approximation in the history-based task
allocator. Experimental results show that WATS can
improve both the performance and energy efficiency of
task-based applications, with the performance gain up
to 66.1\% compared with traditional task schedulers.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Savrun-Yeniceri:2014:EHI,
author = "G{\"u}lfem Savrun-Yeni{\c{c}}eri and Wei Zhang and
Huahan Zhang and Eric Seckler and Chen Li and Stefan
Brunthaler and Per Larsen and Michael Franz",
title = "Efficient hosted interpreters on the {JVM}",
journal = j-TACO,
volume = "11",
number = "1",
pages = "9:1--9:24",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2532642",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:08:33 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/2532642",
abstract = "Many guest languages are implemented using the Java
Virtual Machine (JVM) as a host environment. There are
two major implementation choices: custom compilers and
so-called hosted interpreters. Custom compilers are
complex to build but offer good performance. Hosted
interpreters are comparatively simpler to implement but
until now have suffered from poor performance.\par
We studied the performance of hosted interpreters and
identified common bottlenecks preventing their
efficient execution. First, similar to interpreters
written in C/C++, instruction dispatch is expensive on
the JVM. Second, Java's semantics require expensive
runtime exception checks that negatively affect array
performance essential to interpreters.\par
We present two optimizations targeting these
bottlenecks and show that the performance of optimized
interpreters increases dramatically: we report speedups
by a factor of up to 2.45 over the Jython interpreter,
3.57 over the Rhino interpreter, and 2.52 over the
JRuby interpreter, respectively. The resulting
performance is comparable with that of custom
compilers. Our optimizations are enabled by a few
simple annotations that require only modest
implementation effort; in return, performance increases
substantially.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Nair:2014:RPD,
author = "Prashant J. Nair and Chia-Chen Chou and Moinuddin K.
Qureshi",
title = "Refresh pausing in {DRAM} memory systems",
journal = j-TACO,
volume = "11",
number = "1",
pages = "10:1--10:26",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579669",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:08:33 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/2579669",
abstract = "Dynamic Random Access Memory (DRAM) cells rely on
periodic refresh operations to maintain data integrity.
As the capacity of DRAM memories has increased, so has
the amount of time consumed in doing refresh. Refresh
operations contend with read \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jothi:2014:TCF,
author = "Komal Jothi and Haitham Akkary",
title = "Tuning the continual flow pipeline architecture with
virtual register renaming",
journal = j-TACO,
volume = "11",
number = "1",
pages = "11:1--11:27",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579675",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Continual Flow Pipelines (CFPs) allow a processor core
to process hundreds of in-flight instructions without
increasing cycle-critical pipeline resources. When a
load misses the data cache, CFP checkpoints the
processor register state and then moves all
miss-dependent instructions into a low-complexity WB to
unblock the pipeline. Meanwhile, miss-independent
instructions execute normally and update the processor
state. When the miss data return, CFP replays the
miss-dependent instructions from the WB and then merges
the miss-dependent and miss-independent execution
results. CFP was initially proposed for cache misses to
DRAM. Later work focused on reducing the execution
overhead of CFP by avoiding the pipeline flush before
replaying miss-dependent instructions and executing
dependent and independent instructions concurrently.
The goal of these improvements was to gain performance
by applying CFP to L1 data cache misses that hit the
last level on chip cache. However, many applications or
execution phases of applications incur excessive amount
of replay and/or rollbacks to the checkpoint. This
frequently cancels benefits from CFP and reduces
performance. In this article, we improve the CFP
architecture by using a novel virtual register renaming
substrate and by tuning the replay policies to mitigate
excessive replays and rollbacks to the checkpoint. We
describe these new design optimizations and show, using
Spec 2006 benchmarks and microarchitecture performance
and power models of our design, that our Tuned-CFP
architecture improves performance and energy
consumption over previous CFP architectures by ~10\%
and ~8\%, respectively. We also demonstrate that our
proposed architecture gives better performance return
on energy per instruction compared to a conventional
superscalar as well as previous CFP architectures.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Carle:2014:PAM,
author = "Thomas Carle and Dumitru Potop-Butucaru",
title = "Predicate-aware, makespan-preserving software
pipelining of scheduling tables",
journal = j-TACO,
volume = "11",
number = "1",
pages = "12:1--12:26",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579676",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:08:33 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/2579676",
abstract = "We propose a software pipelining technique adapted to
specific hard real-time scheduling problems. Our
technique optimizes both computation throughput and
execution cycle makespan, with makespan being
prioritary. It also takes advantage of the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kritikakou:2014:SNO,
author = "Angeliki Kritikakou and Francky Catthoor and Vasilios
Kelefouras and Costas Goutis",
title = "A scalable and near-optimal representation of access
schemes for memory management",
journal = j-TACO,
volume = "11",
number = "1",
pages = "13:1--13:25",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579677",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Memory management searches for the resources required
to store the concurrently alive elements. The solution
quality is affected by the representation of the
element accesses: a sub-optimal representation leads to
overestimation and a non-scalable representation
increases the exploration time. We propose a
methodology to near-optimal and scalable represent
regular and irregular accesses. The representation
consists of a set of pattern entries to compactly
describe the behavior of the memory accesses and of
pattern operations to consistently combine the pattern
entries. The result is a final sequence of pattern
entries which represents the global access scheme
without unnecessary overestimation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Leather:2014:AFG,
author = "Hugh Leather and Edwin Bonilla and Michael O'Boyle",
title = "Automatic feature generation for machine
learning--based optimising compilation",
journal = j-TACO,
volume = "11",
number = "1",
pages = "14:1--14:32",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2536688",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Recent work has shown that machine learning can
automate and in some cases outperform handcrafted
compiler optimisations. Central to such an approach is
that machine learning techniques typically rely upon
summaries or features of the program. The quality of
these features is critical to the accuracy of the
resulting machine learned algorithm; no machine
learning method will work well with poorly chosen
features. However, due to the size and complexity of
programs, theoretically there are an infinite number of
potential features to choose from. The compiler writer
now has to expend effort in choosing the best features
from this space. This article develops a novel
mechanism to automatically find those features that
most improve the quality of the machine learned
heuristic. The feature space is described by a grammar
and is then searched with genetic programming and
predictive modelling. We apply this technique to loop
unrolling in GCC 4.3.1 and evaluate our approach on a
Pentium 6. On a benchmark suite of 57 programs, GCCs
hard-coded heuristic achieves only 3\% of the maximum
performance available, whereas a state-of-the-art
machine learning approach with hand-coded features
obtains 59\%. Our feature generation technique is able
to achieve 76\% of the maximum available speedup,
outperforming existing approaches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kluter:2014:VWL,
author = "Theo Kluter and Samuel Burri and Philip Brisk and
Edoardo Charbon and Paolo Ienne",
title = "Virtual Ways: Low-Cost Coherence for Instruction Set
Extensions with Architecturally Visible Storage",
journal = j-TACO,
volume = "11",
number = "2",
pages = "15:1--15:26",
month = jul,
year = "2014",
DOI = "https://doi.org/10.1145/2576877",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:13:09 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Instruction set extensions (ISEs) improve the
performance and energy consumption of
application-specific processors. ISEs can use
architecturally visible storage (AVS), localized
compiler-controlled memories, to provide higher I/O
bandwidth than reading data from the processor
pipeline. AVS creates coherence and consistence
problems with the data cache. Although a hardware
coherence protocol could solve the problem, this
approach is costly for a single-processor system. As a
low-cost alternative, we introduce Virtual Ways, which
ensures coherence through a reduced form of inclusion
between the data cache and AVS. Virtual Ways achieve
higher performance and lower energy consumption than
using a hardware coherence protocol.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ren:2014:POE,
author = "Bin Ren and Todd Mytkowicz and Gagan Agrawal",
title = "A Portable Optimization Engine for Accelerating
Irregular Data-Traversal Applications on {SIMD}
Architectures",
journal = j-TACO,
volume = "11",
number = "2",
pages = "16:1--16:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2632215",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 30 19:02:49 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Fine-grained data parallelism is increasingly common
in the form of longer vectors integrated with
mainstream processors (SSE, AVX) and various GPU
architectures. This article develops support for
exploiting such data parallelism for a class of
nonnumeric, nongraphic applications, which perform
computations while traversing many independent,
irregular data structures. We address this problem by
developing several novel techniques. First, for code
generation, we develop an intermediate language for
specifying such traversals, followed by a runtime
scheduler that maps traversals to various SIMD units.
Second, we observe that good data locality is crucial
to sustained performance from SIMD architectures,
whereas many applications that operate on irregular
data structures (e.g., trees and graphs) have poor data
locality. To address this challenge, we develop a set
of data layout optimizations that improve spatial
locality for applications that traverse many irregular
data structures. Unlike prior data layout
optimizations, our approach incorporates a notion of
both interthread and intrathread spatial reuse into
data layout. Finally, we enable performance portability
(i.e., the ability to automatically optimize
applications for different architectures) by accurately
modeling the impact of inter- and intrathread locality
on program performance. As a consequence, our model can
predict which data layout optimization to use on a wide
variety of SIMD architectures. To demonstrate the
efficacy of our approach and optimizations, we first
show how they enable up to a 12X speedup on one SIMD
architecture for a set of real-world applications. To
demonstrate that our approach enables performance
portability, we show how our model predicts the optimal
layout for applications across a diverse set of three
real-world SIMD architectures, which offers as much as
45\% speedup over a suboptimal solution.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Qi:2014:VVG,
author = "Zhengwei Qi and Jianguo Yao and Chao Zhang and Miao Yu
and Zhizhou Yang and Haibing Guan",
title = "{VGRIS}: Virtualized {GPU} Resource Isolation and
Scheduling in Cloud Gaming",
journal = j-TACO,
volume = "11",
number = "2",
pages = "17:1--17:25",
month = jul,
year = "2014",
DOI = "https://doi.org/10.1145/2632216",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:16:31 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "To achieve efficient resource management on a graphics
processing unit (GPU), there is a demand to develop a
framework for scheduling virtualized resources in cloud
gaming. In this article, we propose VGRIS, a resource
management framework for virtualized GPU resource
isolation and scheduling in cloud gaming. A set of
application programming interfaces (APIs) is provided
so that a variety of scheduling algorithms can be
implemented within the framework without modifying the
framework itself. Three scheduling algorithms are
implemented by the APIs within VGRIS. Experimental
results show that VGRIS can effectively schedule GPU
resources among various workloads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Shen:2014:RSB,
author = "Bor-Yeh Shen and Wei-Chung Hsu and Wuu Yang",
title = "A Retargetable Static Binary Translator for the {ARM}
Architecture",
journal = j-TACO,
volume = "11",
number = "2",
pages = "18:1--18:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629335",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 30 19:02:49 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Machines designed with new but incompatible
Instruction Set Architecture (ISA) may lack proper
applications. Binary translation can address this
incompatibility by migrating applications from one
legacy ISA to a new one, although binary translation
has problems such as code discovery for variable-length
ISA and code location issues for handling indirect
branches. Dynamic Binary Translation (DBT) has been
widely adopted for migrating applications since it
avoids those problems. Static Binary Translation (SBT)
is a less general solution and has not been actively
researched. However, SBT performs more aggressive
optimizations, which could yield more compact code and
better code quality. Applications translated by SBT can
consume less memory, processor cycles, and power than
DBT and can be started more quickly. These advantages
are even more critical for embedded systems than for
general systems. In this article, we designed and
implemented a new SBT tool, called LLBT, which
translates ARM instructions into LLVM IRs and then
retargets the LLVM IRs to various ISAs, including x86,
x86-64, ARM, and MIPS. LLBT leverages two important
functionalities from LLVM: comprehensive optimizations
and retargetability. More importantly, LLBT solves the
code discovery problem for ARM/Thumb binaries without
resorting to interpretation. LLBT also effectively
reduced the size of the address mapping table, making
SBT a viable solution for embedded systems. Our
experiments based on the EEMBC benchmark suite show
that the LLBT-generated code can run more than $ 6
\times $ and $ 2.3 \times $ faster on average than
emulation with QEMU and HQEMU, respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gracia:2014:RLN,
author = "Dar{\'\i}o Su{\'a}rez Gracia and Alexandra
Ferrer{\'o}n and Luis Montesano {Del Campo} and Teresa
Monreal Arnal and V{\'\i}ctor Vi{\~n}als Y{\'u}fera",
title = "Revisiting {LP--NUCA} Energy Consumption: Cache Access
Policies and Adaptive Block Dropping",
journal = j-TACO,
volume = "11",
number = "2",
pages = "19:1--19:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2632217",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 30 19:02:49 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Cache working-set adaptation is key as embedded
systems move to multiprocessor and Simultaneous
Multithreaded Architectures (SMT) because interthread
pollution harms system performance and battery life.
Light-Power NUCA (LP-NUCA) is a working-set adaptive
cache that depends on temporal-locality to save energy.
This work identifies the sources of energy waste in
LP-NUCAs: parallel access to the tag and data arrays of
the tiles and low locality phases with useless block
migration. To counteract both issues, we prove that
switching to serial access reduces energy without
harming performance and propose a machine learning
Adaptive Drop Rate (ADR) controller that minimizes the
amount of replacement and migration when locality is
low. This work demonstrates that these techniques
efficiently adapt the cache drop and access policies to
save energy. They reduce LP-NUCA consumption 22.7\% for
1SMT. With interthread cache contention in 2SMT, the
savings rise to 29\%. Versus a conventional
organization, energy--delay improves 20.8\% and 25\%
for 1- and 2SMT benchmarks, and, in 65\% of the 2SMT
mixes, gains are larger than 20\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Liang:2014:DCC,
author = "Zhibin Liang and Wei Zhang and Yung-Cheng Ma",
title = "Deadline-Constrained Clustered Scheduling for {VLIW}
Architectures using Power-Gated Register Files",
journal = j-TACO,
volume = "11",
number = "2",
pages = "20:1--20:26",
month = jul,
year = "2014",
DOI = "https://doi.org/10.1145/2632218",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:18:32 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Designing energy-efficient Digital Signal Processor
(DSP) cores has become a key concern in embedded
systems development. This paper proposes an
energy-proportional computing scheme for Very Long
Instruction Word (VLIW) architectures. To make the
processor power scales with adapted parallelism, we
propose incorporating distributed Power-Gated Register
Files (PGRF) into VLIW to achieve a PGRF-VLIW
architecture. For energy efficiency, we also propose an
instruction scheduling algorithm called the
Deadline-Constrained Clustered Scheduling (DCCS)
algorithm. The algorithm clusters the data dependence
graph to reduce data transfer energy and makes optimal
use of low-powered local registers for tree-structured
data dependence graphs. The results of evaluations
conducted using the MiBench and DSPstone benchmark
suites substantiate the expected power saving and
scaling effects.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fang:2014:PPA,
author = "Shuangde Fang and Zidong Du and Yuntan Fang and
Yuanjie Huang and Yang Chen and Lieven Eeckhout and
Olivier Temam and Huawei Li and Yunji Chen and
Chengyong Wu",
title = "Performance Portability Across Heterogeneous {SoCs}
Using a Generalized Library-Based Approach",
journal = j-TACO,
volume = "11",
number = "2",
pages = "21:1--21:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2608253",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 30 19:02:49 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Because of tight power and energy constraints,
industry is progressively shifting toward heterogeneous
system-on-chip (SoC) architectures composed of a mix of
general-purpose cores along with a number of
accelerators. However, such SoC architectures can be
very challenging to efficiently program for the vast
majority of programmers, due to numerous programming
approaches and languages. Libraries, on the other hand,
provide a simple way to let programmers take advantage
of complex architectures, which does not require
programmers to acquire new accelerator-specific or
domain-specific languages. Increasingly, library-based,
also called algorithm-centric, programming approaches
propose to generalize the usage of libraries and to
compose programs around these libraries, instead of
using libraries as mere complements. In this article,
we present a software framework for achieving
performance portability by leveraging a generalized
library-based approach. Inspired by the notion of a
component, as employed in software engineering and
HW/SW codesign, we advocate nonexpert programmers to
write simple wrapper code around existing libraries to
provide simple but necessary semantic information to
the runtime. To achieve performance portability, the
runtime employs machine learning (simulated annealing)
to select the most appropriate accelerator and its
parameters for a given algorithm. This selection
factors in the possibly complex composition of
algorithms used in the application, the communication
among the various accelerators, and the tradeoff
between different objectives (i.e., accuracy,
performance, and energy). Using a set of benchmarks run
on a real heterogeneous SoC composed of a multicore
processor and a GPU, we show that the runtime overhead
is fairly small at 5.1\% for the GPU and 6.4\% for the
multi-core. We then apply our accelerator selection
approach to a simulated SoC platform containing
multiple inexact accelerators. We show that accelerator
selection together with hardware parameter tuning
achieves an average 46.2\% energy reduction and a
speedup of 2.1$ \times $ while meeting the desired
application error target.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kaitoua:2014:HED,
author = "Abdulrahman Kaitoua and Hazem Hajj and Mazen A. R.
Saghir and Hassan Artail and Haitham Akkary and
Mariette Awad and Mageda Sharafeddine and Khaleel
Mershad",
title = "{Hadoop} Extensions for Distributed Computing on
Reconfigurable Active {SSD} Clusters",
journal = j-TACO,
volume = "11",
number = "2",
pages = "22:1--22:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2608199",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:18 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this article, we propose new extensions to Hadoop
to enable clusters of reconfigurable active solid-state
drives (RASSDs) to process streaming data from SSDs
using FPGAs. We also develop an analytical model to
estimate the performance of RASSD clusters running
under Hadoop. Using the Hadoop RASSD platform and
network simulators, we validate our design and
demonstrate its impact on performance for different
workloads taken from Stanford's Phoenix MapReduce
project. Our results show that for a hardware
acceleration factor of 20$ \times $, compute-intensive
workloads processing 153MB of data can run up to 11$
\times $ faster than a standard Hadoop cluster.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "22",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2014:PSR,
author = "Jue Wang and Xiangyu Dong and Yuan Xie",
title = "Preventing {STT-RAM} Last-Level Caches from Port
Obstruction",
journal = j-TACO,
volume = "11",
number = "3",
pages = "23:1--23:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2633046",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Many new nonvolatile memory (NVM) technologies have
been heavily studied to replace the power-hungry
SRAM/DRAM-based memory hierarchy in today's computers.
Among various emerging NVM technologies, Spin-Transfer
Torque RAM (STT-RAM) has many benefits, such as fast
read latency, low leakage power, and high density,
making it a promising candidate for last-level caches
(LLCs).$^1$ However, STT-RAM write operation is
expensive. In particular, a long STT-RAM cache write
operation might obstruct other cache accesses and
result in severe performance degradation. Consequently,
how to mitigate STT-RAM write overhead is critical to
the success of STT-RAM adoption. In this article, we
propose an obstruction-aware cache management policy
called OAP. OAP monitors cache traffic, detects
LLC-obstructive processes, and differentiates the cache
accesses from different processes. Our experiment on a
four-core architecture with an 8MB STT-RAM L3 cache
shows a 14\% performance improvement and 64\% energy
reduction.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gonzalez-Mesa:2014:ETM,
author = "M. A. Gonzalez-Mesa and Eladio Gutierrez and Emilio L.
Zapata and Oscar Plata",
title = "Effective Transactional Memory Execution Management
for Improved Concurrency",
journal = j-TACO,
volume = "11",
number = "3",
pages = "24:1--24:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2633048",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article describes a transactional memory
execution model intended to exploit maximum parallelism
from sequential and multithreaded programs. A program
code section is partitioned into chunks that will be
mapped onto threads and executed transactionally. These
transactions run concurrently and out of order, trying
to exploit maximum parallelism but managed by a
specific fully distributed commit control to meet data
dependencies. To accomplish correct parallel execution,
a partial precedence order relation is derived from the
program code section and/or defined by the programmer.
When a conflict between chunks is eagerly detected, the
precedence order relation is used to determine the best
policy to solve the conflict that preserves the
precedence order while maximizing concurrency. The
model defines a new transactional state called executed
but not committed. This state allows exploiting
concurrency on two levels: intrathread and interthread.
Intrathread concurrency is improved by having pending
uncommitted transactions while executing a new one in
the same thread. The new state improves interthread
concurrency because it permits out-of-order transaction
commits regarding the precedence order. Our model has
been implemented in a lightweight software
transactional memory system, TinySTM, and has been
evaluated on a set of benchmarks obtaining an important
performance improvement over the baseline TM system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kumar:2014:EPG,
author = "Rakesh Kumar and Alejandro Mart{\'\i}nez and Antonio
Gonz{\'a}lez",
title = "Efficient Power Gating of {SIMD} Accelerators Through
Dynamic Selective Devectorization in an {HW\slash SW}
Codesigned Environment",
journal = j-TACO,
volume = "11",
number = "3",
pages = "25:1--25:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629681",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Leakage energy is a growing concern in current and
future microprocessors. Functional units of
microprocessors are responsible for a major fraction of
this energy. Therefore, reducing functional unit
leakage has received much attention in recent years.
Power gating is one of the most widely used techniques
to minimize leakage energy. Power gating turns off the
functional units during the idle periods to reduce the
leakage. Therefore, the amount of leakage energy
savings is directly proportional to the idle time
duration. This article focuses on increasing the idle
interval for the higher SIMD lanes. The applications
are profiled dynamically, in a hardware/software
codesigned environment, to find the higher SIMD lanes'
usage pattern. If the higher lanes need to be turned on
for small time periods, the corresponding portion of
the code is devectorized to keep the higher lanes off.
The devectorized code is executed on the lowest SIMD
lane. Our experimental results show that the average
energy savings of the proposed mechanism are 15\%,
12\%, and 71\% greater than power gating for
SPECFP2006, Physicsbench, and Eigen benchmark suites,
respectively. Moreover, the slowdown caused by
devectorization is negligible.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "25",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Carlo:2014:FAA,
author = "Stefano {Di Carlo} and Salvatore Galfano and Marco
Indaco and Paolo Prinetto and Davide Bertozzi and Piero
Olivo and Cristian Zambelli",
title = "{FLARES}: an Aging Aware Algorithm to Autonomously
Adapt the Error Correction Capability in {NAND} Flash
Memories",
journal = j-TACO,
volume = "11",
number = "3",
pages = "26:1--26:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2631919",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With the advent of solid-state storage systems, NAND
flash memories are becoming a key storage technology.
However, they suffer from serious reliability and
endurance issues during the operating lifetime that can
be handled by the use of appropriate error correction
codes (ECCs) in order to reconstruct the information
when needed. Adaptable ECCs may provide the flexibility
to avoid worst-case reliability design, thus leading to
improved performance. However, a way to control such
adaptable ECCs' strength is required. This article
proposes FLARES, an algorithm able to adapt the ECC
correction capability of each page of a flash based on
a flash RBER prediction model and on a measurement of
the number of errors detected in a given time window.
FLARES has been fully implemented within the YAFFS 2
filesystem under the Linux operating system. This
allowed us to perform an extensive set of simulations
on a set of standard benchmarks that highlighted the
benefit of FLARES on the overall storage subsystem
performances.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "26",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bartolini:2014:AFG,
author = "Davide B. Bartolini and Filippo Sironi and Donatella
Sciuto and Marco D. Santambrogio",
title = "Automated Fine-Grained {CPU} Provisioning for Virtual
Machines",
journal = j-TACO,
volume = "11",
number = "3",
pages = "27:1--27:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2637480",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "Ideally, the pay-as-you-go model of Infrastructure as
a Service (IaaS) clouds should enable users to rent
just enough resources (e.g., CPU or memory bandwidth)
to fulfill their service level objectives (SLOs).
Achieving this goal is hard on current IaaS offers,
which require users to explicitly specify the amount of
resources to reserve; this requirement is nontrivial
for users, because estimating the amount of resources
needed to attain application-level SLOs is often
complex, especially when resources are virtualized and
the service provider colocates virtual machines (VMs)
on host nodes. For this reason, users who deploy VMs
subject to SLOs are usually prone to overprovisioning
resources, thus resulting in inflated business costs.
This article tackles this issue with AutoPro: a runtime
system that enhances IaaS clouds with automated and
fine-grained resource provisioning based on performance
SLOs. Our main contribution with AutoPro is filling the
gap between application-level performance SLOs and
allocation of a contended resource, without requiring
explicit reservations from users. In this article, we
focus on CPU bandwidth allocation to throughput-driven,
compute-intensive multithreaded applications colocated
on a multicore processor; we show that a theoretically
sound, yet simple, control strategy can enable
automated fine-grained allocation of this contended
resource, without the need for offline profiling.
Additionally, AutoPro helps service providers optimize
infrastructure utilization by provisioning idle
resources to best-effort workloads, so as to maximize
node-level utilization. Our extensive experimental
evaluation confirms that AutoPro is able to
automatically determine and enforce allocations to meet
performance SLOs while maximizing node-level
utilization by supporting batch workloads on a
best-effort basis.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Carlson:2014:EHL,
author = "Trevor E. Carlson and Wim Heirman and Stijn Eyerman
and Ibrahim Hur and Lieven Eeckhout",
title = "An Evaluation of High-Level Mechanistic Core Models",
journal = j-TACO,
volume = "11",
number = "3",
pages = "28:1--28:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629677",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Large core counts and complex cache hierarchies are
increasing the burden placed on commonly used
simulation and modeling techniques. Although analytical
models provide fast results, they do not apply to
complex, many-core shared-memory systems. In contrast,
detailed cycle-level simulation can be accurate but
also tends to be slow, which limits the number of
configurations that can be evaluated. A middle ground
is needed that provides for fast simulation of complex
many-core processors while still providing accurate
results. In this article, we explore, analyze, and
compare the accuracy and simulation speed of
high-abstraction core models as a potential solution to
slow cycle-level simulation. We describe a number of
enhancements to interval simulation to improve its
accuracy while maintaining simulation speed. In
addition, we introduce the instruction-window centric
(IW-centric) core model, a new mechanistic core model
that bridges the gap between interval simulation and
cycle-accurate simulation by enabling high-speed
simulations with higher levels of detail. We also show
that using accurate core models like these are
important for memory subsystem studies, and that
simple, naive models, like a one-IPC core model, can
lead to misleading and incorrect results and
conclusions in practical design studies. Validation
against real hardware shows good accuracy, with an
average single-core error of 11.1\% and a maximum of
18.8\% for the IW-centric model with a 1.5$ \times $
slowdown compared to interval simulation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "28",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hijaz:2014:NLN,
author = "Farrukh Hijaz and Omer Khan",
title = "{NUCA-L1}: a Non-Uniform Access Latency Level-1 Cache
Architecture for Multicores Operating at Near-Threshold
Voltages",
journal = j-TACO,
volume = "11",
number = "3",
pages = "29:1--29:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2631918",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Research has shown that operating in the
near-threshold region is expected to provide up to 10$
\times $ energy efficiency for future processors.
However, reliable operation below a minimum voltage
(Vccmin) cannot be guaranteed due to process
variations. Because SRAM margins can easily be violated
at near-threshold voltages, their bit-cell failure
rates are expected to rise steeply. Multicore
processors rely on fast private L1 caches to exploit
data locality and achieve high performance. In the
presence of high bit-cell fault rates, traditionally an
L1 cache either sacrifices capacity or incurs
additional latency to correct the faults. We observe
that L1 cache sensitivity to hit latency offers a
design trade-off between capacity and latency. When
fault rate is high at extreme Vccmin, it is beneficial
to recover L1 cache capacity, even if it comes at the
cost of additional latency. However, at low fault
rates, the additional constant latency to recover cache
capacity degrades performance. With this trade-off in
mind, we propose a Non-Uniform Cache Access L1
architecture (NUCA-L1) that avoids additional latency
on accesses to fault-free cache lines. To mitigate the
capacity bottleneck, it deploys a correction mechanism
to recover capacity at the cost of additional latency.
Using extensive simulations of a 64-core multicore, we
demonstrate that at various bit-cell fault rates, our
proposed private NUCA-L1 cache architecture performs
better than state-of-the-art schemes, along with a
significant reduction in energy consumption.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "29",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Drebes:2014:TAD,
author = "Andi Drebes and Karine Heydemann and Nathalie Drach
and Antoniu Pop and Albert Cohen",
title = "Topology-Aware and Dependence-Aware Scheduling and
Memory Allocation for Task-Parallel Languages",
journal = j-TACO,
volume = "11",
number = "3",
pages = "30:1--30:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2641764",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We present a joint scheduling and memory allocation
algorithm for efficient execution of task-parallel
programs on non-uniform memory architecture (NUMA)
systems. Task and data placement decisions are based on
a static description of the memory hierarchy and on
runtime information about intertask communication.
Existing locality-aware scheduling strategies for
fine-grained tasks have strong limitations: they are
specific to some class of machines or applications,
they do not handle task dependences, they require
manual program annotations, or they rely on fragile
profiling schemes. By contrast, our solution makes no
assumption on the structure of programs or on the
layout of data in memory. Experimental results, based
on the OpenStream language, show that locality of
accesses to main memory of scientific applications can
be increased significantly on a 64-core machine,
resulting in a speedup of up to 1.63$ \times $ compared
to a state-of-the-art work-stealing scheduler.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "30",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tawa:2014:EEF,
author = "Venkata Kalyan Tawa and Ravi Kasha and Madhu Mutyam",
title = "{EFGR}: an Enhanced Fine Granularity Refresh Feature
for High-Performance {DDR4 DRAM} Devices",
journal = j-TACO,
volume = "11",
number = "3",
pages = "31:1--31:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2656340",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "High-density DRAM devices spend significant time
refreshing the DRAM cells, leading to performance drop.
The JEDEC DDR4 standard provides a Fine Granularity
Refresh (FGR) feature to tackle refresh. Motivated by
the observation that in FGR mode, only a few banks are
involved, we propose an Enhanced FGR (EFGR) feature
that introduces three optimizations to the basic FGR
feature and exposes the bank-level parallelism within
the rank even during the refresh. The first
optimization decouples the nonrefreshing banks. The
second and third optimizations determine the maximum
number of nonrefreshing banks that can be active during
refresh and selectively precharge the banks before
refresh, respectively. Our simulation results show that
the EFGR feature is able to recover almost 56.6\% of
the performance loss incurred due to refresh
operations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "31",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yalcin:2014:EEC,
author = "Gulay Yalcin and Oguz Ergin and Emrah Islek and Osman
Sabri Unsal and Adrian Cristal",
title = "Exploiting Existing Comparators for Fine-Grained
Low-Cost Error Detection",
journal = j-TACO,
volume = "11",
number = "3",
pages = "32:1--32:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2656341",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Fault tolerance has become a fundamental concern in
computer design, in addition to performance and power.
Although several error detection schemes have been
proposed to discover a faulty core in the system, these
proposals could waste the whole core, including many
error-free structures in it after error detection.
Moreover, many fault-tolerant designs require
additional hardware for data replication or for
comparing the replicated data. In this study, we
provide a low-cost, fine-grained error detection scheme
by exploiting already existing comparators and data
replications in the several pipeline stages such as
issue queue, rename logic, and translation lookaside
buffer. We reduce the vulnerability of the source
register tags in IQ by 60\%, the vulnerability of
instruction TLB by 64\%, the vulnerability of data TLB
by 45\%, and the vulnerability of the register tags of
rename logic by 20\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "32",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ramachandran:2014:HFR,
author = "Pradeep Ramachandran and Siva Kumar Sastry Hari and
Manlap Li and Sarita V. Adve",
title = "Hardware Fault Recovery for {I/O} Intensive
Applications",
journal = j-TACO,
volume = "11",
number = "3",
pages = "33:1--33:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2656342",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With continued process scaling, the rate of hardware
failures in commodity systems is increasing. Because
these commodity systems are highly sensitive to cost,
traditional solutions that employ heavy redundancy to
handle such failures are no longer acceptable owing to
their high associated costs. Detecting such faults by
identifying anomalous software execution and recovering
through checkpoint-and-replay is emerging as a viable
low-cost alternative for future commodity systems. An
important but commonly ignored aspect of such solutions
is ensuring that external outputs to the system are
fault-free. The outputs must be delayed until the
detectors guarantee this, influencing fault-free
performance. The overheads for resiliency must thus be
evaluated while taking these delays into consideration;
prior work has largely ignored this relationship. This
article concerns recovery for I/O intensive
applications from in-core faults. We present a strategy
to buffer external outputs using dedicated hardware and
show that checkpoint intervals previously considered as
acceptable incur exorbitant overheads when hardware
buffering is considered. We then present two techniques
to reduce the checkpoint interval and demonstrate a
practical solution that provides high resiliency while
incurring low overheads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "33",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Eyerman:2014:MTM,
author = "Stijn Eyerman and Pierre Michaud and Wouter Rogiest",
title = "Multiprogram Throughput Metrics: a Systematic
Approach",
journal = j-TACO,
volume = "11",
number = "3",
pages = "34:1--34:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2663346",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Running multiple programs on a processor aims at
increasing the throughput of that processor. However,
defining meaningful throughput metrics in a simulation
environment is not as straightforward as reporting
execution time. This has led to an ongoing debate on
what forms a meaningful throughput metric for
multiprogram workloads. We present a method to
construct throughput metrics in a systematic way: we
start by expressing assumptions on job size, job
distribution, scheduling, and so forth that together
define a theoretical throughput experiment. The
throughput metric is then the average throughput of
this experiment. Different assumptions lead to
different metrics, so one should be aware of these
assumptions when making conclusions based on results
using a specific metric. Throughput metrics should
always be defined from explicit assumptions, because
this leads to a better understanding of the
implications and limits of the results obtained with
that metric. We elaborate multiple metrics based on
different assumptions. In particular, we identify the
assumptions that lead to the commonly used weighted
speedup and harmonic mean of speedups. Our study
clarifies that they are actual throughput metrics,
which was recently questioned. We also propose some new
throughput metrics, which cannot always be expressed as
a closed formula. We use real experimental data to
characterize metrics and show how they relate to each
other.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "34",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Nugteren:2015:BAS,
author = "Cedric Nugteren and Henk Corporaal",
title = "{Bones}: an Automatic Skeleton-Based {C-to-CUDA}
Compiler for {GPUs}",
journal = j-TACO,
volume = "11",
number = "4",
pages = "35:1--35:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2665079",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The shift toward parallel processor architectures has
made programming and code generation increasingly
challenging. To address this programmability challenge,
this article presents a technique to fully
automatically generate efficient and readable code for
parallel processors (with a focus on GPUs). This is
made possible by combining algorithmic skeletons,
traditional compilation, and ``algorithmic species,'' a
classification of program code. Compilation starts by
automatically annotating C code with class information
(the algorithmic species). This code is then fed into
the skeleton-based source-to-source compiler bones to
generate CUDA code. To generate efficient code, bones
also performs optimizations including host-accelerator
transfer optimization and kernel fusion. This results
in a unique approach, integrating a skeleton-based
compiler for the first time into an automated flow. The
benefits are demonstrated experimentally for PolyBench
GPU kernels, showing geometric mean speed-ups of 1.4$
\times $ and 2.4$ \times $ compared to ppcg and
Par4All, and for five Rodinia GPU benchmarks, showing a
gap of only 1.2$ \times $ compared to hand-optimized
code.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "35",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2015:BOM,
author = "Jue Wang and Xiangyu Dong and Yuan Xie",
title = "Building and Optimizing {MRAM}-Based Commodity
Memories",
journal = j-TACO,
volume = "11",
number = "4",
pages = "36:1--36:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2667105",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Emerging non-volatile memory technologies such as MRAM
are promising design solutions for energy-efficient
memory architecture, especially for mobile systems.
However, building commodity MRAM by reusing DRAM
designs is not straightforward. The existing memory
interfaces are incompatible with MRAM small page size,
and they fail to leverage MRAM unique properties,
causing unnecessary performance and energy overhead. In
this article, we propose four techniques to enable and
optimize an LPDDRx-compatible MRAM solution: ComboAS to
solve the pin incompatibility; DynLat to avoid
unnecessary access latencies; and EarlyPA and BufW to
further improve performance by exploiting the MRAM
unique features of non-destructive read and independent
write path. Combining all these techniques together, we
boost the MRAM performance by 17\% and provide a
DRAM-compatible MRAM solution consuming 21\% less
energy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "36",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Komuravelli:2015:RCH,
author = "Rakesh Komuravelli and Sarita V. Adve and Ching-Tsun
Chou",
title = "Revisiting the Complexity of Hardware Cache Coherence
and Some Implications",
journal = j-TACO,
volume = "11",
number = "4",
pages = "37:1--37:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2663345",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Cache coherence is an integral part of shared-memory
systems but is also widely considered to be one of the
most complex parts of such systems. Much prior work has
addressed this complexity and the verification
techniques to prove the correctness of hardware
coherence. Given the new multicore era with increasing
number of cores, there is a renewed debate about
whether the complexity of hardware coherence has been
tamed or whether it should be abandoned in favor of
software coherence. This article revisits the
complexity of hardware cache coherence by verifying a
publicly available, state-of-the-art implementation of
the widely used MESI protocol, using the Mur$ \varphi $
model checking tool. To our surprise, we found six bugs
in this protocol, most of which were hard to analyze
and took several days to fix. To compare the
complexity, we also verified the recently proposed
DeNovo protocol, which exploits disciplined software
programming models. We found three relatively easy to
fix bugs in this less mature protocol. After fixing
these bugs, our verification experiments showed that,
compared to DeNovo, MESI had 15X more reachable states
leading to a 20X increase in verification (model
checking) time. Although we were eventually successful
in verifying the protocols, the tool required making
several simplifying assumptions (e.g., two cores, one
address). Our results have several implications: (1)
they indicate that hardware coherence protocols remain
complex; (2) they reinforce the need for protocol
designers to embrace formal verification tools to
demonstrate correctness of new protocols and
extensions; (3) they reinforce the need for formal
verification tools that are both scalable and usable by
non-expert; and (4) they show that a system based on
hardware-software co-design can offer a simpler
approach for cache coherence, thus reducing the overall
verification effort and allowing verification of more
detailed models and protocol extensions that are
otherwise limited by computing resources.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "37",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Rodriguez:2015:VSR,
author = "Gabriel Rodr{\'\i}guez and Juan Touri{\~n}o and Mahmut
T. Kandemir",
title = "Volatile {STT--RAM} Scratchpad Design and Data
Allocation for Low Energy",
journal = j-TACO,
volume = "11",
number = "4",
pages = "38:1--38:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2669556",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "On-chip power consumption is one of the fundamental
challenges of current technology scaling. Cache
memories consume a sizable part of this power,
particularly due to leakage energy. STT-RAM is one of
several new memory technologies that have been proposed
in order to improve power while preserving performance.
It features high density and low leakage, but at the
expense of write energy and performance. This article
explores the use of STT-RAM--based scratchpad memories
that trade nonvolatility in exchange for faster and
less energetically expensive accesses, making them
feasible for on-chip implementation in embedded
systems. A novel multiretention scratchpad partitioning
is proposed, featuring multiple storage spaces with
different retention, energy, and performance
characteristics. A customized compiler-based allocation
algorithm suitable for use with such a scratchpad
organization is described. Our experiments indicate
that a multiretention STT-RAM scratchpad can provide
energy savings of 53\% with respect to an iso-area,
hardware-managed SRAM cache.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "38",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Camarero:2015:TCH,
author = "Crist{\'o}bal Camarero and Enrique Vallejo and
Ram{\'o}n Beivide",
title = "Topological Characterization of {Hamming} and
Dragonfly Networks and Its Implications on Routing",
journal = j-TACO,
volume = "11",
number = "4",
pages = "39:1--39:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2677038",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Current High-Performance Computing (HPC) and data
center networks rely on large-radix routers. Hamming
graphs (Cartesian products of complete graphs) and
dragonflies (two-level direct networks with nodes
organized in groups) are some direct topologies
proposed for such networks. The original definition of
the dragonfly topology is very loose, with several
degrees of freedom, such as the inter- and intragroup
topology, the specific global connectivity, and the
number of parallel links between groups (or trunking
level). This work provides a comprehensive analysis of
the topological properties of the dragonfly network,
providing balancing conditions for network
dimensioning, as well as introducing and classifying
several alternatives for the global connectivity and
trunking level. From a topological study of the
network, it is noted that a Hamming graph can be seen
as a canonical dragonfly topology with a high level of
trunking. Based on this observation and by carefully
selecting the global connectivity, the Dimension Order
Routing (DOR) mechanism safely used in Hamming graphs
is adapted to dragonfly networks with trunking. The
resulting routing algorithms approximate the
performance of minimal, nonminimal, and adaptive
routings typically used in dragonflies but without
requiring virtual channels to avoid packet deadlock,
thus allowing for lower cost router implementations.
This is obtained by properly selecting the link to
route between groups based on a graph coloring of
network routers. Evaluations show that the proposed
mechanisms are competitive with traditional solutions
when using the same number of virtual channels and
enable for simpler implementations with lower cost.
Finally, multilevel dragonflies are discussed,
considering how the proposed mechanisms could be
adapted to them.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yoon:2015:EDM,
author = "Hanbin Yoon and Justin Meza and Naveen Muralimanohar
and Norman P. Jouppi and Onur Mutlu",
title = "Efficient Data Mapping and Buffering Techniques for
Multilevel Cell Phase-Change Memories",
journal = j-TACO,
volume = "11",
number = "4",
pages = "40:1--40:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2669365",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "New phase-change memory (PCM) devices have low-access
latencies (like DRAM) and high capacities (i.e., low
cost per bit, like Flash). In addition to being able to
scale to smaller cell sizes than DRAM, a PCM cell can
also store multiple bits per cell (referred to as
multilevel cell, or MLC), enabling even greater
capacity per bit. However, reading and writing the
different bits of data from and to an MLC PCM cell
requires different amounts of time: one bit is read or
written first, followed by another. Due to this
asymmetric access process, the bits in an MLC PCM cell
have different access latency and energy depending on
which bit in the cell is being read or written. We
leverage this observation to design a new way to store
and buffer data in MLC PCM devices. While traditional
devices couple the bits in each cell next to one
another in the address space, our key idea is to
logically decouple the bits in each cell into two
separate regions depending on their read/write
characteristics: fast-read/slow-write bits and
slow-read/fast-write bits. We propose a low-overhead
hardware/software technique to predict and map data
that would benefit from being in each region at
runtime. In addition, we show how MLC bit decoupling
provides more flexibility in the way data is buffered
in the device, enabling more efficient use of existing
device buffer space. Our evaluations for a multicore
system show that MLC bit decoupling improves system
performance by 19.2\%, memory energy efficiency by
14.4\%, and thread fairness by 19.3\% over a
state-of-the-art MLC PCM system that couples the bits
in its cells. We show that our results are consistent
across a variety of workloads and system
configurations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "40",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Premillieu:2015:EOE,
author = "Nathanael Pr{\'e}millieu and Andr{\'e} Seznec",
title = "Efficient Out-of-Order Execution of Guarded {ISAs}",
journal = j-TACO,
volume = "11",
number = "4",
pages = "41:1--41:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2677037",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "ARM ISA-based processors are no longer low-cost,
low-power processors. Nowadays, ARM ISA-based processor
manufacturers are striving to implement medium-end to
high-end processor cores, which implies implementing a
state-of-the-art out-of-order execution engine.
Unfortunately, providing efficient out-of-order
execution on legacy ARM codes may be quite challenging
due to guarded instructions. Predicting the guarded
instructions addresses the main serialization impact
associated with guarded instructions execution and the
multiple definition problem. Moreover, guard prediction
allows one to use a global branch-and-guard history
predictor to predict both branches and guards, often
improving branch prediction accuracy. Unfortunately,
such a global branch-and-guard history predictor
requires the systematic use of guard predictions. In
that case, poor guard prediction accuracy would lead to
poor overall performance on some applications. Building
on top of recent advances in branch prediction and
confidence estimation, we propose a hybrid
branch-and-guard predictor, combining a global branch
history component and global branch-and-guard history
component. The potential gain or loss due to the
systematic use of guard prediction is dynamically
evaluated at runtime. Two computing modes are enabled:
systematic guard prediction use and
high-confidence-only guard prediction use. Our
experiments show that on most applications, an
overwhelming majority of guarded instructions are
predicted. Therefore, a simple but relatively
inefficient hardware solution can be used to execute
the few unpredicted guarded instructions. Significant
performance benefits are observed on most applications,
while applications with poorly predictable guards do
not suffer from performance loss.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "41",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2015:APM,
author = "Zheng Wang and Dominik Grewe and Michael F. P.
O'Boyle",
title = "Automatic and Portable Mapping of Data Parallel
Programs to {OpenCL} for {GPU}-Based Heterogeneous
Systems",
journal = j-TACO,
volume = "11",
number = "4",
pages = "42:1--42:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2677036",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "General-purpose GPU-based systems are highly
attractive, as they give potentially massive
performance at little cost. Realizing such potential is
challenging due to the complexity of programming. This
article presents a compiler-based approach to
automatically generate optimized OpenCL code from data
parallel OpenMP programs for GPUs. A key feature of our
scheme is that it leverages existing transformations,
especially data transformations, to improve performance
on GPU architectures and uses automatic machine
learning to build a predictive model to determine if it
is worthwhile running the OpenCL code on the GPU or
OpenMP code on the multicore host. We applied our
approach to the entire NAS parallel benchmark suite and
evaluated it on distinct GPU-based systems. We achieved
average (up to) speedups of $ 4.51 \times $ and $ 4.20
\times $ ($ 143 \times $ and $ 67 \times $) on Core
i7/NVIDIA GeForce GTX580 and Core i7/AMD Radeon 7970
platforms, respectively, over a sequential baseline.
Our approach achieves, on average, greater than $ 10
\times $ speedups over two state-of-the-art automatic
GPU code generators.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "42",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{He:2015:IHF,
author = "Dan He and Fang Wang and Hong Jiang and Dan Feng and
Jing Ning Liu and Wei Tong and Zheng Zhang",
title = "Improving Hybrid {FTL} by Fully Exploiting Internal
{SSD} Parallelism with Virtual Blocks",
journal = j-TACO,
volume = "11",
number = "4",
pages = "43:1--43:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2677160",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Compared with either block or page-mapping Flash
Translation Layer (FTL), hybrid-mapping FTL for flash
Solid State Disks (SSDs), such as Fully Associative
Section Translation (FAST), has relatively high space
efficiency because of its smaller mapping table than
the latter and higher flexibility than the former. As a
result, hybrid-mapping FTL has become the most commonly
used scheme in SSDs. But the hybrid-mapping FTL incurs
a large number of costly full-merge operations. Thus, a
critical challenge to hybrid-mapping FTL is how to
reduce the cost of full-merge operations and improve
partial merge operations and switch operations. In this
article, we propose a novel FTL scheme, called Virtual
Block-based Parallel FAST (VBP-FAST), that divides
flash area into Virtual Blocks (VBlocks) and Physical
Blocks (PBlocks) where VBlocks are used to fully
exploit channel-level, die-level, and plane-level
parallelism of flash. Leveraging these three levels of
parallelism, the cost of full merge in VBP-FAST is
significantly reduced from that of FAST. In the
meantime, VBP-FAST uses PBlocks to retain the
advantages of partial merge and switch operations. Our
extensive trace-driven simulation results show that
VBP-FAST speeds up FAST by a factor of 5.3--8.4 for
random workloads and of 1.7 for sequential workloads
with channel-level, die-level, and plane-level
parallelism of 8, 2, and 2 (i.e., eight channels, two
dies, and two planes).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "43",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Rubin:2015:MOM,
author = "Eri Rubin and Ely Levy and Amnon Barak and Tal
Ben-Nun",
title = "{MAPS}: Optimizing Massively Parallel Applications
Using Device-Level Memory Abstraction",
journal = j-TACO,
volume = "11",
number = "4",
pages = "44:1--44:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2680544",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "GPUs play an increasingly important role in
high-performance computing. While developing naive code
is straightforward, optimizing massively parallel
applications requires deep understanding of the
underlying architecture. The developer must struggle
with complex index calculations and manual memory
transfers. This article classifies memory access
patterns used in most parallel algorithms, based on
Berkeley's Parallel ``Dwarfs.'' It then proposes the
MAPS framework, a device-level memory abstraction that
facilitates memory access on GPUs, alleviating complex
indexing using on-device containers and iterators. This
article presents an implementation of MAPS and shows
that its performance is comparable to carefully
optimized implementations of real-world applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "44",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cilardo:2015:IMM,
author = "Alessandro Cilardo and Luca Gallo",
title = "Improving Multibank Memory Access Parallelism with
Lattice-Based Partitioning",
journal = j-TACO,
volume = "11",
number = "4",
pages = "45:1--45:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2675359",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Emerging architectures, such as reconfigurable
hardware platforms, provide the unprecedented
opportunity of customizing the memory infrastructure
based on application access patterns. This work
addresses the problem of automated memory partitioning
for such architectures, taking into account potentially
parallel data accesses to physically independent banks.
Targeted at affine static control parts (SCoPs), the
technique relies on the Z-polyhedral model for program
analysis and adopts a partitioning scheme based on
integer lattices. The approach enables the definition
of a solution space including previous works as
particular cases. The problem of minimizing the total
amount of memory required across the partitioned banks,
referred to as storage minimization throughout the
article, is tackled by an optimal approach yielding
asymptotically zero memory waste or, as an alternative,
an efficient approach ensuring arbitrarily small waste.
The article also presents a prototype toolchain and a
detailed step-by-step case study demonstrating the
impact of the proposed technique along with extensive
comparisons with alternative approaches in the
literature.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "45",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Martinsen:2015:EPT,
author = "Jan Kasper Martinsen and H{\aa}kan Grahn and Anders
Isberg",
title = "The Effects of Parameter Tuning in Software
Thread-Level Speculation in {JavaScript} Engines",
journal = j-TACO,
volume = "11",
number = "4",
pages = "46:1--46:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2686036",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "JavaScript is a sequential programming language that
has a large potential for parallel execution in Web
applications. Thread-level speculation can take
advantage of this, but it has a large memory overhead.
In this article, we evaluate the effects of adjusting
various parameters for thread-level speculation. Our
results clearly show that thread-level speculation is a
useful technique for taking advantage of multicore
architectures for JavaScript in Web applications, that
nested speculation is required in thread-level
speculation, and that the execution characteristics of
Web applications significantly reduce the needed
memory, the number of threads, and the depth of our
speculation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "46",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Colombet:2015:SOS,
author = "Quentin Colombet and Florian Brandner and Alain
Darte",
title = "Studying Optimal Spilling in the Light of {SSA}",
journal = j-TACO,
volume = "11",
number = "4",
pages = "47:1--47:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2685392",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Recent developments in register allocation, mostly
linked to static single assignment (SSA) form, have
shown the benefits of decoupling the problem in two
phases: a first spilling phase places load and store
instructions so that the register pressure at all
program points is small enough, and a second assignment
and coalescing phase maps the variables to physical
registers and reduces the number of move instructions
among registers. This article focuses on the first
phase, for which many open questions remain: in
particular, we study the notion of optimal spilling
(what can be expressed?) and the impact of SSA form
(does it help?). To identify the important features for
optimal spilling on load-store architectures, we
develop a new integer linear programming formulation,
more accurate and expressive than previous approaches.
Among other features, we can express SSA $ \phi
$-functions, memory-to-memory copies, and the fact that
a value can be stored simultaneously in a register and
in memory. Based on this formulation, we present a
thorough analysis of the results obtained for the
SPECINT 2000 and EEMBC 1.1 benchmarks, from which we
draw, among others, the following conclusions: (1)
rematerialization is extremely important; (2) SSA
complicates the formulation of optimal spilling,
especially because of memory coalescing when the code
is not in conventional SSA (CSSA); (3)
microarchitectural features are significant and thus
have to be accounted for; and (4) significant savings
can be obtained in terms of static spill costs, cache
miss rates, and dynamic instruction counts.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "47",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Haj-Yihia:2015:CDP,
author = "Jawad Haj-Yihia and Yosi {Ben Asher} and Efraim Rotem
and Ahmad Yasin and Ran Ginosar",
title = "Compiler-Directed Power Management for Superscalars",
journal = j-TACO,
volume = "11",
number = "4",
pages = "48:1--48:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2685393",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Modern superscalar CPUs contain large complex
structures and diverse execution units, consuming wide
dynamic power range. Building a power delivery network
for the worst-case power consumption is not energy
efficient and often is impossible to fit in small
systems. Instantaneous power excursions can cause
voltage droops. Power management algorithms are too
slow to respond to instantaneous events. In this
article, we propose a novel compiler-directed framework
to address this problem. The framework is validated on
a 4th Generation Intel\reg{} CoreTM processor and with
simulator on output trace. Up to 16\% performance
speedup is measured over baseline for the SPEC CPU2006
benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "48",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Trinh:2015:EDE,
author = "Hong-Phuc Trinh and Marc Duranton and Michel
Paindavoine",
title = "Efficient Data Encoding for Convolutional Neural
Network application",
journal = j-TACO,
volume = "11",
number = "4",
pages = "49:1--49:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2685394",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article presents an approximate data encoding
scheme called Significant Position Encoding (SPE). The
encoding allows efficient implementation of the recall
phase (forward propagation pass) of Convolutional
Neural Networks (CNN)-a typical Feed-Forward Neural
Network. This implementation uses only 7 bits data
representation and achieves almost the same
classification performance compared with the initial
network: on MNIST handwriting recognition task, using
this data encoding scheme losses only 0.03\% in terms
of recognition rate (99.27\% vs. 99.3\%). In terms of
storage, we achieve a 12.5\% gain compared with an 8
bits fixed-point implementation of the same CNN.
Moreover, this data encoding allows efficient
implementation of processing unit thanks to the
simplicity of scalar product operation-the principal
operation in a Feed-Forward Neural Network.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "49",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Breugh:2015:MAM,
author = "Maximilien B. Breugh and Stijn Eyerman and Lieven
Eeckhout",
title = "Mechanistic Analytical Modeling of Superscalar
In-Order Processor Performance",
journal = j-TACO,
volume = "11",
number = "4",
pages = "50:1--50:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2678277",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Superscalar in-order processors form an interesting
alternative to out-of-order processors because of their
energy efficiency and lower design complexity. However,
despite the reduced design complexity, it is nontrivial
to get performance estimates or insight in the
application--microarchitecture interaction without
running slow, detailed cycle-level simulations, because
performance highly depends on the order of instructions
within the application's dynamic instruction stream, as
in-order processors stall on interinstruction
dependences and functional unit contention. To limit
the number of detailed cycle-level simulations needed
during design space exploration, we propose a
mechanistic analytical performance model that is built
from understanding the internal mechanisms of the
processor. The mechanistic performance model for
superscalar in-order processors is shown to be accurate
with an average performance prediction error of 3.2\%
compared to detailed cycle-accurate simulation using
gem5. We also validate the model against hardware,
using the ARM Cortex-A8 processor and show that it is
accurate within 10\% on average. We further demonstrate
the usefulness of the model through three case studies:
(1) design space exploration, identifying the optimum
number of functional units for achieving a given
performance target; (2) program--machine interactions,
providing insight into microarchitecture bottlenecks;
and (3) compiler--architecture interactions,
visualizing the impact of compiler optimizations on
performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "50",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Seshadri:2015:MPC,
author = "Vivek Seshadri and Samihan Yedkar and Hongyi Xin and
Onur Mutlu and Phillip B. Gibbons and Michael A. Kozuch
and Todd C. Mowry",
title = "Mitigating Prefetcher-Caused Pollution Using Informed
Caching Policies for Prefetched Blocks",
journal = j-TACO,
volume = "11",
number = "4",
pages = "51:1--51:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2677956",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Many modern high-performance processors prefetch
blocks into the on-chip cache. Prefetched blocks can
potentially pollute the cache by evicting more useful
blocks. In this work, we observe that both accurate and
inaccurate prefetches lead to cache pollution, and
propose a comprehensive mechanism to mitigate
prefetcher-caused cache pollution. First, we observe
that over 95\% of useful prefetches in a wide variety
of applications are not reused after the first demand
hit (in secondary caches). Based on this observation,
our first mechanism simply demotes a prefetched block
to the lowest priority on a demand hit. Second, to
address pollution caused by inaccurate prefetches, we
propose a self-tuning prefetch accuracy predictor to
predict if a prefetch is accurate or inaccurate. Only
predicted-accurate prefetches are inserted into the
cache with a high priority. Evaluations show that our
final mechanism, which combines these two ideas,
significantly improves performance compared to both the
baseline LRU policy and two state-of-the-art approaches
to mitigating prefetcher-caused cache pollution (up to
49\%, and 6\% on average for 157 two-core
multiprogrammed workloads). The performance improvement
is consistent across a wide variety of system
configurations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "51",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Matheou:2015:ASD,
author = "George Matheou and Paraskevas Evripidou",
title = "Architectural Support for Data-Driven Execution",
journal = j-TACO,
volume = "11",
number = "4",
pages = "52:1--52:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2686874",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The exponential growth of sequential processors has
come to an end, and thus, parallel processing is
probably the only way to achieve performance growth. We
propose the development of parallel architectures based
on data-driven scheduling. Data-driven scheduling
enforces only a partial ordering as dictated by the
true data dependencies, which is the minimum
synchronization possible. This is very beneficial for
parallel processing because it enables it to exploit
the maximum possible parallelism. We provide
architectural support for data-driven execution for the
Data-Driven Multithreading (DDM) model. In the past,
DDM has been evaluated mostly in the form of virtual
machines. The main contribution of this work is the
development of a highly efficient hardware support for
data-driven execution and its integration into a
multicore system with eight cores on a Virtex-6 FPGA.
The DDM semantics make barriers and cache coherence
unnecessary, which reduces the synchronization
latencies significantly and makes the cache simpler.
The performance evaluation has shown that the support
for data-driven execution is very efficient with
negligible overheads. Our prototype can support very
small problem sizes (matrix $ 16 \times 16$) and
ultra-lightweight threads (block of $ 4 \times 4$) that
achieve speedups close to linear. Such results cannot
be achieved by software-based systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Morad:2015:GSP,
author = "Amir Morad and Leonid Yavits and Ran Ginosar",
title = "{GP--SIMD} Processing-in-Memory",
journal = j-TACO,
volume = "11",
number = "4",
pages = "53:1--53:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2686875",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "GP-SIMD, a novel hybrid general-purpose SIMD computer
architecture, resolves the issue of data
synchronization by in-memory computing through
combining data storage and massively parallel
processing. GP-SIMD employs a two-dimensional access
memory with modified SRAM storage cells and a
bit-serial processing unit per each memory row. An
analytic performance model of the GP-SIMD architecture
is presented, comparing it to associative processor and
to conventional SIMD architectures. Cycle-accurate
simulation of four workloads supports the analytical
comparison. Assuming a moderate die area, GP-SIMD
architecture outperforms both the associative processor
and conventional SIMD coprocessor architectures by
almost an order of magnitude while consuming less
power.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "53",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Schaub:2015:ISW,
author = "Thomas Schaub and Simon Moll and Ralf Karrenberg and
Sebastian Hack",
title = "The Impact of the {SIMD} Width on Control-Flow and
Memory Divergence",
journal = j-TACO,
volume = "11",
number = "4",
pages = "54:1--54:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2687355",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Power consumption is a prevalent issue in current and
future computing systems. SIMD processors amortize the
power consumption of managing the instruction stream by
executing the same instruction in parallel on multiple
data. Therefore, in the past years, the SIMD width has
steadily increased, and it is not unlikely that it will
continue to do so. In this article, we experimentally
study the influence of the SIMD width to the execution
of data-parallel programs. We investigate how an
increasing SIMD width (up to 1024) influences
control-flow divergence and memory-access divergence,
and how well techniques to mitigate them will work on
larger SIMD widths. We perform our study on 76 OpenCL
applications and show that a group of programs scales
well up to SIMD width 1024, whereas another group of
programs increasingly suffers from control-flow
divergence. For those programs, thread regrouping
techniques may become increasingly important for larger
SIMD widths. We show what average speedups can be
expected when increasing the SIMD width. For example,
when switching from scalar execution to SIMD width 64,
one can expect a speedup of 60.11, which increases to
62.46 when using thread regrouping. We also analyze the
frequency of regular (uniform, consecutive) memory
access patterns and observe a monotonic decrease of
regular memory accesses from 82.6 at SIMD width 4 to
43.1\% at SIMD width 1024.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "54",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fang:2015:MMD,
author = "Zhenman Fang and Sanyam Mehta and Pen-Chung Yew and
Antonia Zhai and James Greensky and Gautham Beeraka and
Binyu Zang",
title = "Measuring Microarchitectural Details of Multi- and
Many-Core Memory Systems through Microbenchmarking",
journal = j-TACO,
volume = "11",
number = "4",
pages = "55:1--55:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2687356",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As multicore and many-core architectures evolve, their
memory systems are becoming increasingly more complex.
To bridge the latency and bandwidth gap between the
processor and memory, they often use a mix of
multilevel private/shared caches that are either
blocking or nonblocking and are connected by high-speed
network-on-chip. Moreover, they also incorporate
hardware and software prefetching and simultaneous
multithreading (SMT) to hide memory latency. On such
multi- and many-core systems, to incorporate various
memory optimization schemes using compiler
optimizations and performance tuning techniques, it is
crucial to have microarchitectural details of the
target memory system. Unfortunately, such details are
often unavailable from vendors, especially for newly
released processors. In this article, we propose a
novel microbenchmarking methodology based on short
elapsed-time events (SETEs) to obtain comprehensive
memory microarchitectural details in multi- and
many-core processors. This approach requires detailed
analysis of potential interfering factors that could
affect the intended behavior of such memory systems. We
lay out effective guidelines to control and mitigate
those interfering factors. Taking the impact of SMT
into consideration, our proposed methodology not only
can measure traditional cache/memory latency and
off-chip bandwidth but also can uncover the details of
software and hardware prefetching units not attempted
in previous studies. Using the newly released Intel
Xeon Phi many-core processor (with in-order cores) as
an example, we show how we can use a set of
microbenchmarks to determine various microarchitectural
features of its memory system (many are undocumented
from vendors). To demonstrate the portability and
validate the correctness of such a methodology, we use
the well-documented Intel Sandy Bridge multicore
processor (with out-of-order cores) as another example,
where most data are available and can be validated.
Moreover, to illustrate the usefulness of the measured
data, we do a multistage coordinated data prefetching
case study on both Xeon Phi and Sandy Bridge and show
that by using the measured data, we can achieve 1.3X
and 1.08X performance speedup, respectively, compared
to the state-of-the-art Intel ICC compiler. We believe
that these measurements also provide useful insights
into memory optimization, analysis, and modeling of
such multicore and many-core architectures.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chi:2015:LPH,
author = "Chi Ching Chi and Mauricio Alvarez-Mesa and Ben
Juurlink",
title = "Low-Power High-Efficiency Video Decoding using
General-Purpose Processors",
journal = j-TACO,
volume = "11",
number = "4",
pages = "56:1--56:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2685551",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this article, we investigate how code optimization
techniques and low-power states of general-purpose
processors improve the power efficiency of HEVC
decoding. The power and performance efficiency of the
use of SIMD instructions, multicore architectures, and
low-power active and idle states are analyzed in detail
for offline video decoding. In addition, the power
efficiency of techniques such as ``race to idle'' and
``exploiting slack'' with DVFS are evaluated for
real-time video decoding. Results show that
``exploiting slack'' is more power efficient than
``race to idle'' for all evaluated platforms
representing smartphone, tablet, laptop, and desktop
computing systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "56",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Luporini:2015:CLO,
author = "Fabio Luporini and Ana Lucia Varbanescu and Florian
Rathgeber and Gheorghe-Teodor Bercea and J. Ramanujam
and David A. Ham and Paul H. J. Kelly",
title = "Cross-Loop Optimization of Arithmetic Intensity for
Finite Element Local Assembly",
journal = j-TACO,
volume = "11",
number = "4",
pages = "57:1--57:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2687415",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We study and systematically evaluate a class of
composable code transformations that improve arithmetic
intensity in local assembly operations, which represent
a significant fraction of the execution time in finite
element methods. Their performance optimization is
indeed a challenging issue. Even though affine loop
nests are generally present, the short trip counts and
the complexity of mathematical expressions, which vary
among different problems, make it hard to determine an
optimal sequence of successful transformations. Our
investigation has resulted in the implementation of a
compiler (called COFFEE) for local assembly kernels,
fully integrated with a framework for developing finite
element methods. The compiler manipulates abstract
syntax trees generated from a domain-specific language
by introducing domain-aware optimizations for
instruction-level parallelism and register locality.
Eventually, it produces C code including vector SIMD
intrinsics. Experiments using a range of real-world
finite element problems of increasing complexity show
that significant performance improvement is achieved.
The generality of the approach and the applicability of
the proposed code transformations to other domains is
also discussed.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "57",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhou:2015:OPS,
author = "Xing Zhou and Mar{\'\i}a J. Garzar{\'a}n and David A.
Padua",
title = "Optimal Parallelogram Selection for Hierarchical
Tiling",
journal = j-TACO,
volume = "11",
number = "4",
pages = "58:1--58:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2687414",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Loop tiling is an effective optimization to improve
performance of multiply nested loops, which are the
most time-consuming parts in many programs. Most
massively parallel systems today are organized
hierarchically, and different levels of the hierarchy
differ in the organization of parallelism and the
memory models they adopt. To make better use of these
machines, it is clear that loop nests should be tiled
hierarchically to fit the hierarchical organization of
the machine; however, it is not so clear what should be
the exact form of these hierarchical tiles. In
particular, tile shape selection is of critical
importance to expose parallelism of the tiled loop
nests. Although loop tiling is a well-known
optimization, not much is known about tile shape
selection. In this article, we study tile shape
selection when the shapes are any type of
parallelograms and introduce a model to relate the tile
shape of the hierarchy to the execution time. Using
this model, we implement a system that automatically
finds the tile shapes that minimize the execution time
in a hierarchical system. Our experimental results show
that in several cases, the tiles automatically selected
by our system outperform the most intuitive tiling
schemes usually adopted by programmers because of their
simplicity.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "58",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Porter:2015:MMS,
author = "Leo Porter and Michael A. Laurenzano and Ananta Tiwari
and Adam Jundt and William A. {Ward, Jr.} and Roy
Campbell and Laura Carrington",
title = "Making the Most of {SMT} in {HPC}: System- and
Application-Level Perspectives",
journal = j-TACO,
volume = "11",
number = "4",
pages = "59:1--59:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2687651",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This work presents an end-to-end methodology for
quantifying the performance and power benefits of
simultaneous multithreading (SMT) for HPC centers and
applies this methodology to a production system and
workload. Ultimately, SMT's value system-wide depends
on whether users effectively employ SMT at the
application level. However, predicting SMT's benefit
for HPC applications is challenging; by doubling the
number of threads, the application's characteristics
may change. This work proposes statistical modeling
techniques to predict the speedup SMT confers to HPC
applications. This approach, accurate to within 8\%,
uses only lightweight, transparent performance monitors
collected during a single run of the application.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "59",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tong:2015:OMT,
author = "Xin Tong and Toshihiko Koju and Motohiro Kawahito and
Andreas Moshovos",
title = "Optimizing Memory Translation Emulation in Full System
Emulators",
journal = j-TACO,
volume = "11",
number = "4",
pages = "60:1--60:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2686034",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The emulation speed of a full system emulator (FSE)
determines its usefulness. This work quantitatively
measures where time is spent in QEMU [Bellard 2005], an
industrial-strength FSE. The analysis finds that memory
emulation is one of the most heavily exercised emulator
components. For workloads studied, 38.1\% of the
emulation time is spent in memory emulation on average,
even though QEMU implements a software translation
lookaside buffer (STLB) to accelerate dynamic address
translation. Despite the amount of time spent in memory
emulation, there has been no study on how to further
improve its speed. This work analyzes where time is
spent in memory emulation and studies the performance
impact of a number of STLB optimizations. Although
there are several performance optimization techniques
for hardware TLBs, this work finds that the trade-offs
with an STLB are quite different compared to those with
hardware TLBs. As a result, not all hardware TLB
performance optimization techniques are applicable to
STLBs and vice versa. The evaluated STLB optimizations
target STLB lookups, as well as refills, and result in
an average emulator performance improvement of 24.4\%
over the baseline.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "60",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kong:2015:CRF,
author = "Martin Kong and Antoniu Pop and Louis-No{\"e}l Pouchet
and R. Govindarajan and Albert Cohen and P.
Sadayappan",
title = "Compiler\slash Runtime Framework for Dynamic Dataflow
Parallelization of Tiled Programs",
journal = j-TACO,
volume = "11",
number = "4",
pages = "61:1--61:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2687652",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Task-parallel languages are increasingly popular. Many
of them provide expressive mechanisms for intertask
synchronization. For example, OpenMP 4.0 will integrate
data-driven execution semantics derived from the StarSs
research language. Compared to the more restrictive
data-parallel and fork-join concurrency models, the
advanced features being introduced into task-parallel
models in turn enable improved scalability through load
balancing, memory latency hiding, mitigation of the
pressure on memory bandwidth, and, as a side effect,
reduced power consumption. In this article, we develop
a systematic approach to compile loop nests into
concurrent, dynamically constructed graphs of dependent
tasks. We propose a simple and effective heuristic that
selects the most profitable parallelization idiom for
every dependence type and communication pattern. This
heuristic enables the extraction of interband
parallelism (cross-barrier parallelism) in a number of
numerical computations that range from linear algebra
to structured grids and image processing. The proposed
static analysis and code generation alleviates the
burden of a full-blown dependence resolver to track the
readiness of tasks at runtime. We evaluate our approach
and algorithms in the PPCG compiler, targeting
OpenStream, a representative dataflow task-parallel
language with explicit intertask dependences and a
lightweight runtime. Experimental results demonstrate
the effectiveness of the approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "61",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Melot:2015:FCS,
author = "Nicolas Melot and Christoph Kessler and J{\"o}rg
Keller and Patrick Eitschberger",
title = "Fast Crown Scheduling Heuristics for Energy-Efficient
Mapping and Scaling of Moldable Streaming Tasks on
Manycore Systems",
journal = j-TACO,
volume = "11",
number = "4",
pages = "62:1--62:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2687653",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Exploiting effectively massively parallel
architectures is a major challenge that stream
programming can help facilitate. We investigate the
problem of generating energy-optimal code for a
collection of streaming tasks that include
parallelizable or moldable tasks on a generic manycore
processor with dynamic discrete frequency scaling.
Streaming task collections differ from classical task
sets in that all tasks are running concurrently, so
that cores typically run several tasks that are
scheduled round-robin at user level in a data-driven
way. A stream of data flows through the tasks and
intermediate results may be forwarded to other tasks,
as in a pipelined task graph. In this article, we
consider crown scheduling, a novel technique for the
combined optimization of resource allocation, mapping,
and discrete voltage/frequency scaling for moldable
streaming task collections in order to optimize energy
efficiency given a throughput constraint. We first
present optimal offline algorithms for separate and
integrated crown scheduling based on integer linear
programming (ILP). We make no restricting assumption
about speedup behavior. We introduce the fast heuristic
Longest Task, Lowest Group (LTLG) as a generalization
of the Longest Processing Time (LPT) algorithm to
achieve a load-balanced mapping of parallel tasks, and
the Height heuristic for crown frequency scaling. We
use them in feedback loop heuristics based on binary
search and simulated annealing to optimize crown
allocation. Our experimental evaluation of the ILP
models for a generic manycore architecture shows that
at least for small and medium-sized streaming task
collections even the integrated variant of crown
scheduling can be solved to optimality by a
state-of-the-art ILP solver within a few seconds. Our
heuristics produce makespan and energy consumption
close to optimality within the limits of the
phase-separated crown scheduling technique and the
crown structure. Their optimization time is longer than
the one of other algorithms we test, but our heuristics
consistently produce better solutions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "62",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ruan:2015:TRM,
author = "Wenjia Ruan and Yujie Liu and Michael Spear",
title = "Transactional Read-Modify-Write Without Aborts",
journal = j-TACO,
volume = "11",
number = "4",
pages = "63:1--63:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2688904",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Language-level transactions are said to provide
``atomicity,'' implying that the order of operations
within a transaction should be invisible to concurrent
transactions and thus that independent operations
within a transaction should be safe to execute in any
order. In this article, we present a mechanism for
dynamically reordering memory operations within a
transaction so that read-modify-write operations on
highly contended locations can be delayed until the
very end of the transaction. When integrated with
traditional transactional conflict detection
mechanisms, our approach reduces aborts on hot memory
locations, such as statistics counters, thereby
improving throughput and reducing wasted work. We
present three algorithms for delaying highly contended
read-modify-write operations within transactions, and
we evaluate their impact on throughput for eager and
lazy transactional systems across multiple workloads.
We also discuss complications that arise from the
interaction between our mechanism and the need for
strong language-level semantics, and we propose
algorithmic extensions that prevent errors from
occurring when accesses are aggressively reordered in a
transactional memory implementation with weak
semantics.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "63",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{UlHuda:2015:UTM,
author = "Zia {Ul Huda} and Ali Jannesari and Felix Wolf",
title = "Using Template Matching to Infer Parallel Design
Patterns",
journal = j-TACO,
volume = "11",
number = "4",
pages = "64:1--64:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2688905",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The triumphant spread of multicore processors over the
past decade increases the pressure on software
developers to exploit the growing amount of parallelism
available in the hardware. However, writing parallel
programs is generally challenging. For sequential
programs, the formulation of design patterns marked a
turning point in software development, boosting
programmer productivity and leading to more reusable
and maintainable code. While the literature is now also
reporting a rising number of parallel design patterns,
programmers confronted with the task of parallelizing
an existing sequential program still struggle with the
question of which parallel pattern to apply where in
their code. In this article, we show how template
matching, a technique traditionally used in the
discovery of sequential design patterns, can also be
used to support parallelization decisions. After
looking for matches in a previously extracted dynamic
dependence graph, we classify code blocks of the input
program according to the structure of the parallel
patterns we find. Based on this information, the
programmer can easily implement the detected pattern
and create a parallel version of his or her program. We
tested our approach with six programs, in which we
successfully detected pipeline and do-all patterns.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "64",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Litz:2015:ECA,
author = "Heiner Litz and Ricardo J. Dias and David R.
Cheriton",
title = "Efficient Correction of Anomalies in Snapshot
Isolation Transactions",
journal = j-TACO,
volume = "11",
number = "4",
pages = "65:1--65:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2693260",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Transactional memory systems providing snapshot
isolation enable concurrent access to shared data
without incurring aborts on read-write conflicts.
Reducing aborts is extremely relevant as it leads to
higher concurrency, greater performance, and better
predictability. Unfortunately, snapshot isolation does
not provide serializability as it allows certain
anomalies that can lead to subtle consistency
violations. While some mechanisms have been proposed to
verify the correctness of a program utilizing snapshot
isolation transactions, it remains difficult to repair
incorrect applications. To reduce the programmer's
burden in this case, we present a technique based on
dynamic code and graph dependency analysis that
automatically corrects existing snapshot isolation
anomalies in transactional memory programs. Our
evaluation shows that corrected applications retain the
performance benefits characteristic of snapshot
isolation over conventional transactional memory
systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "65",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bahmann:2015:PRC,
author = "Helge Bahmann and Nico Reissmann and Magnus Jahre and
Jan Christian Meyer",
title = "Perfect Reconstructability of Control Flow from Demand
Dependence Graphs",
journal = j-TACO,
volume = "11",
number = "4",
pages = "66:1--66:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2693261",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Demand-based dependence graphs (DDGs), such as the
(Regionalized) Value State Dependence Graph ((R)VSDG),
are intermediate representations (IRs) well suited for
a wide range of program transformations. They
explicitly model the flow of data and state, and only
implicitly represent a restricted form of control flow.
These features make DDGs especially suitable for
automatic parallelization and vectorization, but cannot
be leveraged by practical compilers without efficient
construction and destruction algorithms. Construction
algorithms remodel the arbitrarily complex control flow
of a procedure to make it amenable to DDG
representation, whereas destruction algorithms
reestablish control flow for generating efficient
object code. Existing literature presents solutions to
both problems, but these impose structural constraints
on the generatable control flow, and omit qualitative
evaluation. The key contribution of this article is to
show that there is no intrinsic structural limitation
in the control flow directly extractable from RVSDGs.
This fundamental result originates from an
interpretation of loop repetition and decision
predicates as computed continuations, leading to the
introduction of the predicate continuation normal form.
We provide an algorithm for constructing RVSDGs in
predicate continuation form, and propose a novel
destruction algorithm for RVSDGs in this form. Our
destruction algorithm can generate arbitrarily complex
control flow; we show this by proving that the original
CFG an RVSDG was derived from can, apart from
overspecific detail, be reconstructed perfectly.
Additionally, we prove termination and correctness of
these algorithms. Furthermore, we empirically evaluate
the performance, the representational overhead at
compile time, and the reduction in branch instructions
compared to existing solutions. In contrast to previous
work, our algorithms impose no additional overhead on
the control flow of the produced object code. To our
knowledge, this is the first scheme that allows the
original control flow of a procedure to be recovered
from a DDG representation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "66",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Elango:2015:URM,
author = "Venmugil Elango and Naser Sedaghati and Fabrice
Rastello and Louis-No{\"e}l Pouchet and J. Ramanujam
and Radu Teodorescu and P. Sadayappan",
title = "On Using the Roofline Model with Lower Bounds on Data
Movement",
journal = j-TACO,
volume = "11",
number = "4",
pages = "67:1--67:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2693656",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The roofline model is a popular approach for ``bound
and bottleneck'' performance analysis. It focuses on
the limits to the performance of processors because of
limited bandwidth to off-chip memory. It models upper
bounds on performance as a function of operational
intensity, the ratio of computational operations per
byte of data moved from/to memory. While operational
intensity can be directly measured for a specific
implementation of an algorithm on a particular target
platform, it is of interest to obtain broader insights
on bottlenecks, where various semantically equivalent
implementations of an algorithm are considered, along
with analysis for variations in architectural
parameters. This is currently very cumbersome and
requires performance modeling and analysis of many
variants. In this article, we address this problem by
using the roofline model in conjunction with upper
bounds on the operational intensity of computations as
a function of cache capacity, derived from lower bounds
on data movement. This enables bottleneck analysis that
holds across all dependence-preserving semantically
equivalent implementations of an algorithm. We
demonstrate the utility of the approach in assessing
fundamental limits to performance and energy efficiency
for several benchmark algorithms across a design space
of architectural variations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "67",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Anonymous:2015:LDR,
author = "Anonymous",
title = "List of Distinguished Reviewers {ACM TACO 2014}",
journal = j-TACO,
volume = "11",
number = "4",
pages = "68:1--68:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2714082",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "68",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zimmer:2015:NSM,
author = "Christopher Zimmer and Frank Mueller",
title = "{NoCMsg}: a Scalable Message-Passing Abstraction for
Network-on-Chips",
journal = j-TACO,
volume = "12",
number = "1",
pages = "1:1--1:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2701426",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Apr 16 18:39:56 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The number of cores of contemporary processors is
constantly increasing and thus continues to deliver
ever higher peak performance (following Moore's
transistor law). Yet high core counts present a
challenge to hardware and software alike. Following
this trend, the network-on-chip (NoC) topology has
changed from buses over rings and fully connected
meshes to 2D meshes. This work contributes NoCMsg, a
low-level message-passing abstraction over NoCs, which
is specifically designed for large core counts in 2D
meshes. NoCMsg ensures deadlock-free messaging for
wormhole Manhattan-path routing over the NoC via a
polling-based message abstraction and
non--flow-controlled communication for selective
communication patterns. Experimental results on the
TilePro hardware platform show that NoCMsg can
significantly reduce communication times by up to 86\%
for single packet messages and up to 40\% for larger
messages compared to other NoC-based message
approaches. On the TilePro platform, NoCMsg outperforms
shared memory abstractions by up to 93\% as core counts
and interprocess communication increase. Results for
fully pipelined double-precision numerical codes show
speedups of up to 64\% for message passing over shared
memory at 32 cores. Overall, we observe that shared
memory scales up to about 16 cores on this platform,
whereas message passing performs well beyond that
threshold. These results generalize to similar
NoC-based platforms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Grigorian:2015:ADA,
author = "Beayna Grigorian and Glenn Reinman",
title = "Accelerating Divergent Applications on {SIMD}
Architectures Using Neural Networks",
journal = j-TACO,
volume = "12",
number = "1",
pages = "2:1--2:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2717311",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Apr 16 18:39:56 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The purpose of this research is to find a
neural-network-based solution to the well-known problem
of branch divergence in Single Instruction Multiple
Data (SIMD) architectures. Our approach differs from
existing techniques that handle branch (or
control-flow) divergence, which use costly hardware
modifications, low-utilization masking techniques, or
static prediction methods. As we examine divergent
applications, we characterize the degree of
data-dependent control flow seen in each and isolate
the code regions (or ``kernels'') that cause the most
performance degradation due to branch divergence. We
then train neural networks (NNs) offline to approximate
these kernels and inject the NN computations directly
into the applications as substitutes for the kernels
they approximate. This essentially translates control
flow into nondivergent computation, trading off
precision for performance. As our methodology
manipulates application source code directly, it is
inherently platform agnostic and can be adopted as a
general means for accelerating divergent applications
on data-parallel architectures. In this article, we
present the Neuralizer, an automated software flow for
kernel identification, NN training, and NN integration,
as well as supplementary user-controlled optimization
techniques. Evaluating our approach on a variety of
divergent applications run on a Graphics Processing
Unit (GPU), we on average achieve performance gains of
13.6 $ \times $ and energy savings of 14.8 $ \times $
with 96\% accuracy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Holey:2015:PEC,
author = "Anup Holey and Vineeth Mekkat and Pen-Chung Yew and
Antonia Zhai",
title = "Performance-Energy Considerations for Shared Cache
Management in a Heterogeneous Multicore Processor",
journal = j-TACO,
volume = "12",
number = "1",
pages = "3:1--3:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2710019",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Apr 16 18:39:56 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Heterogeneous multicore processors that integrate CPU
cores and data-parallel accelerators such as graphic
processing unit (GPU) cores onto the same die raise
several new issues for sharing various on-chip
resources. The shared last-level cache (LLC) is one of
the most important shared resources due to its impact
on performance. Accesses to the shared LLC in
heterogeneous multicore processors can be dominated by
the GPU due to the significantly higher number of
concurrent threads supported by the architecture. Under
current cache management policies, the CPU
applications' share of the LLC can be significantly
reduced in the presence of competing GPU applications.
For many CPU applications, a reduced share of the LLC
could lead to significant performance degradation. On
the contrary, GPU applications can tolerate increase in
memory access latency when there is sufficient
thread-level parallelism (TLP). In addition to the
performance challenge, introduction of diverse cores
onto the same die changes the energy consumption
profile and, in turn, affects the energy efficiency of
the processor. In this work, we propose heterogeneous
LLC management (HeLM), a novel shared LLC management
policy that takes advantage of the GPU's tolerance for
memory access latency. HeLM is able to throttle GPU LLC
accesses and yield LLC space to cache-sensitive CPU
applications. This throttling is achieved by allowing
GPU accesses to bypass the LLC when an increase in
memory access latency can be tolerated. The latency
tolerance of a GPU application is determined by the
availability of TLP, which is measured at runtime as
the average number of threads that are available for
issuing. For a baseline configuration with two CPU
cores and four GPU cores, modeled after existing
heterogeneous processor designs, HeLM outperforms least
recently used (LRU) policy by 10.4\%. Additionally,
HeLM also outperforms competing policies. Our
evaluations show that HeLM is able to sustain
performance with varying core mix. In addition to the
performance benefit, bypassing also reduces total
accesses to the LLC, leading to a reduction in the
energy consumption of the LLC module. However, LLC
bypassing has the potential to increase off-chip
bandwidth utilization and DRAM energy consumption. Our
experiments show that HeLM exhibits better energy
efficiency by reducing the ED$^2$ by 18\% over LRU
while impacting only a 7\% increase in off-chip
bandwidth utilization.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Suh:2015:DMR,
author = "Jinho Suh and Chieh-Ting Huang and Michel Dubois",
title = "Dynamic {MIPS} Rate Stabilization for Complex
Processors",
journal = j-TACO,
volume = "12",
number = "1",
pages = "4:1--4:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2714575",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Apr 16 18:39:56 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Modern microprocessor cores reach their high
performance levels with the help of high clock rates,
parallel and speculative execution of a large number of
instructions, and vast cache hierarchies. Modern cores
also have adaptive features to regulate power and
temperature and avoid thermal emergencies. All of these
features contribute to highly unpredictable execution
times. In this article, we demonstrate that the
execution time of in-order (IO), out-of-order (OoO),
and OoO simultaneous multithreaded processors can be
stable and predictable by stabilizing their mega
instructions executed per second (MIPS) rate via a
proportional, integral, and differential (PID) gain
feedback controller and dynamic voltage and frequency
scaling (DVFS). Processor cores in idle cycles are
continuously consuming power, which is highly
undesirable in systems, especially in real-time
systems. In addition to meeting deadlines in real-time
systems, our MIPS rate stabilization framework can be
applied on top of it to reduce power and energy by
avoiding idle cycles. If processors are equipped with
MIPS rate stabilization, the execution time can be
predicted. Because the MIPS rate remains steady, a
stabilized processor meets deadlines on time in
real-time systems or in systems with quality-of-service
execution latency requirements at the lowest possible
frequency. To demonstrate and evaluate this capability,
we have selected a subset of the MiBench benchmarks
with the widest execution rate variations. We stabilize
their MIPS rate on a 1GHz Pentium III--like OoO
single-thread microarchitecture, a 1.32GHz
StrongARM-like IO microarchitecture, and the 1GHz OoO
processor augmented with two-way and four-way
simultaneous multithreading. Both IO and OoO cores can
take advantage of the stabilization framework, but the
energy per instruction of the stabilized OoO core is
less because it runs at a lower frequency to meet the
same deadlines. The MIPS rate stabilization of complex
processors using a PID feedback control loop is a
general technique applicable to environments in which
lower power or energy coupled with steady, predictable
performance are desirable, although we target more
specifically real-time systems in this article.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Karimi:2015:MMA,
author = "Naghmeh Karimi and Arun Karthik Kanuparthi and Xueyang
Wang and Ozgur Sinanoglu and Ramesh Karri",
title = "{MAGIC}: Malicious Aging in Circuits\slash Cores",
journal = j-TACO,
volume = "12",
number = "1",
pages = "5:1--5:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2724718",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Apr 16 18:39:56 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The performance of an IC degrades over its lifetime,
ultimately resulting in IC failure. In this article, we
present a hardware attack (called MAGIC) to maliciously
accelerate NBTI aging effects in cores. In this attack,
we identify the input patterns that maliciously age the
pipestages of a core. We then craft a program that
generates these patterns at the inputs of the targeted
pipestage. We demonstrate the MAGIC-based attack on the
OpenSPARC processor. Executing this program
dramatically accelerates the aging process and degrades
the processor's performance by 10.92\% in 1 month,
bypassing existing aging mitigation and timing-error
correction schemes. We also present two low-cost
techniques to thwart the proposed attack.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{DeOliveiraCastro:2015:CLB,
author = "Pablo {De Oliveira Castro} and Chadi Akel and Eric
Petit and Mihail Popov and William Jalby",
title = "{CERE}: {LLVM}-Based {Codelet Extractor and REplayer}
for Piecewise Benchmarking and Optimization",
journal = j-TACO,
volume = "12",
number = "1",
pages = "6:1--6:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2724717",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Apr 16 18:39:56 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article presents Codelet Extractor and REplayer
(CERE), an open-source framework for code isolation.
CERE finds and extracts the hotspots of an application
as isolated fragments of code, called codelets.
Codelets can be modified, compiled, run, and measured
independently from the original application. Code
isolation reduces benchmarking cost and allows
piecewise optimization of an application. Unlike
previous approaches, CERE isolates codes at the
compiler Intermediate Representation (IR) level.
Therefore CERE is language agnostic and supports many
input languages such as C, C++, Fortran, and D. CERE
automatically detects codelets invocations that have
the same performance behavior. Then, it selects a
reduced set of representative codelets and invocations,
much faster to replay, which still captures accurately
the original application. In addition, CERE supports
recompiling and retargeting the extracted codelets.
Therefore, CERE can be used for cross-architecture
performance prediction or piecewise code optimization.
On the SPEC 2006 FP benchmarks, CERE codelets cover
90.9\% and accurately replay 66.3\% of the execution
time. We use CERE codelets in a realistic study to
evaluate three different architectures on the NAS
benchmarks. CERE accurately estimates each architecture
performance and is 7.3 $ \times $ to 46.6 $ \times $
cheaper than running the full benchmark.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gaster:2015:HRA,
author = "Benedict R. Gaster and Derek Hower and Lee Howes",
title = "{HRF}-Relaxed: Adapting {HRF} to the Complexities of
Industrial Heterogeneous Memory Models",
journal = j-TACO,
volume = "12",
number = "1",
pages = "7:1--7:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2701618",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Apr 16 18:39:56 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Memory consistency models, or memory models, allow
both programmers and program language implementers to
reason about concurrent accesses to one or more memory
locations. Memory model specifications balance the
often conflicting needs for precise semantics,
implementation flexibility, and ease of understanding.
Toward that end, popular programming languages like
Java, C, and C++ have adopted memory models built on
the conceptual foundation of Sequential Consistency for
Data-Race-Free programs (SC for DRF). These SC for DRF
languages were created with general-purpose homogeneous
CPU systems in mind, and all assume a single, global
memory address space. Such a uniform address space is
usually power and performance prohibitive in
heterogeneous Systems on Chips (SoCs), and for that
reason most heterogeneous languages have adopted split
address spaces and operations with nonglobal
visibility. There have recently been two attempts to
bridge the disconnect between the CPU-centric
assumptions of the SC for DRF framework and the
realities of heterogeneous SoC architectures. Hower et
al. proposed a class of Heterogeneous-Race-Free (HRF)
memory models that provide a foundation for
understanding many of the issues in heterogeneous
memory models. At the same time, the Khronos Group
developed the OpenCL 2.0 memory model that builds on
the C++ memory model. The OpenCL 2.0 model includes
features not addressed by HRF: primarily support for
relaxed atomics and a property referred to as scope
inclusion. In this article, we generalize HRF to allow
formalization of and reasoning about more complicated
models using OpenCL 2.0 as a point of reference. With
that generalization, we (1) make the OpenCL 2.0 memory
model more accessible by introducing a platform for
feature comparisons to other models, (2) consider a
number of shortcomings in the current OpenCL 2.0 model,
and (3) propose changes that could be adopted by future
OpenCL 2.0 revisions or by other, related, models.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Streit:2015:GTP,
author = "Kevin Streit and Johannes Doerfert and Clemens
Hammacher and Andreas Zeller and Sebastian Hack",
title = "Generalized Task Parallelism",
journal = j-TACO,
volume = "12",
number = "1",
pages = "8:1--8:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2723164",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Apr 16 18:39:56 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Existing approaches to automatic parallelization
produce good results in specific domains. Yet, it is
unclear how to integrate their individual strengths to
match the demands and opportunities of complex
software. This lack of integration has both practical
reasons, as integrating those largely differing
approaches into one compiler would impose an
engineering hell, as well as theoretical reasons, as no
joint cost model exists that would drive the choice
between parallelization methods. By reducing the
problem of generating parallel code from a program
dependence graph to integer linear programming, {\em
generalized task parallelization\/} integrates central
aspects of existing parallelization approaches into a
single unified framework. Implemented on top of LLVM,
the framework seamlessly integrates enabling
technologies such as speculation, privatization, and
the realization of reductions. Evaluating our
implementation on various C programs from different
domains, we demonstrate the effectiveness and
generality of generalized task parallelization. On a
quad-core machine with hyperthreading we achieve
speedups of up to $ 4.6 \times $.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tabkhi:2015:JSH,
author = "Hamed Tabkhi and Gunar Schirner",
title = "A Joint {SW\slash HW} Approach for Reducing Register
File Vulnerability",
journal = j-TACO,
volume = "12",
number = "2",
pages = "9:1--9:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2733378",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 7 09:46:00 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The Register File (RF) is a particularly vulnerable
component within processor core and at the same time a
hotspot with high power density. To reduce RF
vulnerability, conventional HW-only approaches such as
Error Correction Codes (ECCs) or modular redundancies
are not suitable due to their significant power
overhead. Conversely, SW-only approaches either have
limited improvement on RF reliability or require
considerable performance overhead. As a result, new
approaches are needed that reduce RF vulnerability with
minimal power and performance overhead. This article
introduces Application-guided Reliability-enhanced
Register file Architecture (ARRA), a novel approach to
reduce RF vulnerability of embedded processors. Taking
advantage of uneven register utilization, ARRA mirrors,
guided by a SW instrumentation, frequently used active
registers into passive registers. ARRA is particularly
suitable for control applications, as they have a high
reliability demand with fairly low (uneven) RF
utilization. ARRA is a cross-layer joint HW/SW approach
based on an ARRA-extended RF microarchitecture, an ISA
extension, as well as static binary analysis and
instrumentation. We evaluate ARRA benefits using an
ARRA-enhanced Blackfin processor executing a set of
DSPBench and MiBench benchmarks. We quantify the
benefits using RF Vulnerability Factor (RFVF) and Mean
Work To Failure (MWTF). ARRA significantly reduces RFVF
from 35\% to 6.9\% in cost of 0.5\% performance lost
for control applications. With ARRA's register
mirroring, it can also correct Multiple Bit Upsets
(MBUs) errors, achieving an 8x increase in MWTF.
Compared to a partially ECC-protected RF approach, ARRA
demonstrates higher efficiency by achieving comparable
vulnerability reduction at much lower power
consumption.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kanuparthi:2015:RIC,
author = "Arun Kanuparthi and Ramesh Karri",
title = "Reliable Integrity Checking in Multicore Processors",
journal = j-TACO,
volume = "12",
number = "2",
pages = "10:1--10:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2738052",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 7 09:46:00 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Security and reliability have become important
concerns in the design of computer systems. On one
hand, microarchitectural enhancements for security
(such as for dynamic integrity checking of code at
runtime) have been proposed. On the other hand,
independently, microarchitectural enhancements for
reliability to detect and tolerate natural faults have
also been proposed. A fault in these security
enhancements due to alpha particles or aging might
potentially pass off maliciously modified instructions
as safe, rendering the security enhancements useless.
Deliberate fault attacks by attackers can be launched
to disable the security enhancements and then launch
the well-known security attacks that would otherwise
have been detected by these enhancements. We report an
integrated microarchitecture support for security and
reliability in multicore processors. Specifically, we
add integrity checkers to protect the code running on
the multiple cores in a multicore processor. We then
adapt these checkers to check one another periodically
to ensure reliable operation. These checkers naturally
can check the other parts of the core. The average
performance, power, and area costs for these
security-reliability enhancements are 6.42\%, 0.73\%,
and 0.53\%, respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lee:2015:NMD,
author = "Do-Heon Lee and Su-Kyung Yoon and Jung-Geun Kim and
Charles C. Weems and Shin-Dug Kim",
title = "A New Memory-Disk Integrated System with {HW}
Optimizer",
journal = j-TACO,
volume = "12",
number = "2",
pages = "11:1--11:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2738053",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 7 09:46:00 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Current high-performance computer systems utilize a
memory hierarchy of on-chip cache, main memory, and
secondary storage due to differences in device
characteristics. Limiting the amount of main memory
causes page swap operations and duplicates data between
the main memory and the storage device. The
characteristics of next-generation memory, such as
nonvolatility, byte addressability, and scaling to
greater capacity, can be used to solve these problems.
Simple replacement of secondary storage with new forms
of nonvolatile memory in a traditional memory hierarchy
still causes typical problems, such as memory
bottleneck, page swaps, and write overhead. Thus, we
suggest a single architecture that merges the main
memory and secondary storage into a system called a
Memory-Disk Integrated System (MDIS). The MDIS
architecture is composed of a virtually decoupled NVRAM
and a nonvolatile memory performance optimizer
combining hardware and software to support this system.
The virtually decoupled NVRAM module can support
conventional main memory and disk storage operations
logically without data duplication and can reduce write
operations to the NVRAM. To increase the lifetime and
optimize the performance of this NVRAM, another
hardware module called a Nonvolatile Performance
Optimizer (NVPO) is used that is composed of four small
buffers. The NVPO exploits spatial and temporal
characteristics of static/dynamic data based on program
execution characteristics. Enhanced virtual memory
management and address translation modules in the
operating system can support these hardware components
to achieve a seamless memory-storage environment. Our
experimental results show that the proposed
architecture can improve execution time by about 89\%
over a conventional DRAM main memory/HDD storage
system, and 77\% over a state-of-the-art PRAM main
memory/HDD disk system with DRAM buffer. Also, the
lifetime of the virtually decoupled NVRAM is estimated
to be 40\% longer than that of a traditional hierarchy
based on the same device technology.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kafshdooz:2015:DSS,
author = "Morteza Mohajjel Kafshdooz and Alireza Ejlali",
title = "Dynamic Shared {SPM} Reuse for Real-Time Multicore
Embedded Systems",
journal = j-TACO,
volume = "12",
number = "2",
pages = "12:1--12:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2738051",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 7 09:46:00 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Allocating the scratchpad memory (SPM) space to tasks
is a challenging problem in real-time multicore
embedded systems that use shared SPM. Proper SPM space
allocation is important, as it considerably influences
the application worst-case execution time (WCET), which
is of great importance in real-time applications. To
address this problem, in this article we present a
dynamic SPM reuse scheme, where SPM space can be reused
by other tasks during runtime without requiring any
static SPM partitioning. Although the proposed scheme
is applied dynamically at runtime, the required
decision making is fairly complex and hence cannot be
performed at runtime. We have developed techniques to
perform the decision making offline at design time in
the form of optimization problems combined with task
scheduling/mapping. The proposed work is unlike
previous works that either exploit static schemes for
SPM space allocation or perform task scheduling/mapping
and SPM space allocation incoherently. The experimental
results show that our dynamic SPM reuse scheme can
reduce WCET by up to 55\% as compared to recent
previous works on SPM allocation in real-time multicore
embedded systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jia:2015:GPP,
author = "Wenhao Jia and Elba Garza and Kelly A. Shaw and
Margaret Martonosi",
title = "{GPU} Performance and Power Tuning Using Regression
Trees",
journal = j-TACO,
volume = "12",
number = "2",
pages = "13:1--13:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2736287",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 7 09:46:00 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "GPU performance and power tuning is difficult,
requiring extensive user expertise and time-consuming
trial and error. To accelerate design tuning,
statistical design space exploration methods have been
proposed. This article presents Starchart, a novel
design space partitioning tool that uses regression
trees to approach GPU tuning problems. Improving on
prior work, Starchart offers more automation in
identifying key design trade-offs and models design
subspaces with distinctly different behaviors.
Starchart achieves good model accuracy using very few
random samples: less than 0.3\% of a given design
space; iterative sampling can more quickly target
subspaces of interest.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pananilath:2015:OCG,
author = "Irshad Pananilath and Aravind Acharya and Vinay
Vasista and Uday Bondhugula",
title = "An Optimizing Code Generator for a Class of
Lattice-{Boltzmann} Computations",
journal = j-TACO,
volume = "12",
number = "2",
pages = "14:1--14:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2739047",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 7 09:46:00 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The Lattice-Boltzmann method (LBM), a promising new
particle-based simulation technique for complex and
multiscale fluid flows, has seen tremendous adoption in
recent years in computational fluid dynamics. Even with
a state-of-the-art LBM solver such as Palabos, a user
has to still manually write the program using
library-supplied primitives. We propose an automated
code generator for a class of LBM computations with the
objective to achieve high performance on modern
architectures. Few studies have looked at time tiling
for LBM codes. We exploit a key similarity between
stencils and LBM to enable polyhedral optimizations and
in turn time tiling for LBM. We also characterize the
performance of LBM with the Roofline performance model.
Experimental results for standard LBM simulations like
Lid Driven Cavity, Flow Past Cylinder, and Poiseuille
Flow show that our scheme consistently outperforms
Palabos-on average by up to $ 3 \times $ while running
on 16 cores of an Intel Xeon (Sandybridge). We also
obtain an improvement of $ 2.47 \times $ on the SPEC
LBM benchmark.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fang:2015:PIO,
author = "Shuangde Fang and Wenwen Xu and Yang Chen and Lieven
Eeckhout and Olivier Temam and Yunji Chen and Chengyong
Wu and Xiaobing Feng",
title = "Practical Iterative Optimization for the Data Center",
journal = j-TACO,
volume = "12",
number = "2",
pages = "15:1--15:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2739048",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 7 09:46:00 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Iterative optimization is a simple but powerful
approach that searches the best possible combination of
compiler optimizations for a given workload. However,
iterative optimization is plagued by several practical
issues that prevent it from being widely used in
practice: a large number of runs are required to find
the best combination, the optimum combination is
dataset dependent, and the exploration process incurs
significant overhead that needs to be compensated for
by performance benefits. Therefore, although iterative
optimization has been shown to have a significant
performance potential, it seldom is used in production
compilers. In this article, we propose iterative
optimization for the data center (IODC): we show that
the data center offers a context in which all of the
preceding hurdles can be overcome. The basic idea is to
spawn different combinations across workers and
recollect performance statistics at the master, which
then evolves to the optimum combination of compiler
optimizations. IODC carefully manages costs and
benefits, and it is transparent to the end user. To
bring IODC to practice, we evaluate it in the presence
of co-runners to better reflect real-life data center
operation with multiple applications co-running per
server. We enhance IODC with the capability to find
compatible co-runners along with a mechanism to
dynamically adjust the level of aggressiveness to
improve its robustness in the presence of co-running
applications. We evaluate IODC using both MapReduce and
compute-intensive throughput server applications. To
reflect the large number of users interacting with the
system, we gather a very large collection of datasets
(up to hundreds of millions of unique datasets per
program), for a total storage of 16.4TB and 850 days of
CPU time. We report an average performance improvement
of $ 1.48 \times $ and up to $ 2.08 \times $ for five
MapReduce applications, and $ 1.12 \times $ and up to $
1.39 \times $ for nine server applications.
Furthermore, our experiments demonstrate that IODC is
effective in the presence of co-runners, improving
performance by greater than 13\% compared to the worst
possible co-runner schedule.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2015:BSS,
author = "Tao Zhang and Naifeng Jing and Kaiming Jiang and Wei
Shu and Min-You Wu and Xiaoyao Liang",
title = "{Buddy SM}: Sharing Pipeline Front-End for Improved
Energy Efficiency in {GPGPUs}",
journal = j-TACO,
volume = "12",
number = "2",
pages = "16:1--16:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2744202",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 7 09:46:00 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A modern general-purpose graphics processing unit
(GPGPU) usually consists of multiple streaming
multiprocessors (SMs), each having a pipeline that
incorporates a group of threads executing a common
instruction flow. Although SMs are designed to work
independently, we observe that they tend to exhibit
very similar behavior for many workloads. If multiple
SMs can be grouped and work in the lock-step manner, it
is possible to save energy by sharing the front-end
units among multiple SMs, including the instruction
fetch, decode, and schedule components. However, such
sharing brings architectural challenges and sometime
causes performance degradation. In this article, we
show our design, implementation, and evaluation for
such an architecture, which we call Buddy SM.
Specifically, multiple SMs can be opportunistically
grouped into a buddy cluster. One SM becomes the
master, and the rest become the slaves. The front-end
unit of the master works actively for itself as well as
for the slaves, whereas the front-end logics of the
slaves are power gated. For efficient flow control and
program correctness, the proposed architecture can
identify unfavorable conditions and ungroup the buddy
cluster when necessary. We analyze various techniques
to improve the performance and energy efficiency of
Buddy SM. Detailed experiments manifest that 37.2\%
front-end and 7.5\% total GPU energy reduction can be
achieved.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cheng:2015:ECS,
author = "Hsiang-Yun Cheng and Matt Poremba and Narges Shahidi
and Ivan Stalev and Mary Jane Irwin and Mahmut Kandemir
and Jack Sampson and Yuan Xie",
title = "{EECache}: a Comprehensive Study on the Architectural
Design for Energy-Efficient Last-Level Caches in Chip
Multiprocessors",
journal = j-TACO,
volume = "12",
number = "2",
pages = "17:1--17:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2756552",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 7 09:46:00 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Power management for large last-level caches (LLCs) is
important in chip multiprocessors (CMPs), as the
leakage power of LLCs accounts for a significant
fraction of the limited on-chip power budget. Since not
all workloads running on CMPs need the entire cache,
portions of a large, shared LLC can be disabled to save
energy. In this article, we explore different design
choices, from circuit-level cache organization to
microarchitectural management policies, to propose a
low-overhead runtime mechanism for energy reduction in
the large, shared LLC. We first introduce a slice-based
cache organization that can shut down parts of the
shared LLC with minimal circuit overhead. Based on this
slice-based organization, part of the shared LLC can be
turned off according to the spatial and temporal cache
access behavior captured by low-overhead sampling-based
hardware. In order to eliminate the performance
penalties caused by flushing data before powering off a
cache slice, we propose data migration policies to
prevent the loss of useful data in the LLC. Results
show that our energy-efficient cache design (EECache)
provides 14.1\% energy savings at only 1.2\%
performance degradation and consumes negligible
hardware overhead compared to prior work.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Suresh:2015:IFM,
author = "Arjun Suresh and Bharath Narasimha Swamy and Erven
Rohou and Andr{\'e} Seznec",
title = "Intercepting Functions for Memoization: a Case Study
Using Transcendental Functions",
journal = j-TACO,
volume = "12",
number = "2",
pages = "18:1--18:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2751559",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 7 09:46:00 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Memoization is the technique of saving the results of
executions so that future executions can be omitted
when the input set repeats. Memoization has been
proposed in previous literature at the instruction,
basic block, and function levels using hardware, as
well as pure software-level approaches including
changes to programming language. In this article, we
focus on software memoization for procedural languages
such as C and Fortran at the granularity of a function.
We propose a simple linker-based technique for enabling
software memoization of any dynamically linked pure
function by function interception and illustrate our
framework using a set of computationally expensive pure
functions-the transcendental functions. Transcendental
functions are those that cannot be expressed in terms
of a finite sequence of algebraic operations
(trigonometric functions, exponential functions, etc.)
and hence are computationally expensive. Our technique
does not need the availability of source code and thus
can even be applied to commercial applications, as well
as applications with legacy codes. As far as users are
concerned, enabling memoization is as simple as setting
an environment variable. Our framework does not make
any specific assumptions about the underlying
architecture or compiler toolchains and can work with a
variety of current architectures. We present
experimental results for a x86-64 platform using both
gcc and icc compiler toolchains, and an ARM Cortex-A9
platform using gcc. Our experiments include a mix of
real-world programs and standard benchmark suites: SPEC
and Splash2x. On standard benchmark applications that
extensively call the transcendental functions, we
report memoization benefits of up to 50\% on Intel Ivy
Bridge and up to 10\% on ARM Cortex-A9. Memoization was
able to regain a performance loss of 76\% in bwaves due
to a known performance bug in the GNU implementation of
the pow function. The same benchmark on ARM Cortex-A9
benefited by more than 200\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lin:2015:SSE,
author = "Chung-Hsiang Lin and De-Yu Shen and Yi-Jung Chen and
Chia-Lin Yang and Cheng-Yuan Michael Wang",
title = "{SECRET}: a Selective Error Correction Framework for
Refresh Energy Reduction in {DRAMs}",
journal = j-TACO,
volume = "12",
number = "2",
pages = "19:1--19:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2747876",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 7 09:46:00 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "DRAMs are used as the main memory in most computing
systems today. Studies show that DRAMs contribute to a
significant part of overall system power consumption.
One of the main challenges in low-power DRAM design is
the inevitable refresh process. Due to process
variation, memory cells exhibit retention time
variations. Current DRAMs use a single refresh period
determined by the cell with the largest leakage. Since
prolonging refresh intervals introduces retention
errors, a set of previous works adopt conventional
error-correcting code (ECC) to correct retention
errors. However, these approaches introduce significant
area and energy overheads. In this article, we propose
a novel error correction framework for retention errors
in DRAMs, called SECRET (selective error correction for
refresh energy reduction). The key observations we make
are that retention errors are hard errors rather than
soft errors, and only few DRAM cells have large
leakage. Therefore, instead of equipping error
correction capability for all memory cells as existing
ECC schemes, we only allocate error correction
information to leaky cells under a refresh interval.
Our SECRET framework contains two parts: an offline
phase to identify memory cells with retention errors
given a target error rate and a low-overhead error
correction mechanism. The experimental results show
that among all test cases performed, the proposed
SECRET framework can reduce refresh power by 87.2\% and
overall DRAM power up to 18.57\% with negligible area
and performance overheads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Simon:2015:STH,
author = "Doug Simon and Christian Wimmer and Bernhard Urban and
Gilles Duboscq and Lukas Stadler and Thomas
W{\"u}rthinger",
title = "Snippets: Taking the High Road to a Low Level",
journal = j-TACO,
volume = "12",
number = "2",
pages = "20:1--20:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2764907",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 7 09:46:00 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "When building a compiler for a high-level language,
certain intrinsic features of the language must be
expressed in terms of the resulting low-level
operations. Complex features are often expressed by
explicitly weaving together bits of low-level IR, a
process that is tedious, error prone, difficult to
read, difficult to reason about, and machine dependent.
In the Graal compiler for Java, we take a different
approach: we use snippets of Java code to express
semantics in a high-level, architecture-independent
way. Two important restrictions make snippets feasible
in practice: they are compiler specific, and they are
explicitly prepared and specialized. Snippets make
Graal simpler and more portable while still capable of
generating machine code that can compete with other
compilers of the Java HotSpot VM.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Balasubramanian:2015:EGL,
author = "Raghuraman Balasubramanian and Vinay Gangadhar and
Ziliang Guo and Chen-Han Ho and Cherin Joseph and
Jaikrishnan Menon and Mario Paulo Drumond and Robin
Paul and Sharath Prasad and Pradip Valathol and
Karthikeyan Sankaralingam",
title = "Enabling {GPGPU} Low-Level Hardware Explorations with
{MIAOW}: an Open-Source {RTL} Implementation of a
{GPGPU}",
journal = j-TACO,
volume = "12",
number = "2",
pages = "21:1--21:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2764908",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 7 09:46:00 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Graphic processing unit (GPU)-based general-purpose
computing is developing as a viable alternative to
CPU-based computing in many domains. Today's tools for
GPU analysis include simulators like GPGPU-Sim,
Multi2Sim, and Barra. While useful for modeling
first-order effects, these tools do not provide a
detailed view of GPU microarchitecture and physical
design. Further, as GPGPU research evolves, design
ideas and modifications demand detailed estimates of
impact on overall area and power. Fueled by this need,
we introduce MIAOW (Many-core Integrated Accelerator Of
Wisconsin), an open-source RTL implementation of the
AMD Southern Islands GPGPU ISA, capable of running
unmodified OpenCL-based applications. We present our
design motivated by our goals to create a realistic,
flexible, OpenCL-compatible GPGPU, capable of emulating
a full system. We first explore if MIAOW is realistic
and then use four case studies to show that MIAOW
enables the following: physical design perspective to
``traditional'' microarchitecture, new types of
research exploration, and validation/calibration of
simulator-based characterization of hardware. The
findings and ideas are contributions in their own
right, in addition to MIAOW's utility as a tool for
others' research.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2015:LAW,
author = "Quan Chen and Minyi Guo",
title = "Locality-Aware Work Stealing Based on Online Profiling
and Auto-Tuning for Multisocket Multicore
Architectures",
journal = j-TACO,
volume = "12",
number = "2",
pages = "22:1--22:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2766450",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 7 09:46:00 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Modern mainstream powerful computers adopt multisocket
multicore CPU architecture and NUMA-based memory
architecture. While traditional work-stealing
schedulers are designed for single-socket
architectures, they incur severe shared cache misses
and remote memory accesses in these computers. To solve
the problem, we propose a locality-aware work-stealing
(LAWS) scheduler, which better utilizes both the shared
cache and the memory system. In LAWS, a load-balanced
task allocator is used to evenly split and store the
dataset of a program to all the memory nodes and
allocate a task to the socket where the local memory
node stores its data for reducing remote memory
accesses. Then, an adaptive DAG packer adopts an
auto-tuning approach to optimally pack an execution DAG
into cache-friendly subtrees. After cache-friendly
subtrees are created, every socket executes
cache-friendly subtrees sequentially for optimizing
shared cache usage. Meanwhile, a triple-level
work-stealing scheduler is applied to schedule the
subtrees and the tasks in each subtree. Through
theoretical analysis, we show that LAWS has comparable
time and space bounds compared with traditional
work-stealing schedulers. Experimental results show
that LAWS can improve the performance of memory-bound
programs up to 54.2\% on AMD-based experimental
platforms and up to 48.6\% on Intel-based experimental
platforms compared with traditional work-stealing
schedulers.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "22",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Das:2015:SBP,
author = "Madan Das and Gabriel Southern and Jose Renau",
title = "Section-Based Program Analysis to Reduce Overhead of
Detecting Unsynchronized Thread Communication",
journal = j-TACO,
volume = "12",
number = "2",
pages = "23:1--23:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2766451",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 7 09:46:00 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Most systems that test and verify parallel programs,
such as deterministic execution engines, data race
detectors, and software transactional memory systems,
require instrumenting loads and stores in an
application. This can cause a very significant runtime
and memory overhead compared to executing
uninstrumented code. Multithreaded programming
typically allows any thread to perform loads and stores
to any location in the process's address space
independently, and such tools monitor all these memory
accesses. However, many of the addresses in these
unsynchronized memory accesses are only used by a
single thread and do not affect other executing
threads. We propose Section-Based Program Analysis
(SBPA), a novel way to decompose the program into
disjoint code sections to identify and eliminate
instrumenting such loads and stores during program
compilation so that the program runtime overhead is
significantly reduced. Our analysis includes
improvements to pointer analysis and uses a few user
directives to increase the effectiveness of SBPA
further. We implemented SBPA for a deterministic
execution runtime environment and were able to
eliminate 51\% of dynamic memory access
instrumentations. When combined with directives, such
reduction increased to 63\%. We also integrated SBPA
with ThreadSanitizer, a state-of-the-art dynamic race
detector, and achieved a speedup of 2.43 (2.74 with
directives) on a geometric mean basis.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lotfi:2015:AAC,
author = "Atieh Lotfi and Abbas Rahimi and Luca Benini and
Rajesh K. Gupta",
title = "Aging-Aware Compilation for {GP-GPUs}",
journal = j-TACO,
volume = "12",
number = "2",
pages = "24:1--24:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2778984",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 7 09:46:00 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "General-purpose graphic processing units (GP-GPUs)
offer high computational throughput using thousands of
integrated processing elements (PEs). These PEs are
stressed during workload execution, and negative bias
temperature instability (NBTI) adversely affects their
reliability by introducing new delay-induced faults.
However, the effect of these delay variations is not
uniformly spread across the PEs: some are affected more
--- hence less reliable --- than others. This variation
causes significant reduction in the lifetime of GP-GPU
parts. In this article, we address the problem of
``wear leveling'' across processing units to mitigate
lifetime uncertainty in GP-GPUs. We propose innovations
in the static compiled code that can improve healing in
PEs and stream cores (SCs) based on their degradation
status. PE healing is a fine-grained very long
instruction word (VLIW) slot assignment scheme that
balances the stress of instructions across the PEs
within an SC. SC healing is a coarse-grained workload
allocation scheme that distributes workload across SCs
in GP-GPUs. Both schemes share a common property: they
adaptively shift workload from less reliable units to
more reliable units, either spatially or temporally.
These software schemes are based on online calibration
with NBTI monitoring that equalizes the expected
lifetime of PEs and SCs by regenerating adaptive
compiled codes to respond to the specific health state
of the GP-GPUs. We evaluate the effectiveness of the
proposed schemes for various OpenCL kernels from the
AMD APP SDK on Evergreen and Southern Island GPU
architectures. The aging-aware healthy kernels
generated by the PE (or SC) healing scheme reduce
NBTI-induced voltage threshold shift by 30\% (77\% in
the case of SCs), with no (moderate) performance
penalty compared to the naive kernels.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Railing:2015:CEG,
author = "Brian P. Railing and Eric R. Hein and Thomas M.
Conte",
title = "{Contech}: Efficiently Generating Dynamic Task Graphs
for Arbitrary Parallel Programs",
journal = j-TACO,
volume = "12",
number = "2",
pages = "25:1--25:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2776893",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 7 09:46:00 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Parallel programs can be characterized by task graphs
encoding instructions, memory accesses, and the
parallel work's dependencies, while representing any
threading library and architecture. This article
presents Contech, a high performance framework for
generating dynamic task graphs from arbitrary parallel
programs, and a novel representation enabling
programmers and compiler optimizations to understand
and exploit program aspects. The Contech framework
supports a variety of languages (including C, C++, and
Fortran), parallelization libraries, and ISAs
(including x86 and ARM). Running natively for
collection speed and minimizing program perturbation,
the instrumentation shows $ 4 \times $ improvement over
a Pin-based implementation on PARSEC and NAS
benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "25",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Davari:2015:EGA,
author = "Mahdad Davari and Alberto Ros and Erik Hagersten and
Stefanos Kaxiras",
title = "The Effects of Granularity and Adaptivity on
Private\slash Shared Classification for Coherence",
journal = j-TACO,
volume = "12",
number = "3",
pages = "26:1--26:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2790301",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Oct 7 18:51:05 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Classification of data into private and shared has
proven to be a catalyst for techniques to reduce
coherence cost, since private data can be taken out of
coherence and resources can be concentrated on
providing coherence for shared data. In this article,
we examine how granularity-page-level versus cache-line
level-and adaptivity-going from shared to
private-affect the outcome of classification and its
final impact on coherence. We create a classification
technique, called Generational Classification, and a
coherence protocol called Generational Coherence, which
treats data as private or shared based on cache-line
generations. We compare two coherence protocols based
on self-invalidation/self-downgrade with respect to
data classification. Our findings are enlightening: (i)
Some programs benefit from finer granularity, some
benefit further from adaptivity, but some do not
benefit from either. (ii) Reducing the amount of shared
data has no perceptible impact on coherence misses
caused by self-invalidation of shared data, hence no
impact on performance. (iii) In contrast, classifying
more data as private has implications for protocols
that employ write-through as a means of self-downgrade,
resulting in network traffic reduction-up to 30\%-by
reducing write-through traffic.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "26",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gottscho:2015:DDP,
author = "Mark Gottscho and Abbas BanaiyanMofrad and Nikil Dutt
and Alex Nicolau and Puneet Gupta",
title = "{DPCS}: Dynamic Power\slash Capacity Scaling for
{SRAM} Caches in the Nanoscale Era",
journal = j-TACO,
volume = "12",
number = "3",
pages = "27:1--27:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2792982",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Oct 7 18:51:05 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Fault-Tolerant Voltage-Scalable (FTVS) SRAM cache
architectures are a promising approach to improve
energy efficiency of memories in the presence of
nanoscale process variation. Complex FTVS schemes are
commonly proposed to achieve very low minimum supply
voltages, but these can suffer from high overheads and
thus do not always offer the best power/capacity
trade-offs. We observe on our 45nm test chips that the
``fault inclusion property'' can enable lightweight
fault maps that support multiple runtime supply
voltages. Based on this observation, we propose a
simple and low-overhead FTVS cache architecture for
power/capacity scaling. Our mechanism combines
multilevel voltage scaling with optional architectural
support for power gating of blocks as they become
faulty at low voltages. A static (SPCS) policy sets the
runtime cache VDD once such that a only a few cache
blocks may be faulty in order to minimize the impact on
performance. We describe a Static Power/Capacity
Scaling (SPCS) policy and two alternate Dynamic
Power/Capacity Scaling (DPCS) policies that
opportunistically reduce the cache voltage even further
for more energy savings. This architecture achieves
lower static power for all effective cache capacities
than a recent more complex FTVS scheme. This is due to
significantly lower overheads, despite the inability of
our approach to match the min-VDD of the competing work
at a fixed target yield. Over a set of SPEC CPU2006
benchmarks on two system configurations, the average
total cache (system) energy saved by SPCS is 62\%
(22\%), while the two DPCS policies achieve roughly
similar energy reduction, around 79\% (26\%). On
average, the DPCS approaches incur 2.24\% performance
and 6\% area penalties.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Michaud:2015:RCM,
author = "Pierre Michaud and Andrea Mondelli and Andr{\'e}
Seznec",
title = "Revisiting Clustered Microarchitecture for Future
Superscalar Cores: a Case for Wide Issue Clusters",
journal = j-TACO,
volume = "12",
number = "3",
pages = "28:1--28:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2800787",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Oct 7 18:51:05 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "During the past 10 years, the clock frequency of
high-end superscalar processors has not increased.
Performance keeps growing mainly by integrating more
cores on the same chip and by introducing new
instruction set extensions. However, this benefits only
some applications and requires rewriting and/or
recompiling these applications. A more general way to
accelerate applications is to increase the IPC, the
number of instructions executed per cycle. Although the
focus of academic microarchitecture research moved away
from IPC techniques, the IPC of commercial processors
was continuously improved during these years. We argue
that some of the benefits of technology scaling should
be used to raise the IPC of future superscalar cores
further. Starting from microarchitecture parameters
similar to recent commercial high-end cores, we show
that an effective way to increase the IPC is to allow
the out-of-order engine to issue more micro-ops per
cycle. But this must be done without impacting the
clock cycle. We propose combining two techniques:
clustering and register write specialization. Past
research on clustered microarchitectures focused on
narrow issue clusters, as the emphasis at that time was
on allowing high clock frequencies. Instead, in this
study, we consider wide issue clusters, with the goal
of increasing the IPC under a constant clock frequency.
We show that on a wide issue dual cluster, a very
simple steering policy that sends 64 consecutive
instructions to the same cluster, the next 64
instructions to the other cluster, and so forth,
permits tolerating an intercluster delay of three
cycles. We also propose a method for decreasing the
energy cost of sending results from one cluster to the
other cluster.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "28",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Natarajan:2015:LTE,
author = "Ragavendra Natarajan and Antonia Zhai",
title = "Leveraging Transactional Execution for Memory
Consistency Model Emulation",
journal = j-TACO,
volume = "12",
number = "3",
pages = "29:1--29:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2786980",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Oct 7 18:51:05 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "System emulation is widely used in today's computer
systems. This technology opens new opportunities for
resource sharing as well as enhancing system security
and reliability. System emulation across different
instruction set architectures (ISA) can enable further
opportunities. For example, cross-ISA emulation can
enable workload consolidation over a wide range of
microprocessors and potentially facilitate the seamless
deployment of new processor architectures. As multicore
and manycore processors become pervasive, it is
important to address the challenges toward supporting
system emulation on these platforms. A key challenge in
cross-ISA emulation on multicore systems is ensuring
the correctness of emulation when the guest and the
host memory consistency models differ. Many existing
cross-ISA system emulators are sequential, thus they
are able to avoid this problem at the cost of
significant performance degradation. Recently proposed
parallel emulators are able to address the performance
limitation; however, they provide limited support for
memory consistency model emulation. When the host
system has a weaker memory consistency model compared
to the guest system, the emulator can insert memory
fences at appropriate locations in the translated code
to enforce the guest memory ordering constraints. These
memory fences can significantly degrade the performance
of the translated code. Transactional execution support
available on certain recent microprocessors provides an
alternative approach. Transactional execution of the
translated code enforces sequential consistency (SC) at
the coarse-grained transaction level, which in turn
ensures that all memory accesses made on the host
machine conform to SC. Enforcing SC on the host machine
guarantees that the emulated execution will be correct
for any guest memory model. In this article, we compare
and evaluate the overheads associated with using
transactions and fences for memory consistency model
emulation on the Intel Haswell processor. Our
experience of implementing these two approaches on a
state-of-the-art parallel emulator, COREMU,
demonstrates that memory consistency model emulation
using transactions performs better when the transaction
sizes are large enough to amortize the transaction
overhead and the transaction conflict rate is low,
whereas inserting memory fences is better for
applications in which the transaction overhead is high.
A hybrid implementation that dynamically determines
which approach to invoke can outperform both
approaches. Our results, based on the SPLASH-2 and the
PARSEC benchmark suites, demonstrate that the proposed
hybrid approach is able to outperform the fence
insertion mechanism by 4.9\% and the transactional
execution approach by 24.9\% for two-thread
applications, and outperform them by 4.5\% and 44.7\%,
respectively, for four-threaded execution.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "29",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Panda:2015:CUD,
author = "Biswabandan Panda and Shankar Balachandran",
title = "{CAFFEINE}: a Utility-Driven Prefetcher Aggressiveness
Engine for Multicores",
journal = j-TACO,
volume = "12",
number = "3",
pages = "30:1--30:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2806891",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Oct 7 18:51:05 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Aggressive prefetching improves system performance by
hiding and tolerating off-chip memory latency. However,
on a multicore system, prefetchers of different cores
contend for shared resources and aggressive prefetching
can degrade the overall system performance. The role of
a prefetcher aggressiveness engine is to select
appropriate aggressiveness levels for each prefetcher
such that shared resource contention caused by
prefetchers is reduced, thereby improving system
performance. State-of-the-art prefetcher aggressiveness
engines monitor metrics such as prefetch accuracy,
bandwidth consumption, and last-level cache pollution.
They use carefully tuned thresholds for these metrics,
and when the thresholds are crossed, they trigger
aggressiveness control measures. These engines have
three major shortcomings: (1) thresholds are dependent
on the system configuration (cache size, DRAM
scheduling policy, and cache replacement policy) and
have to be tuned appropriately, (2) there is no single
threshold that works well across all the workloads, and
(3) thresholds are oblivious to the phase change of
applications. To overcome these shortcomings, we
propose CAFFEINE, a model-based approach that analyzes
the effectiveness of a prefetcher and uses a metric
called net utility to control the aggressiveness. Our
metric provides net processor cycles saved because of
prefetching by approximating the cycles saved across
the memory subsystem, from last-level cache to DRAM. We
evaluate CAFFEINE across a wide range of workloads and
compare it with the state-of-the-art prefetcher
aggressiveness engine. Experimental results demonstrate
that, on average (geomean), CAFFEINE achieves 9.5\% (as
much as 38.29\%) and 11\% (as much as 20.7\%) better
performance than the best-performing aggressiveness
engine for four-core and eight-core systems,
respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "30",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhao:2015:BSB,
author = "Jishen Zhao and Sheng Li and Jichuan Chang and John L.
Byrne and Laura L. Ramirez and Kevin Lim and Yuan Xie
and Paolo Faraboschi",
title = "{Buri}: Scaling Big-Memory Computing with
Hardware-Based Memory Expansion",
journal = j-TACO,
volume = "12",
number = "3",
pages = "31:1--31:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2808233",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Oct 7 18:51:05 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Motivated by the challenges of scaling up memory
capacity and fully exploiting the benefits of memory
compression, we propose Buri, a hardware-based memory
compression scheme, which simultaneously achieves cost
efficiency, high performance, and ease of adoption.
Buri combines (1) a self-contained, ready-to-adopt
hardware compression module, which manages metadata
compression and memory allocation/relocation
operations; (2) a set of hardware optimization
mechanisms, which reduce the area and performance
overheads in accommodating the address indirection
required by memory compression; and (3) lightweight
BIOS/OS extensions used to handle exceptions. Our
evaluation with large memory workload traces shows that
Buri can increase capacity by 70\%, in addition to the
compression ratio already provided by database
software.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "31",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lucas:2015:SSS,
author = "Jan Lucas and Michael Andersch and Mauricio
Alvarez-Mesa and Ben Juurlink",
title = "Spatiotemporal {SIMT} and Scalarization for Improving
{GPU} Efficiency",
journal = j-TACO,
volume = "12",
number = "3",
pages = "32:1--32:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2811402",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Oct 7 18:51:05 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Temporal SIMT (TSIMT) has been suggested as an
alternative to conventional (spatial) SIMT for
improving GPU performance on branch-intensive code.
Although TSIMT has been briefly mentioned before, it
was not evaluated. We present a complete design and
evaluation of TSIMT GPUs, along with the inclusion of
scalarization and a combination of temporal and spatial
SIMT, named Spatiotemporal SIMT (STSIMT). Simulations
show that TSIMT alone results in a performance
reduction, but a combination of scalarization and
STSIMT yields a mean performance enhancement of 19.6\%
and improves the energy-delay product by 26.2\%
compared to SIMT.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "32",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Das:2016:RDB,
author = "Subhasis Das and Tor M. Aamodt and William J. Dally",
title = "Reuse Distance-Based Probabilistic Cache Replacement",
journal = j-TACO,
volume = "12",
number = "4",
pages = "33:1--33:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2818374",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article proposes Probabilistic Replacement Policy
(PRP), a novel replacement policy that evicts the line
with minimum estimated hit probability under optimal
replacement instead of the line with maximum expected
reuse distance. The latter is optimal under the
independent reference model of programs, which does not
hold for last-level caches (LLC). PRP requires 7\% and
2\% metadata overheads in the cache and DRAM
respectively. Using a sampling scheme makes DRAM
overhead negligible, with minimal performance impact.
Including detailed overhead modeling and equal cache
areas, PRP outperforms SHiP, a state-of-the-art LLC
replacement algorithm, by 4\% for memory-intensive
SPEC-CPU2006 benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "33",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Deniz:2016:MGM,
author = "Etem Deniz and Alper Sen",
title = "{MINIME-GPU}: Multicore Benchmark Synthesizer for
{GPUs}",
journal = j-TACO,
volume = "12",
number = "4",
pages = "34:1--34:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2818693",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We introduce MINIME-GPU, a novel automated benchmark
synthesis framework for graphics processing units
(GPUs) that serves to speed up architectural simulation
of modern GPU architectures. Our framework captures
important characteristics of original GPU applications
and generates synthetic GPU benchmarks using the Open
Computing Language (OpenCL) library from those
applications. To the best of our knowledge, this is the
first time synthetic OpenCL benchmarks for GPUs are
generated from existing applications. We use several
characteristics, including instruction throughput,
compute unit occupancy, and memory efficiency, to
compare the similarity of original applications and
their corresponding synthetic benchmarks. The
experimental results show that our synthetic benchmark
generation framework is capable of generating synthetic
benchmarks that have similar characteristics with the
original applications from which they are generated. On
average, the similarity (accuracy) is 96\% and the
speedup is 541 $ \times $ . In addition, our synthetic
benchmarks use the OpenCL library, which allows us to
obtain portable human readable benchmarks as opposed to
using assembly-level code, and they are faster and
smaller than the original applications from which they
are generated. We experimentally validated that our
synthetic benchmarks preserve the characteristics of
the original applications across different
architectures.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "34",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tan:2016:SEE,
author = "Li Tan and Zizhong Chen and Shuaiwen Leon Song",
title = "Scalable Energy Efficiency with Resilience for High
Performance Computing Systems: a Quantitative
Methodology",
journal = j-TACO,
volume = "12",
number = "4",
pages = "35:1--35:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2822893",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Ever-growing performance of supercomputers nowadays
brings demanding requirements of energy efficiency and
resilience, due to rapidly expanding size and duration
in use of the large-scale computing systems. Many
application/architecture-dependent parameters that
determine energy efficiency and resilience individually
have causal effects with each other, which directly
affect the trade-offs among performance, energy
efficiency and resilience at scale. To enable
high-efficiency management for large-scale
High-Performance Computing (HPC) systems nowadays,
quantitatively understanding the entangled effects
among performance, energy efficiency, and resilience is
thus required. While previous work focuses on exploring
energy-saving and resilience-enhancing opportunities
separately, little has been done to theoretically and
empirically investigate the interplay between energy
efficiency and resilience at scale. In this article, by
extending the Amdahl's Law and the Karp-Flatt Metric,
taking resilience into consideration, we quantitatively
model the integrated energy efficiency in terms of
performance per Watt and showcase the trade-offs among
typical HPC parameters, such as number of cores,
frequency/voltage, and failure rates. Experimental
results for a wide spectrum of HPC benchmarks on two
HPC systems show that the proposed models are accurate
in extrapolating resilience-aware performance and
energy efficiency, and capable of capturing the
interplay among various energy-saving and resilience
factors. Moreover, the models can help find the optimal
HPC configuration for the highest integrated energy
efficiency, in the presence of failures and applied
resilience techniques.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "35",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pusukuri:2016:TEL,
author = "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N.
Bhuyan",
title = "{Tumbler}: an Effective Load-Balancing Technique for
Multi-{CPU} Multicore Systems",
journal = j-TACO,
volume = "12",
number = "4",
pages = "36:1--36:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2827698",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Schedulers used by modern OSs (e.g., Oracle Solaris
11{\trademark} and GNU/Linux) balance load by balancing
the number of threads in run queues of different cores.
While this approach is effective for a single CPU
multicore system, we show that it can lead to a
significant load imbalance across CPUs of a multi-CPU
multicore system. Because different threads of a
multithreaded application often exhibit different
levels of CPU utilization, load cannot be measured in
terms of the number of threads alone. We propose
Tumbler that migrates the threads of a multithreaded
program across multiple CPUs to balance the load across
the CPUs. While Tumbler distributes the threads equally
across the CPUs, its assignment of threads to CPUs is
aimed at minimizing the variation in utilization of
different CPUs to achieve load balance. We evaluated
Tumbler using a wide variety of 35 multithreaded
applications, and our experimental results show that
Tumbler outperforms both Oracle Solaris 11{\trademark}
and GNU/Linux.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "36",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tomusk:2016:FME,
author = "Erik Tomusk and Christophe Dubach and Michael
O'Boyle",
title = "Four Metrics to Evaluate Heterogeneous Multicores",
journal = j-TACO,
volume = "12",
number = "4",
pages = "37:1--37:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2829950",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Semiconductor device scaling has made single-ISA
heterogeneous processors a reality. Heterogeneous
processors contain a number of different CPU cores that
all implement the same Instruction Set Architecture
(ISA). This enables greater flexibility and
specialization, as runtime constraints and workload
characteristics can influence which core a given
workload is run on. A major roadblock to the further
development of heterogeneous processors is the lack of
appropriate evaluation metrics. Existing metrics can be
used to evaluate individual cores, but to evaluate a
heterogeneous processor, the cores must be considered
as a collective. Without appropriate metrics, it is
impossible to establish design goals for processors,
and it is difficult to accurately compare two different
heterogeneous processors. We present four new metrics
to evaluate user-oriented aspects of sets of
heterogeneous cores: localized nonuniformity, gap
overhead, set overhead, and generality. The metrics
consider sets rather than individual cores. We use
examples to demonstrate each metric, and show that the
metrics can be used to quantify intuitions about
heterogeneous cores.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "37",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hoseinzadeh:2016:SSP,
author = "Morteza Hoseinzadeh and Mohammad Arjomand and Hamid
Sarbazi-Azad",
title = "{SPCM}: The Striped Phase Change Memory",
journal = j-TACO,
volume = "12",
number = "4",
pages = "38:1--38:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2829951",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Phase Change Memory (PCM) devices are one of the known
promising technologies to take the place of DRAM
devices with the aim of overcoming the obstacles of
reducing feature size and stopping ever growing amounts
of leakage power. In exchange for providing high
capacity, high density, and nonvolatility, PCM
Multilevel Cells (MLCs) impose high write energy and
long latency. Many techniques have been proposed to
resolve these side effects. However, read performance
issues are usually left behind the great importance of
write latency, energy, and lifetime. In this article,
we focus on read performance and improve the critical
path latency of the main memory system. To this end, we
exploit striping scheme by which multiple lines are
grouped and lie on a single MLC line array. In order to
achieve more performance gain, an adaptive ordering
mechanism is used to sort lines in a group based on
their read frequency. This scheme imposes large energy
and lifetime overheads due to its intensive demand for
higher write bandwidth. Thus, we equipped our design
with a grouping/pairing write queue to synchronize
write-back requests such that all updates to an MLC
array occur at once. The design is also augmented by a
directional write scheme that takes benefits of the
uniformity of accesses to the PCM device---caused by
the large DRAM cache---to determine the writing mode
(striped or nonstriped). This adaptation to write
operations relaxes the energy and lifetime overheads.
We improve the read latency of a 2-bit MLC PCM memory
by more than 24\% (and Instructions Per Cycle (IPC) by
about 9\%) and energy-delay product by about 20\% for a
small lifetime degradation of 8\%, on average.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "38",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jiang:2016:TLH,
author = "Chuntao Jiang and Zhibin Yu and Lieven Eeckhout and
Hai Jin and Xiaofei Liao and Chengzhong Xu",
title = "Two-Level Hybrid Sampled Simulation of Multithreaded
Applications",
journal = j-TACO,
volume = "12",
number = "4",
pages = "39:1--39:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2818353",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Sampled microarchitectural simulation of
single-threaded applications is mature technology for
over a decade now. Sampling multithreaded applications,
on the other hand, is much more complicated. Not until
very recently have researchers proposed solutions for
sampled simulation of multithreaded applications.
Time-Based Sampling (TBS) samples multithreaded
application execution based on time---not instructions
as is typically done for single-threaded
applications---yielding estimates for a multithreaded
application's execution time. In this article, we
revisit and analyze previously proposed TBS approaches
(periodic and cantor fractal based sampling), and we
obtain a number of novel and surprising insights, such
as (i) accurately estimating fast-forwarding IPC, that
is, performance in-between sampling units, is more
important than accurately estimating sample IPC, that
is, performance within the sampling units; (ii)
fast-forwarding IPC estimation accuracy is determined
by both the sampling unit distribution and how to use
the sampling units to predict fast-forwarding IPC; and
(iii) cantor sampling is more accurate at small
sampling unit sizes, whereas periodic is more accurate
at large sampling unit sizes. These insights lead to
the development of Two-level Hybrid Sampling (THS), a
novel sampling methodology for multithreaded
applications that combines periodic sampling's accuracy
at large time scales (i.e., uniformly selecting
coarse-grain sampling units across the entire program
execution) with cantor sampling's accuracy at small
time scales (i.e., the ability to accurately predict
fast-forwarding IPC in-between small sampling units).
The clustered occurrence of small sampling units under
cantor sampling also enables shortened warmup and thus
enhanced simulation speed. Overall, THS achieves an
average absolute execution time prediction error of 4\%
while yielding an average simulation speedup of 40 $
\times $ compared to detailed simulation, which is both
more accurate and faster than the current
state-of-the-art. Case studies illustrate THS' ability
to accurately predict relative performance differences
across the design space.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Dsouza:2016:IMS,
author = "Sandeep D'souza and Soumya J. and Santanu
Chattopadhyay",
title = "Integrated Mapping and Synthesis Techniques for
Network-on-Chip Topologies with Express Channels",
journal = j-TACO,
volume = "12",
number = "4",
pages = "40:1--40:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2831233",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The addition of express channels to a traditional mesh
network-on-chip (NoC) has emerged as a viable solution
to solve the problem of high latency. In this article,
we address the problem of integrated mapping and
synthesis for express channel--based mesh NoC
topologies. An integer linear programming--based
formulation has been presented for the mapping problem
followed by a constructive heuristic for simultaneous
application mapping and synthesis for an express
channel--based NoC. The static and dynamic simulation
results indicate that the obtained mappings lead to
significant reduction in both average packet delay and
network energy consumption. The obtained synthesized
topologies were also found to be much more power
efficient compared to conventional express channel
topologies.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "40",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chasapis:2016:PEI,
author = "Dimitrios Chasapis and Marc Casas and Miquel
Moret{\'o} and Raul Vidal and Eduard Ayguad{\'e} and
Jes{\'u}s Labarta and Mateo Valero",
title = "{PARSECSs}: Evaluating the Impact of Task Parallelism
in the {PARSEC} Benchmark Suite",
journal = j-TACO,
volume = "12",
number = "4",
pages = "41:1--41:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2829952",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this work, we show how parallel applications can be
implemented efficiently using task parallelism. We also
evaluate the benefits of such parallel paradigm with
respect to other approaches. We use the PARSEC
benchmark suite as our test bed, which includes
applications representative of a wide range of domains
from HPC to desktop and server applications. We adopt
different parallelization techniques, tailored to the
needs of each application, to fully exploit the
task-based model. Our evaluation shows that task
parallelism achieves better performance than
thread-based parallelization models, such as Pthreads.
Our experimental results show that we can obtain
scalability improvements up to 42\% on a 16-core system
and code size reductions up to 81\%. Such reductions
are achieved by removing from the source code
application specific schedulers or thread pooling
systems and transferring these responsibilities to the
runtime system software.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "41",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gaspar:2016:FAG,
author = "Francisco Gaspar and Luis Tani{\c{c}}a and Pedro
Tom{\'a}s and Aleksandar Ilic and Leonel Sousa",
title = "A Framework for Application-Guided Task Management on
Heterogeneous Embedded Systems",
journal = j-TACO,
volume = "12",
number = "4",
pages = "42:1--42:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2835177",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this article, we propose a general framework for
fine-grain application-aware task management in
heterogeneous embedded platforms, which allows
integration of different mechanisms for an efficient
resource utilization, frequency scaling, and task
migration. The proposed framework incorporates several
components for accurate runtime monitoring by relying
on the OS facilities and performance self-reporting for
parallel and iterative applications. The framework
efficiency is experimentally evaluated on a real
hardware platform, where significant power and energy
savings are attained for SPEC CPU2006 and PARSEC
benchmarks, by guiding frequency scaling and
intercluster migrations according to the runtime
application behavior and predefined performance
targets.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "42",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ardestani:2016:MMV,
author = "Ehsan K. Ardestani and Rafael Trapani Possignolo and
Jose Luis Briz and Jose Renau",
title = "Managing Mismatches in Voltage Stacking with
{CoreUnfolding}",
journal = j-TACO,
volume = "12",
number = "4",
pages = "43:1--43:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2835178",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Five percent to 25\% of power could be wasted before
it is delivered to the computational resources on a
die, due to inefficiencies of voltage regulators and
resistive loss. The power delivery could benefit if, at
the same power, the delivered voltage increases and the
current decreases. This article presents CoreUnfolding,
a technique that leverages voltage Stacking to improve
power delivery efficiency. Our experiments show that
about 10\% system-wide power can be saved, the voltage
regulator area can be reduced by 30\%, di / dt improves
49\%, and the power pin count is reduced by 40\%
({\SGMLap} 20\% reduction in packaging costs), with
negligible performance degradation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "43",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Nair:2016:FFC,
author = "Prashant J. Nair and David A. Roberts and Moinuddin K.
Qureshi",
title = "{FaultSim}: a Fast, Configurable Memory-Reliability
Simulator for Conventional and {$3$D}-Stacked Systems",
journal = j-TACO,
volume = "12",
number = "4",
pages = "44:1--44:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2831234",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As memory systems scale, maintaining their Reliability
Availability and Serviceability (RAS) is becoming more
complex. To make matters worse, recent studies of DRAM
failures in data centers and supercomputer environments
have highlighted that large-granularity failures are
common in DRAM chips. Furthermore, the move toward
3D-stacked memories can make the system vulnerable to
newer failure modes, such as those occurring from
faults in Through-Silicon Vias (TSVs). To architect
future systems and to use emerging technology, system
designers will need to employ strong error correction
and repair techniques. Unfortunately, evaluating the
relative effectiveness of these reliability mechanisms
is often difficult and is traditionally done with
analytical models, which are both error prone and
time-consuming to develop. To this end, this article
proposes FaultSim, a fast configurable
memory-reliability simulation tool for 2D and
3D-stacked memory systems. FaultSim employs Monte Carlo
simulations, which are driven by real-world failure
statistics. We discuss the novel algorithms and data
structures used in FaultSim to accelerate the
evaluation of different resilience schemes. We
implement BCH-1 (SECDED) and ChipKill codes using
FaultSim and validate against an analytical model.
FaultSim implements BCH-1 and ChipKill codes with a
deviation of only 0.032\% and 8.41\% from the
analytical model. FaultSim can simulate 1 million Monte
Carlo trials (each for a period of 7 years) of BCH-1
and ChipKill codes in only 34 seconds and 33 seconds,
respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "44",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lee:2016:ACS,
author = "Byeongcheol Lee",
title = "Adaptive Correction of Sampling Bias in Dynamic Call
Graphs",
journal = j-TACO,
volume = "12",
number = "4",
pages = "45:1--45:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2840806",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article introduces a practical low-overhead
adaptive technique of correcting sampling bias in
profiling dynamic call graphs. Timer-based sampling
keeps the overhead low but sampling bias lowers the
accuracy when either observable call events or sampling
actions are not equally spaced in time. To mitigate
sampling bias, our adaptive correction technique
weights each sample by monitoring time-varying spacing
of call events and sampling actions. We implemented and
evaluated our adaptive correction technique in Jikes
RVM, a high-performance virtual machine. In our
empirical evaluation, our technique significantly
improved the sampling accuracy without measurable
overhead and resulted in effective feedback directed
inlining.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "45",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mcpherson:2016:FPL,
author = "Andrew J. Mcpherson and Vijay Nagarajan and Susmit
Sarkar and Marcelo Cintra",
title = "Fence Placement for Legacy Data-Race-Free Programs via
Synchronization Read Detection",
journal = j-TACO,
volume = "12",
number = "4",
pages = "46:1--46:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2835179",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Shared-memory programmers traditionally assumed
Sequential Consistency (SC), but modern systems have
relaxed memory consistency. Here, the trend in
languages is toward Data-Race-Free (DRF) models, where,
assuming annotated synchronizations and the program
being well-synchronized by those synchronizations, the
hardware and compiler guarantee SC. However, legacy
programs lack annotations, so even well-synchronized
(legacy DRF) programs aren't recognized. For legacy DRF
programs, we can significantly prune the set of memory
orderings determined by automated fence placement by
automatically identifying synchronization reads. We
prove our rules for identifying them conservatively,
implement them within LLVM, and observe a 30\% average
performance improvement over previous techniques.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "46",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hong:2016:OCT,
author = "Ding-Yong Hong and Chun-Chen Hsu and Cheng-Yi Chou and
Wei-Chung Hsu and Pangfeng Liu and Jan-Jan Wu",
title = "Optimizing Control Transfer and Memory Virtualization
in Full System Emulators",
journal = j-TACO,
volume = "12",
number = "4",
pages = "47:1--47:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2837027",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Full system emulators provide virtual platforms for
several important applications, such as kernel and
system software development, co-verification with cycle
accurate CPU simulators, or application development for
hardware still in development. Full system emulators
usually use dynamic binary translation to obtain
reasonable performance. This paper focuses on
optimizing the performance of full system emulators.
First, we optimize performance by enabling classic
control transfer optimizations of dynamic binary
translation in full system emulation, such as indirect
branch target caching and block chaining. Second, we
improve the performance of memory virtualization of
cross-ISA virtual machines by improving the efficiency
of the software translation lookaside buffer (software
TLB). We implement our optimizations on QEMU, an
industrial-strength full system emulator, along with
the Android emulator. Experimental results show that
our optimizations achieve an average speedup of 1.98X
for ARM-to-X86-64 QEMU running SPEC CINT2006 benchmarks
with train inputs. Our optimizations also achieve an
average speedup of 1.44X and 1.40X for IA32-to-X86-64
QEMU and AArch64-to-X86-64 QEMU on SPEC CINT2006. We
use a set of real applications downloaded from Google
Play as benchmarks for the Android emulator.
Experimental results show that our optimizations
achieve an average speedup of 1.43X for the Android
emulator running these applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "47",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sukumaran-Rajam:2016:PMN,
author = "Aravind Sukumaran-Rajam and Philippe Clauss",
title = "The Polyhedral Model of Nonlinear Loops",
journal = j-TACO,
volume = "12",
number = "4",
pages = "48:1--48:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2838734",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Runtime code optimization and speculative execution
are becoming increasingly prominent to leverage
performance in the current multi- and many-core era.
However, a wider and more efficient use of such
techniques is mainly hampered by the prohibitive time
overhead induced by centralized data race detection,
dynamic code behavior modeling, and code generation.
Most of the existing Thread Level Speculation (TLS)
systems rely on naively slicing the target loops into
chunks and trying to execute the chunks in parallel
with the help of a centralized performance-penalizing
verification module that takes care of data races. Due
to the lack of a data dependence model, these
speculative systems are not capable of doing advanced
transformations, and, more importantly, the chances of
rollback are high. The polyhedral model is a well-known
mathematical model to analyze and optimize loop nests.
The current state-of-art tools limit the application of
the polyhedral model to static control codes. Thus,
none of these tools can generally handle codes with
while loops, indirect memory accesses, or pointers.
Apollo (Automatic POLyhedral Loop Optimizer) is a
framework that goes one step beyond and applies the
polyhedral model dynamically by using TLS. Apollo can
predict, at runtime, whether the codes are behaving
linearly or not, and it applies polyhedral
transformations on-the-fly. This article presents a
novel system that enables Apollo to handle codes whose
memory accesses and loop bounds are not necessarily
linear. More generally, this approach expands the
applicability of the polyhedral model at runtime to a
wider class of codes. Plugging together both linear and
nonlinear accesses to the dependence prediction model
enables the application of polyhedral loop optimizing
transformations even for nonlinear code kernels while
also allowing a low-cost speculation verification.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "48",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Nair:2016:CEP,
author = "Prashant J. Nair and David A. Roberts and Moinuddin K.
Qureshi",
title = "Citadel: Efficiently Protecting Stacked Memory from
{TSV} and Large Granularity Failures",
journal = j-TACO,
volume = "12",
number = "4",
pages = "49:1--49:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2840807",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Stacked memory modules are likely to be tightly
integrated with the processor. It is vital that these
memory modules operate reliably, as memory failure can
require the replacement of the entire socket. To make
matters worse, stacked memory designs are susceptible
to newer failure modes (e.g., due to faulty
through-silicon vias, or TSVs) that can cause large
portions of memory, such as a bank, to become faulty.
To avoid data loss from large-granularity failures, the
memory system may use symbol-based codes that stripe
the data for a cache line across several banks (or
channels). Unfortunately, such data-striping reduces
memory-level parallelism, causing significant slowdown
and higher power consumption. This article proposes
Citadel, a robust memory architecture that allows the
memory system to retain each cache line within one
bank. By retaining cache lines within banks, Citadel
enables a high-performance and low-power memory system
and also efficiently protects the stacked memory system
from large-granularity failures. Citadel consists of
three components; TSV-Swap, which can tolerate both
faulty data-TSVs and faulty address-TSVs;
Tri-Dimensional Parity (3DP), which can tolerate column
failures, row failures, and bank failures; and Dynamic
Dual-Granularity Sparing (DDS), which can mitigate
permanent faults by dynamically sparing faulty memory
regions either at a row granularity or at a bank
granularity. Our evaluations with real-world data for
DRAM failures show that Citadel provides performance
and power similar to maintaining the entire cache line
in the same bank, and yet provides 700 $ \times $
higher reliability than ChipKill-like ECC codes.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "49",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Anderson:2016:AVI,
author = "Andrew Anderson and Avinash Malik and David Gregg",
title = "Automatic Vectorization of Interleaved Data
Revisited",
journal = j-TACO,
volume = "12",
number = "4",
pages = "50:1--50:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2838735",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Automatically exploiting short vector instructions
sets (SSE, AVX, NEON) is a critically important task
for optimizing compilers. Vector instructions typically
work best on data that is contiguous in memory, and
operating on non-contiguous data requires additional
work to gather and scatter the data. There are several
varieties of non-contiguous access, including
interleaved data access. An existing approach used by
GCC generates extremely efficient code for loops with
power-of-2 interleaving factors (strides). In this
paper we propose a generalization of this approach that
produces similar code for any compile-time constant
interleaving factor. In addition, we propose several
novel program transformations, which were made possible
by our generalized representation of the problem.
Experiments show that our approach achieves significant
speedups for both power-of-2 and non--power-of-2
interleaving factors. Our vectorization approach
results in mean speedups over scalar code of 1.77x on
Intel SSE and 2.53x on Intel AVX2 in real-world
benchmarking on a selection of BLAS Level 1 routines.
On the same benchmark programs, GCC 5.0 achieves mean
improvements of 1.43x on Intel SSE and 1.30x on Intel
AVX2. In synthetic benchmarking on Intel SSE, our
maximum improvement on data movement is over 4x for
gathering operations and over 6x for scattering
operations versus scalar code.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "50",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhao:2016:FMR,
author = "Lihang Zhao and Lizhong Chen and Woojin Choi and
Jeffrey Draper",
title = "A Filtering Mechanism to Reduce Network Bandwidth
Utilization of Transaction Execution",
journal = j-TACO,
volume = "12",
number = "4",
pages = "51:1--51:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2837028",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Hardware Transactional Memory (HTM) relies heavily on
the on-chip network for intertransaction communication.
However, the network bandwidth utilization of
transactions has been largely neglected in HTM designs.
In this work, we propose a cost model to analyze
network bandwidth in transaction execution. The cost
model identifies a set of key factors that can be
optimized through system design to reduce the
communication cost of HTM. Based on the model and
network traffic characterization of a representative
HTM design, we identify a huge source of superfluous
traffic due to failed requests in transaction
conflicts. As observed in a spectrum of workloads, 39\%
of the transactional requests fail due to conflicts,
which renders 58\% of the transactional network traffic
futile. To combat this pathology, a novel in-network
filtering mechanism is proposed. The on-chip router is
augmented to predict conflicts among transactions and
proactively filter out those requests that have a high
probability to fail. Experimental results show the
proposed mechanism reduces total network traffic by
24\% on average for a set of high-contention TM
applications, thereby reducing energy consumption by an
average of 24\%. Meanwhile, the contention in the
coherence directory is reduced by 68\%, on average.
These improvements are achieved with only 5\% area
added to a conventional on-chip router design.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "51",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Serres:2016:EPP,
author = "Olivier Serres and Abdullah Kayi and Ahmad Anbar and
Tarek El-Ghazawi",
title = "Enabling {PGAS} Productivity with Hardware Support for
Shared Address Mapping: a {UPC} Case Study",
journal = j-TACO,
volume = "12",
number = "4",
pages = "52:1--52:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2842686",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Due to its rich memory model, the partitioned global
address space (PGAS) parallel programming model strikes
a balance between locality-awareness and the ease of
use of the global address space model. Although
locality-awareness can lead to high performance,
supporting the PGAS memory model is associated with
penalties that can hinder PGAS's potential for
scalability and speed of execution. This is because
mapping the PGAS memory model to the underlying system
requires a mapping process that is done in software,
thereby introducing substantial overhead for shared
accesses even when they are local. Compiler
optimizations have not been sufficient to offset this
overhead. On the other hand, manual code optimizations
can help, but this eliminates the productivity edge of
PGAS. This article proposes a processor
microarchitecture extension that can perform such
address mapping in hardware with nearly no performance
overhead. These extensions are then availed to
compilers through extensions to the processor
instructions. Thus, the need for manual optimizations
is eliminated and the productivity of PGAS languages is
unleashed. Using Unified Parallel C (UPC), a PGAS
language, we present a case study of a prototype
compiler and architecture support. Two different
implementations of the system were realized. The first
uses a full-system simulator, gem5, which evaluates the
overall performance gain of the new hardware support.
The second uses an FPGA Leon3 soft-core processor to
verify implementation feasibility and to parameterize
the cost of the new hardware. The new instructions show
promising results on all tested codes, including the
NAS Parallel Benchmark kernels in UPC. Performance
improvements of up to 5.5 $ \times $ for unmodified
codes, sometimes surpassing hand-optimized performance,
were demonstrated. We also show that our four-core FPGA
prototype requires less than 2.4\% of the overall
chip's area.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cattaneo:2016:HAI,
author = "Riccardo Cattaneo and Giuseppe Natale and Carlo
Sicignano and Donatella Sciuto and Marco Domenico
Santambrogio",
title = "On How to Accelerate Iterative Stencil Loops: a
Scalable Streaming-Based Approach",
journal = j-TACO,
volume = "12",
number = "4",
pages = "53:1--53:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2842615",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In high-performance systems, stencil computations play
a crucial role as they appear in a variety of different
fields of application, ranging from partial
differential equation solving, to computer simulation
of particles' interaction, to image processing and
computer vision. The computationally intensive nature
of those algorithms created the need for solutions to
efficiently implement them in order to save both
execution time and energy. This, in combination with
their regular structure, has justified their widespread
study and the proposal of largely different approaches
to their optimization. However, most of these works are
focused on aggressive compile time optimization, cache
locality optimization, and parallelism extraction for
the multicore/multiprocessor domain, while fewer works
are focused on the exploitation of custom architectures
to further exploit the regular structure of Iterative
Stencil Loops (ISLs), specifically with the goal of
improving power efficiency. This work introduces a
methodology to systematically design power-efficient
hardware accelerators for the optimal execution of ISL
algorithms on Field-programmable Gate Arrays (FPGAs).
As part of the methodology, we introduce the notion of
Streaming Stencil Time-step (SST), a streaming-based
architecture capable of achieving both low resource
usage and efficient data reuse thanks to an optimal
data buffering strategy, and we introduce a technique
called SSTs queuing that is capable of delivering a
pseudolinear execution time speedup with constant
bandwidth. The methodology has been validated on
significant benchmarks on a Virtex-7 FPGA using the
Xilinx Vivado suite. Results demonstrate how the
efficient usage of the on-chip memory resources
realized by an SST allows one to treat problem sizes
whose implementation would otherwise not be possible
via direct synthesis of the original, unmanipulated
code via High-Level Synthesis (HLS). We also show how
the SSTs queuing effectively ensures a pseudolinear
throughput speedup while consuming constant off-chip
bandwidth.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "53",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{C:2016:FGM,
author = "Unnikrishnan C and Rupesh Nasre and Y. N. Srikant",
title = "{Falcon}: a Graph Manipulation Language for
Heterogeneous Systems",
journal = j-TACO,
volume = "12",
number = "4",
pages = "54:1--54:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2842618",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Graph algorithms have been shown to possess enough
parallelism to keep several computing resources
busy---even hundreds of cores on a GPU. Unfortunately,
tuning their implementation for efficient execution on
a particular hardware configuration of heterogeneous
systems consisting of multicore CPUs and GPUs is
challenging, time consuming, and error prone. To
address these issues, we propose a domain-specific
language (DSL), Falcon, for implementing graph
algorithms that (i) abstracts the hardware, (ii)
provides constructs to write explicitly parallel
programs at a higher level, and (iii) can work with
general algorithms that may change the graph structure
(morph algorithms). We illustrate the usage of our DSL
to implement local computation algorithms (that do not
change the graph structure) and morph algorithms such
as Delaunay mesh refinement, survey propagation, and
dynamic SSSP on GPU and multicore CPUs. Using a set of
benchmark graphs, we illustrate that the generated code
performs close to the state-of-the-art hand-tuned
implementations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "54",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
remark = "Yes, the first author name is correct as given:
Unnikrishnan C.",
}
@Article{Kalayappan:2016:FRT,
author = "Rajshekar Kalayappan and Smruti R. Sarangi",
title = "{FluidCheck}: a Redundant Threading-Based Approach for
Reliable Execution in Manycore Processors",
journal = j-TACO,
volume = "12",
number = "4",
pages = "55:1--55:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2842620",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Soft errors have become a serious cause of concern
with reducing feature sizes. The ability to accommodate
complex, Simultaneous Multithreading (SMT) cores on a
single chip presents a unique opportunity to achieve
reliable execution, safe from soft errors, with low
performance penalties. In this context, we present
FluidCheck, a checker architecture that allows highly
flexible assignment and migration of checking duties
across cores. In this article, we present a mechanism
to dynamically use the resources of SMT cores for
checking the results of other threads, and propose a
variety of heuristics for migration of such checker
threads across cores. Secondly, to make the process of
checking more efficient, we propose a set of
architectural enhancements that reduce power
consumption, decrease the length of the critical path,
and reduce the load on the Network-on-Chip (NoC). Based
on our observations, we design a 16 core system for
running SPEC2006 based bag-of-tasks applications. Our
experiments demonstrate that fully reliable execution
can be attained with a mere 27\% slowdown, surpassing
traditional redundant threading based techniques by
roughly 42\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Elwell:2016:RMP,
author = "Jesse Elwell and Ryan Riley and Nael Abu-Ghazaleh and
Dmitry Ponomarev and Iliano Cervesato",
title = "Rethinking Memory Permissions for Protection Against
Cross-Layer Attacks",
journal = j-TACO,
volume = "12",
number = "4",
pages = "56:1--56:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2842621",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The inclusive permissions structure (e.g., the Intel
ring model) of modern commodity CPUs provides
privileged system software layers with arbitrary
permissions to access and modify client processes,
allowing them to manage these clients and the system
resources efficiently. Unfortunately, these inclusive
permissions allow a compromised high-privileged
software layer to perform arbitrary malicious
activities. In this article, our goal is to prevent
attacks that cross system layers while maintaining the
abilities of system software to manage the system and
allocate resources. In particular, we present a
hardware-supported page permission framework for
physical pages that is based on the concept of
noninclusive sets of memory permissions for different
layers of system software (such as hypervisors,
operating systems, and user-level applications).
Instead of viewing privilege levels as an ordered
hierarchy with each successive level being more
privileged, we view them as distinct levels each with
its own set of permissions. In order to enable system
software to manage client processes, we define a set of
legal permission transitions that support resource
allocation but preserve security. We show that the
model prevents a range of recent attacks. We also show
that it can be implemented with negligible performance
overhead (both at load time and at runtime), low
hardware complexity, and minimal changes to the
commodity OS and hypervisor code.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "56",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Morad:2016:RGS,
author = "Amir Morad and Leonid Yavits and Shahar Kvatinsky and
Ran Ginosar",
title = "Resistive {GP-SIMD} Processing-In-Memory",
journal = j-TACO,
volume = "12",
number = "4",
pages = "57:1--57:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2845084",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "GP-SIMD, a novel hybrid general-purpose SIMD
architecture, addresses the challenge of data
synchronization by in-memory computing, through
combining data storage and massive parallel processing.
In this article, we explore a resistive implementation
of the GP-SIMD architecture. In resistive GP-SIMD, a
novel resistive row and column addressable 4F$^2$
crossbar is utilized, replacing the modified CMOS
190F$^2$ SRAM storage previously proposed for GP-SIMD
architecture. The use of the resistive crossbar allows
scaling the GP-SIMD from few millions to few hundred
millions of processing units on a single silicon die.
The performance, power consumption and power efficiency
of a resistive GP-SIMD are compared with the CMOS
version. We find that PiM architectures and,
specifically, GP-SIMD benefit more than other many-core
architectures from using resistive memory. A framework
for in-place arithmetic operation on a single
multivalued resistive cell is explored, demonstrating a
potential to become a building block for
next-generation PiM architectures.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "57",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2016:IIB,
author = "Yaohua Wang and Dong Wang and Shuming Chen and Zonglin
Liu and Shenggang Chen and Xiaowen Chen and Xu Zhou",
title = "Iteration Interleaving--Based {SIMD} Lane Partition",
journal = j-TACO,
volume = "12",
number = "4",
pages = "58:1--58:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2847253",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The efficacy of single instruction, multiple data
(SIMD) architectures is limited when handling divergent
control flows. This circumstance results in SIMD
fragments using only a subset of the available lanes.
We propose an iteration interleaving--based SIMD lane
partition (IISLP) architecture that interleaves the
execution of consecutive iterations and dynamically
partitions SIMD lanes into branch paths with comparable
execution time. The benefits are twofold: SIMD
fragments under divergent branches can execute in
parallel, and the pathology of fragment starvation can
also be well eliminated. Our experiments show that
IISLP doubles the performance of a baseline mechanism
and provides a speedup of 28\% versus instruction
shuffle.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "58",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Aijo:2016:ILP,
author = "Tomi {\"A}ij{\"o} and Pekka J{\"a}{\"a}skel{\"a}inen
and Tapio Elomaa and Heikki Kultala and Jarmo Takala",
title = "Integer Linear Programming-Based Scheduling for
Transport Triggered Architectures",
journal = j-TACO,
volume = "12",
number = "4",
pages = "59:1--59:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2845082",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Static multi-issue machines, such as traditional Very
Long Instructional Word (VLIW) architectures, move
complexity from the hardware to the compiler. This is
motivated by the ability to support high degrees of
instruction-level parallelism without requiring
complicated scheduling logic in the processor hardware.
The simpler-control hardware results in reduced area
and power consumption, but leads to a challenge of
engineering a compiler with good code-generation
quality. Transport triggered architectures (TTA), and
other so-called exposed datapath architectures, take
the compiler-oriented philosophy even further by
pushing more details of the datapath under software
control. The main benefit of this is the reduced
register file pressure, with a drawback of adding even
more complexity to the compiler side. In this article,
we propose an Integer Linear Programming (ILP) -based
instruction scheduling model for TTAs. The model
describes the architecture characteristics, the
particular processor resource constraints, and the
operation dependencies of the scheduled program. The
model is validated and measured by compiling
application kernels to various TTAs with a different
number of datapath components and connectivity. In the
best case, the cycle count is reduced to 52\% when
compared to a heuristic scheduler. In addition to
producing shorter schedules, the number of register
accesses in the compiled programs is generally notably
less than those with the heuristic scheduler; in the
best case, the ILP scheduler reduced the number of
register file reads to 33\% of the heuristic results
and register file writes to 18\%. On the other hand, as
expected, the ILP-based scheduler uses distinctly more
time to produce a schedule than the heuristic
scheduler, but the compilation time is within tolerable
limits for production-code generation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "59",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Liu:2016:SEA,
author = "Qixiao Liu and Miquel Moreto and Jaume Abella and
Francisco J. Cazorla and Daniel A. Jimenez and Mateo
Valero",
title = "Sensible Energy Accounting with Abstract Metering for
Multicore Systems",
journal = j-TACO,
volume = "12",
number = "4",
pages = "60:1--60:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2842616",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Chip multicore processors (CMPs) are the preferred
processing platform across different domains such as
data centers, real-time systems, and mobile devices. In
all those domains, energy is arguably the most
expensive resource in a computing system. Accurately
quantifying energy usage in a multicore environment
presents a challenge as well as an opportunity for
optimization. Standard metering approaches are not
capable of delivering consistent results with shared
resources, since the same task with the same inputs may
have different energy consumption based on the mix of
co-running tasks. However, it is reasonable for
data-center operators to charge on the basis of
estimated energy usage rather than time since energy is
more correlated with their actual cost. This article
introduces the concept of Sensible Energy Accounting
(SEA). For a task running in a multicore system, SEA
accurately estimates the energy the task would have
consumed running in isolation with a given fraction of
the CMP shared resources. We explain the potential
benefits of SEA in different domains and describe two
hardware techniques to implement it for a shared
last-level cache and on-core resources in SMT
processors. Moreover, with SEA, an energy-aware
scheduler can find a highly efficient on-chip resource
assignment, reducing by up to 39\% the total processor
energy for a 4-core system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "60",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhou:2016:SAC,
author = "Miao Zhou and Yu Du and Bruce Childers and Daniel
Mosse and Rami Melhem",
title = "Symmetry-Agnostic Coordinated Management of the Memory
Hierarchy in Multicore Systems",
journal = j-TACO,
volume = "12",
number = "4",
pages = "61:1--61:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2847254",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In a multicore system, many applications share the
last-level cache (LLC) and memory bandwidth. These
resources need to be carefully managed in a coordinated
way to maximize performance. DRAM is still the
technology of choice in most systems. However, as
traditional DRAM technology faces energy, reliability,
and scalability challenges, nonvolatile memory (NVM)
technologies are gaining traction. While DRAM is
read/write symmetric (a read operation has comparable
latency and energy consumption as a write operation),
many NVM technologies (such as Phase-Change Memory,
PCM) experience read/write asymmetry: write operations
are typically much slower and more power hungry than
read operations. Whether the memory's characteristics
are symmetric or asymmetric influences the way shared
resources are managed. We propose two symmetry-agnostic
schemes to manage a shared LLC through way partitioning
and memory through bandwidth allocation. The proposals
work well for both symmetric and asymmetric memory.
First, an exhaustive search is proposed to find the
best combination of a cache way partition and bandwidth
allocation. Second, an approximate scheme, derived from
a theoretical model, is proposed without the overhead
of exhaustive search. Simulation results show that the
approximate scheme improves weighted speedup by at
least 14\% on average (regardless of the memory
symmetry) over a state-of-the-art way partitioning and
memory bandwidth allocation. Simulation results also
show that the approximate scheme achieves comparable
weighted speedup as a state-of-the-art multiple
resource management scheme, XChange, for symmetric
memory, and outperforms it by an average of 10\% for
asymmetric memory.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "61",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yazdanbakhsh:2016:RRF,
author = "Amir Yazdanbakhsh and Gennady Pekhimenko and Bradley
Thwaites and Hadi Esmaeilzadeh and Onur Mutlu and Todd
C. Mowry",
title = "{RFVP}: Rollback-Free Value Prediction with
Safe-to-Approximate Loads",
journal = j-TACO,
volume = "12",
number = "4",
pages = "62:1--62:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2836168",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article aims to tackle two fundamental memory
bottlenecks: limited off-chip bandwidth (bandwidth
wall) and long access latency (memory wall). To achieve
this goal, our approach exploits the inherent error
resilience of a wide range of applications. We
introduce an approximation technique, called
Rollback-Free Value Prediction (RFVP). When certain
safe-to-approximate load operations miss in the cache,
RFVP predicts the requested values. However, RFVP does
not check for or recover from load-value
mispredictions, hence, avoiding the high cost of
pipeline flushes and re-executions. RFVP mitigates the
memory wall by enabling the execution to continue
without stalling for long-latency memory accesses. To
mitigate the bandwidth wall, RFVP drops a fraction of
load requests that miss in the cache after predicting
their values. Dropping requests reduces memory
bandwidth contention by removing them from the system.
The drop rate is a knob to control the trade-off
between performance/energy efficiency and output
quality. Our extensive evaluations show that RFVP, when
used in GPUs, yields significant performance
improvement and energy reduction for a wide range of
quality-loss levels. We also evaluate RFVP's latency
benefits for a single core CPU. The results show
performance improvement and energy reduction for a wide
variety of applications with less than 1\% loss in
quality.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "62",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lee:2016:SML,
author = "Donghyuk Lee and Saugata Ghose and Gennady Pekhimenko
and Samira Khan and Onur Mutlu",
title = "Simultaneous Multi-Layer Access: Improving
{$3$D}-Stacked Memory Bandwidth at Low Cost",
journal = j-TACO,
volume = "12",
number = "4",
pages = "63:1--63:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2832911",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "3D-stacked DRAM alleviates the limited memory
bandwidth bottleneck that exists in modern systems by
leveraging through silicon vias (TSVs) to deliver
higher external memory channel bandwidth. Today's
systems, however, cannot fully utilize the higher
bandwidth offered by TSVs, due to the limited internal
bandwidth within each layer of the 3D-stacked DRAM. We
identify that the bottleneck to enabling higher
bandwidth in 3D-stacked DRAM is now the global bitline
interface, the connection between the DRAM row buffer
and the peripheral IO circuits. The global bitline
interface consists of a limited and expensive set of
wires and structures, called global bitlines and global
sense amplifiers, whose high cost makes it difficult to
simply scale up the bandwidth of the interface within a
single DRAM layer in the 3D stack. We alleviate this
bandwidth bottleneck by exploiting the observation that
several global bitline interfaces already exist across
the multiple DRAM layers in current 3D-stacked designs,
but only a fraction of them are enabled at the same
time. We propose a new 3D-stacked DRAM architecture,
called Simultaneous Multi-Layer Access (SMLA), which
increases the internal DRAM bandwidth by accessing
multiple DRAM layers concurrently, thus making much
greater use of the bandwidth that the TSVs offer. To
avoid channel contention, the DRAM layers must
coordinate with each other when simultaneously
transferring data. We propose two approaches to
coordination, both of which deliver four times the
bandwidth for a four-layer DRAM, over a baseline that
accesses only one layer at a time. Our first approach,
Dedicated-IO, statically partitions the TSVs by
assigning each layer to a dedicated set of TSVs that
operate at a higher frequency. Unfortunately,
Dedicated-IO requires a nonuniform design for each
layer (increasing manufacturing costs), and its DRAM
energy consumption scales linearly with the number of
layers. Our second approach, Cascaded-IO, solves both
issues by instead time multiplexing all of the TSVs
across layers. Cascaded-IO reduces DRAM energy
consumption by lowering the operating frequency of
higher layers. Our evaluations show that SMLA provides
significant performance improvement and energy
reduction across a variety of workloads (55\%/18\% on
average for multiprogrammed workloads, respectively)
over a baseline 3D-stacked DRAM, with low overhead.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "63",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Na:2016:JPC,
author = "Yeoul Na and Seon Wook Kim and Youngsun Han",
title = "{JavaScript} Parallelizing Compiler for Exploiting
Parallelism from Data-Parallel {HTML5} Applications",
journal = j-TACO,
volume = "12",
number = "4",
pages = "64:1--64:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2846098",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With the advent of the HTML5 standard, JavaScript is
increasingly processing computationally intensive,
data-parallel workloads. Thus, the enhancement of
JavaScript performance has been emphasized because the
performance gap between JavaScript and native
applications is still substantial. Despite this
urgency, conventional JavaScript compilers do not
exploit much of parallelism even from data-parallel
JavaScript applications, despite contemporary mobile
devices being equipped with expensive parallel hardware
platforms, such as multicore processors and GPGPUs. In
this article, we propose an automatically parallelizing
JavaScript compiler that targets emerging,
data-parallel HTML5 applications by leveraging the
mature affine loop analysis of conventional static
compilers. We identify that the most critical issues
when parallelizing JavaScript with a conventional
static analysis are ensuring correct parallelization,
minimizing compilation overhead, and conducting
low-cost recovery when there is a speculation failure
during parallel execution. We propose a mechanism for
safely handling the failure at a low cost, based on
compiler techniques and the property of idempotence.
Our experiment shows that the proposed JavaScript
parallelizing compiler detects most affine parallel
loops. Also, we achieved a maximum speedup of 3.22
times on a quad-core system, while incurring negligible
compilation and recovery overheads with various sets of
data-parallel HTML5 applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "64",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Usui:2016:DDA,
author = "Hiroyuki Usui and Lavanya Subramanian and Kevin
Kai-Wei Chang and Onur Mutlu",
title = "{DASH}: Deadline-Aware High-Performance Memory
Scheduler for Heterogeneous Systems with Hardware
Accelerators",
journal = j-TACO,
volume = "12",
number = "4",
pages = "65:1--65:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2847255",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Modern SoCs integrate multiple CPU cores and hardware
accelerators (HWAs) that share the same main memory
system, causing interference among memory requests from
different agents. The result of this interference, if
it is not controlled well, is missed deadlines for HWAs
and low CPU performance. Few previous works have
tackled this problem. State-of-the-art mechanisms
designed for CPU-GPU systems strive to meet a target
frame rate for GPUs by prioritizing the GPU close to
the time when it has to complete a frame. We observe
two major problems when such an approach is adapted to
a heterogeneous CPU-HWA system. First, HWAs miss
deadlines because they are prioritized only when close
to their deadlines. Second, such an approach does not
consider the diverse memory access characteristics of
different applications running on CPUs and HWAs,
leading to low performance for latency-sensitive CPU
applications and deadline misses for some HWAs,
including GPUs. In this article, we propose a
Deadline-Aware memory Scheduler for Heterogeneous
systems (DASH), which overcomes these problems using
three key ideas, with the goal of meeting HWAs'
deadlines while providing high CPU performance. First,
DASH prioritizes an HWA when it is not on track to meet
its deadline any time during a deadline period, instead
of prioritizing it only when close to a deadline.
Second, DASH prioritizes HWAs over memory-intensive CPU
applications based on the observation that
memory-intensive applications' performance is not
sensitive to memory latency. Third, DASH treats
short-deadline HWAs differently as they are more likely
to miss their deadlines and schedules their requests
based on worst-case memory access time estimates.
Extensive evaluations across a wide variety of
different workloads and systems show that DASH achieves
significantly better CPU performance than the best
previous scheduler while always meeting the deadlines
for all HWAs, including GPUs, thereby largely improving
frame rates.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "65",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kafshdooz:2016:CTO,
author = "Morteza Mohajjel Kafshdooz and Mohammadkazem Taram and
Sepehr Assadi and Alireza Ejlali",
title = "A Compile-Time Optimization Method for {WCET}
Reduction in Real-Time Embedded Systems through Block
Formation",
journal = j-TACO,
volume = "12",
number = "4",
pages = "66:1--66:25",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2845083",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Compile-time optimizations play an important role in
the efficient design of real-time embedded systems.
Usually, compile-time optimizations are designed to
reduce average-case execution time (ACET). While ACET
is a main concern in high-performance computing
systems, in real-time embedded systems, concerns are
different and worst-case execution time (WCET) is much
more important than ACET. Therefore, WCET reduction is
more desirable than ACET reduction in many real-time
embedded systems. In this article, we propose a
compile-time optimization method aimed at reducing WCET
in real-time embedded systems. In the proposed method,
based on the predicated execution capability of
embedded processors, program code blocks that are in
the worst-case paths of the program are merged to
increase instruction-level parallelism and opportunity
for WCET reduction. The use of predicated execution
enables merging code blocks from different worst-case
paths that can be very effective in WCET reduction. The
experimental results show that the proposed method can
reduce WCET by up to 45\% as compared to previous
compile-time block formation methods. It is noteworthy
that compared to previous works, while the proposed
method usually achieves more WCET reduction, it has
considerably less negative impact on ACET and code
size.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "66",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Koukos:2016:BHU,
author = "Konstantinos Koukos and Alberto Ros and Erik Hagersten
and Stefanos Kaxiras",
title = "Building Heterogeneous {Unified Virtual Memories
(UVMs)} without the Overhead",
journal = j-TACO,
volume = "13",
number = "1",
pages = "1:1--1:22",
month = apr,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2889488",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Apr 5 16:27:36 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This work proposes a novel scheme to facilitate
heterogeneous systems with unified virtual memory.
Research proposals implement coherence protocols for
sequential consistency (SC) between central processing
unit (CPU) cores and between devices. Such mechanisms
introduce severe bottlenecks in the system; therefore,
we adopt the heterogeneous-race-free (HRF) memory
model. The use of HRF simplifies the coherency protocol
and the graphics processing unit (GPU) memory
management unit (MMU). Our protocol optimizes CPU and
GPU demands separately, with the GPU part being simpler
while the CPU is more elaborate and latency aware. We
achieve an average 45\% speedup and 45\% electronic
data processing reduction (20\% energy) over the
corresponding SC implementation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2016:DMB,
author = "Zhigang Wang and Xiaolin Wang and Fang Hou and Yingwei
Luo and Zhenlin Wang",
title = "Dynamic Memory Balancing for Virtualization",
journal = j-TACO,
volume = "13",
number = "1",
pages = "2:1--2:??",
month = apr,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2851501",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Apr 5 16:27:36 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "Allocating memory dynamically for virtual machines
(VMs) according to their demands provides significant
benefits as well as great challenges. Efficient memory
resource management requires knowledge of the memory
demands of applications or systems at runtime. A widely
proposed approach is to construct a miss ratio curve
(MRC) for a VM, which not only summarizes the current
working set size (WSS) of the VM but also models the
relationship between its performance and the target
memory allocation size. Unfortunately, the cost of
monitoring and maintaining the MRC structures is
nontrivial. This article first introduces a low-cost
WSS tracking system with effective optimizations on
data structures, as well as an efficient mechanism to
decrease the frequency of monitoring. We also propose a
Memory Balancer (MEB), which dynamically reallocates
guest memory based on the predicted WSS. Our
experimental results show that our prediction schemes
yield a high accuracy of 95.2\% and low overhead of
2\%. Furthermore, the overall system throughput can be
significantly improved with MEB, which brings a speedup
up to 7.4 for two to four VMs and 4.54 for an
overcommitted system with 16 VMs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2016:HPC,
author = "Xueyang Wang and Sek Chai and Michael Isnardi and
Sehoon Lim and Ramesh Karri",
title = "Hardware Performance Counter-Based Malware
Identification and Detection with Adaptive Compressive
Sensing",
journal = j-TACO,
volume = "13",
number = "1",
pages = "3:1--3:??",
month = apr,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2857055",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Apr 5 16:27:36 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Hardware Performance Counter-based (HPC) runtime
checking is an effective way to identify malicious
behaviors of malware and detect malicious modifications
to a legitimate program's control flow. To reduce the
overhead in the monitored system which has limited
storage and computing resources, we present a
``sample-locally-analyze-remotely'' technique. The
sampled HPC data are sent to a remote server for
further analysis. To minimize the I/O bandwidth
required for transmission, the fine-grained HPC
profiles are compressed into much smaller vectors with
Compressive Sensing. The experimental results
demonstrate an 80\% I/O bandwidth reduction after
applying Compressive Sensing, without compromising the
detection and identification capabilities.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Akram:2016:BPG,
author = "Shoaib Akram and Jennifer B. Sartor and Kenzo {Van
Craeynest} and Wim Heirman and Lieven Eeckhout",
title = "Boosting the Priority of Garbage: Scheduling
Collection on Heterogeneous Multicore Processors",
journal = j-TACO,
volume = "13",
number = "1",
pages = "4:1--4:??",
month = apr,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2875424",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Apr 5 16:27:36 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "While hardware is evolving toward heterogeneous
multicore architectures, modern software applications
are increasingly written in managed languages.
Heterogeneity was born of a need to improve energy
efficiency; however, we want the performance of our
applications not to suffer from limited resources. How
best to schedule managed language applications on a mix
of big, out-of-order cores and small, in-order cores is
an open question, complicated by the host of service
threads that perform key tasks such as memory
management. These service threads compete with the
application for core and memory resources, and garbage
collection (GC) must sometimes suspend the application
if there is not enough memory available for allocation.
In this article, we explore concurrent garbage
collection's behavior, particularly when it becomes
critical, and how to schedule it on a heterogeneous
system to optimize application performance. While some
applications see no difference in performance when GC
threads are run on big versus small cores, others ---
those with GC criticality --- see up to an 18\%
performance improvement. We develop a new, adaptive
scheduling algorithm that responds to GC criticality
signals from the managed runtime, giving more big-core
cycles to the concurrent collector when it is under
pressure and in danger of suspending the application.
Our experimental results show that our
GC-criticality-aware scheduler is robust across a range
of heterogeneous architectures with different core
counts and frequency scaling and across heap sizes. Our
algorithm is performance and energy neutral for
GC-uncritical Java applications and significantly
speeds up GC-critical applications by 16\%, on average,
while being 20\% more energy efficient for a
heterogeneous multicore with three big cores and one
small core.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yilmaz:2016:ARS,
author = "Buse Yilmaz and Baris Aktemur and Mar{\'\i}A J.
Garzar{\'a}n and Sam Kamin and Furkan Kira{\c{c}}",
title = "Autotuning Runtime Specialization for Sparse
Matrix-Vector Multiplication",
journal = j-TACO,
volume = "13",
number = "1",
pages = "5:1--5:??",
month = apr,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2851500",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Apr 5 16:27:36 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Runtime specialization is used for optimizing programs
based on partial information available only at runtime.
In this paper we apply autotuning on runtime
specialization of Sparse Matrix-Vector Multiplication
to predict a best specialization method among several.
In 91\% to 96\% of the predictions, either the best or
the second-best method is chosen. Predictions achieve
average speedups that are very close to the speedups
achievable when only the best methods are used. By
using an efficient code generator and a carefully
designed set of matrix features, we show the runtime
costs can be amortized to bring performance benefits
for many real-world cases.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhou:2016:ERI,
author = "Mingzhou Zhou and Bo Wu and Xipeng Shen and Yaoqing
Gao and Graham Yiu",
title = "Examining and Reducing the Influence of Sampling
Errors on Feedback-Driven Optimizations",
journal = j-TACO,
volume = "13",
number = "1",
pages = "6:1--6:??",
month = apr,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2851502",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Apr 5 16:27:36 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Feedback-driven optimization (FDO) is an important
component in mainstream compilers. By allowing the
compiler to reoptimize the program based on some
profiles of the program's dynamic behaviors, it often
enhances the quality of the generated code
substantially. A barrier for using FDO is that it often
requires many training runs to collect enough profiles
to amortize the sensitivity of program optimizations to
program input changes. Various sampling techniques have
been explored to alleviate this time-consuming process.
However, the lowered profile accuracy caused by
sampling often hurts the benefits of FDO. This article
gives the first systematic study in how sampling rates
affect the accuracy of collected profiles and how the
accuracy correlates with the usefulness of the profile
for modern FDO. Studying basic block and edge profiles
for FDO in two mature compilers reveals several
counterintuitive observations, one of which is that
profiling accuracy does not strongly correlate with the
benefits of the FDO. A detailed analysis identifies
three types of sampling-caused errors that critically
impair the quality of the profiles for FDO. It then
introduces a simple way to rectify profiles based on
the findings. Experiments demonstrate that the simple
rectification fixes most of those critical errors in
sampled profiles and significantly enhances the
effectiveness of FDO.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Dantras:2016:OIB,
author = "Amanieu D'antras and Cosmin Gorgovan and Jim Garside
and Mikel Luj{\'a}n",
title = "Optimizing Indirect Branches in Dynamic Binary
Translators",
journal = j-TACO,
volume = "13",
number = "1",
pages = "7:1--7:??",
month = apr,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2866573",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Apr 5 16:27:36 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Dynamic binary translation is a technology for
transparently translating and modifying a program at
the machine code level as it is running. A significant
factor in the performance of a dynamic binary
translator is its handling of indirect branches. Unlike
direct branches, which have a known target at
translation time, an indirect branch requires
translating a source program counter address to a
translated program counter address every time the
branch is executed. This translation can impose a
serious runtime penalty if it is not handled
efficiently. MAMBO-X64, a dynamic binary translator
that translates 32-bit ARM (AArch32) code to 64-bit ARM
(AArch64) code, uses three novel techniques to improve
the performance of indirect branch translation.
Together, these techniques allow MAMBO-X64 to achieve a
very low performance overhead of only 10\% on average
compared to native execution of 32-bit programs.
Hardware-assisted function returns use a software
return address stack to predict the targets of function
returns, making use of several novel optimizations
while also exploiting hardware return address
prediction. This technique has a significant impact on
most benchmarks, reducing binary translation overhead
compared to native execution by 40\% on average and by
90\% on some benchmarks. Branch table inference, an
algorithm for detecting and translating branch tables,
can reduce the overhead of translated code by up to
40\% on some SPEC CPU2006 benchmarks. The remaining
indirect branches are handled using a fast atomic hash
table, which is optimized to work with multiple
threads. This last technique translates indirect
branches using a single shared hash table while
avoiding expensive synchronization in
performance-critical lookup code. This allows the
performance to be on par with thread-private hash
tables while having superior memory scalability.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Martins:2016:CBS,
author = "Luiz G. A. Martins and Ricardo Nobre and Jo{\~a}o M.
P. Cardoso and Alexandre C. B. Delbem and Eduardo
Marques",
title = "Clustering-Based Selection for the Exploration of
Compiler Optimization Sequences",
journal = j-TACO,
volume = "13",
number = "1",
pages = "8:1--8:??",
month = apr,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2883614",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Apr 5 16:27:36 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A large number of compiler optimizations are nowadays
available to users. These optimizations interact with
each other and with the input code in several and
complex ways. The sequence of application of
optimization passes can have a significant impact on
the performance achieved. The effect of the
optimizations is both platform and application
dependent. The exhaustive exploration of all viable
sequences of compiler optimizations for a given code
fragment is not feasible. As this exploration is a
complex and time-consuming task, several researchers
have focused on Design Space Exploration (DSE)
strategies both to select optimization sequences to
improve the performance of each function of the
application and to reduce the exploration time. In this
article, we present a DSE scheme based on a clustering
approach for grouping functions with similarities and
exploration of a reduced search space resulting from
the combination of optimizations previously suggested
for the functions in each group. The identification of
similarities between functions uses a data mining
method that is applied to a symbolic code
representation. The data mining process combines three
algorithms to generate clusters: the Normalized
Compression Distance, the Neighbor Joining, and a new
ambiguity-based clustering algorithm. Our experiments
for evaluating the effectiveness of the proposed
approach address the exploration of optimization
sequences in the context of the ReflectC compiler,
considering 49 compilation passes while targeting a
Xilinx MicroBlaze processor, and aiming at performance
improvements for 51 functions and four applications.
Experimental results reveal that the use of our
clustering-based DSE approach achieves a significant
reduction in the total exploration time of the search
space ($ 20 \times $ over a Genetic Algorithm approach)
at the same time that considerable performance speedups
(41\% over the baseline) were obtained using the
optimized codes. Additional experiments were performed
considering the LLVM compiler, considering 124
compilation passes, and targeting a LEON3 processor.
The results show that our approach achieved geometric
mean speedups of $ 1.49 \times $, $ 1.32 \times $, and
$ 1.24 \times $ for the best 10, 20, and 30 functions,
respectively, and a global improvement of 7\% over the
performance obtained when compiling with -O2.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Do:2016:PEH,
author = "Sang Wook Stephen Do and Michel Dubois",
title = "Power Efficient Hardware Transactional Memory: Dynamic
Issue of Transactions",
journal = j-TACO,
volume = "13",
number = "1",
pages = "9:1--9:??",
month = apr,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2875425",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Apr 5 16:27:36 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Transactional Memory (TM) is no longer just an
academic interest as industry has started to adopt the
idea in its commercial products. In this paper, we
propose Dynamic Transaction Issue (DTI), a new scheme
that can be easily implemented on top of existing
Hardware TM (HTM) systems, provided additional
messages. Instead of wasting power and energy in
transaction aborts, Dynamic Transaction Issue puts a
processor core into a low-power state when there is a
reasonable suspicion that the current transaction
running on it will be aborted soon in the future. We
have implemented Dynamic Transaction Issue on a
cycle-accurate simulator of a multicore processor
system with out-of-order superscalar cores, augmented
with a power package and a TM package which add
accurate dynamic power estimates and a TM framework to
the simulator. Our simulation results show that Dynamic
Transaction Issue can achieve energy savings up to 37\%
from the energy consumption of a base machine with no
mechanism to suppress useless aborts. We also compare
Dynamic Transaction Issue with various alternative
hardware TM mechanisms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Evtyushkin:2016:UMC,
author = "Dmitry Evtyushkin and Dmitry Ponomarev and Nael
Abu-Ghazaleh",
title = "Understanding and Mitigating Covert Channels Through
Branch Predictors",
journal = j-TACO,
volume = "13",
number = "1",
pages = "10:1--10:??",
month = apr,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2870636",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Apr 5 16:27:36 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Covert channels through shared processor resources
provide secret communication between two malicious
processes: the trojan and the spy. In this article, we
classify, analyze, and compare covert channels through
dynamic branch prediction units in modern processors.
Through experiments on a real hardware platform, we
compare contention-based channel and the channel that
is based on exploiting the branch predictor's residual
state. We analyze these channels in SMT and
single-threaded environments under both clean and noisy
conditions. Our results show that the residual
state-based channel provides a cleaner signal and is
effective even in noisy execution environments with
another application sharing the same physical core with
the trojan and the spy. We also estimate the capacity
of the branch predictor covert channels and describe a
software-only mitigation technique that is based on
randomizing the state of the predictor tables on
context switches. We show that this protection
eliminates all covert channels through the branch
prediction unit with minimal impact on performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhou:2016:CAE,
author = "Hao Zhou and Jingling Xue",
title = "A Compiler Approach for Exploiting Partial {SIMD}
Parallelism",
journal = j-TACO,
volume = "13",
number = "1",
pages = "11:1--11:??",
month = apr,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2886101",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Apr 5 16:27:36 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Existing vectorization techniques are ineffective for
loops that exhibit little loop-level parallelism but
some limited superword-level parallelism (SLP). We show
that effectively vectorizing such loops requires
partial vector operations to be executed correctly and
efficiently, where the degree of partial SIMD
parallelism is smaller than the SIMD datapath width. We
present a simple yet effective SLP compiler technique
called P aver (PArtial VEctorizeR), formulated and
implemented in LLVM as a generalization of the
traditional SLP algorithm, to optimize such partially
vectorizable loops. The key idea is to maximize SIMD
utilization by widening vector instructions used while
minimizing the overheads caused by memory access,
packing/unpacking, and/or masking operations, without
introducing new memory errors or new numeric
exceptions. For a set of 9 C/C++/Fortran applications
with partial SIMD parallelism, Paver achieves
significantly better kernel and whole-program speedups
than LLVM on both Intel's AVX and ARM's NEON.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{VanDenBraak:2016:RGR,
author = "Gert-Jan {Van Den Braak} and Henk Corporaal",
title = "{R-GPU}: a Reconfigurable {GPU} Architecture",
journal = j-TACO,
volume = "13",
number = "1",
pages = "12:1--12:??",
month = apr,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2890506",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Apr 5 16:27:36 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Over the last decade, Graphics Processing Unit (GPU)
architectures have evolved from a fixed-function
graphics pipeline to a programmable, energy-efficient
compute accelerator for massively parallel
applications. The compute power arises from the GPU's
Single Instruction/Multiple Threads architecture:
concurrently running many threads and executing them as
Single Instruction/Multiple Data--style vectors.
However, compute power is still lost due to cycles
spent on data movement and control instructions instead
of data computations. Even more cycles are lost on
pipeline stalls resulting from long latency (memory)
operations. To improve not only performance but also
energy efficiency, we introduce R-GPU: a reconfigurable
GPU architecture with communicating cores. R-GPU is an
addition to a GPU, which can still be used as such, but
also has the ability to reorganize the cores of a GPU
in a reconfigurable network. In R-GPU data movement and
control is implicit in the configuration of the
network. Each core executes a fixed instruction,
reducing instruction decode count and increasing energy
efficiency. On a number of benchmarks we show an
average performance improvement of $ 2.1 \times $ over
the same GPU without modifications. We further make a
conservative power estimation of R-GPU which shows that
power consumption can be reduced by 6\%, leading to an
energy consumption reduction of 55\%, while area only
increases by a mere 4\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Liu:2016:TAA,
author = "Peng Liu and Jiyang Yu and Michael C. Huang",
title = "Thread-Aware Adaptive Prefetcher on Multicore Systems:
Improving the Performance for Multithreaded Workloads",
journal = j-TACO,
volume = "13",
number = "1",
pages = "13:1--13:??",
month = apr,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2890505",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Apr 5 16:27:36 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Most processors employ hardware data prefetching
techniques to hide memory access latencies. However,
the prefetching requests from different threads on a
multicore processor can cause severe interference with
prefetching and/or demand requests of others. The data
prefetching can lead to significant performance
degradation due to shared resource contention on shared
memory multicore systems. This article proposes a
thread-aware data prefetching mechanism based on
low-overhead runtime information to tune prefetching
modes and aggressiveness, mitigating the resource
contention in the memory system. Our solution has three
new components: (1) a self-tuning prefetcher that uses
runtime feedback to dynamically adjust data prefetching
modes and arguments of each thread, (2) a filtering
mechanism that informs the hardware about which
prefetching request can cause shared data invalidation
and should be discarded, and (3) a limiter thread
acceleration mechanism to estimate and accelerate the
critical thread which has the longest completion time
in the parallel region of execution. On a set of
multithreaded parallel benchmarks, our thread-aware
data prefetching mechanism improves the overall
performance of 64-core system by 13\% over a multimode
prefetch baseline system with two-level cache
organization and conventional modified, exclusive,
shared, and invalid-based directory coherence protocol.
We compare our approach with the feedback directed
prefetching technique and find that it provides 9\%
performance improvement on multicore systems, while
saving the memory bandwidth consumption.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gorgovan:2016:MLO,
author = "Cosmin Gorgovan and Amanieu D'antras and Mikel
Luj{\'a}n",
title = "{MAMBO}: a Low-Overhead Dynamic Binary Modification
Tool for {ARM}",
journal = j-TACO,
volume = "13",
number = "1",
pages = "14:1--14:??",
month = apr,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2896451",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Apr 5 16:27:36 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As the ARM architecture expands beyond its traditional
embedded domain, there is a growing interest in dynamic
binary modification (DBM) tools for general-purpose
multicore processors that are part of the ARM family.
Existing DBM tools for ARM suffer from introducing
large overheads in the execution of applications. The
specific questions that this article addresses are (i)
how to develop such DBM tools for the ARM architecture
and (ii) whether new optimisations are plausible and
needed. We describe the general design of MAMBO, a new
DBM tool for ARM, which we release together with this
publication, and introduce novel optimisations to
handle indirect branches. In addition, we explore
scenarios in which it may be possible to relax the
transparency offered by DBM tools to allow extra
optimisations to be applied. These scenarios arise from
analysing the most typical usages: for example,
application binaries without handcrafted assembly. The
performance evaluation shows that MAMBO introduces
small overheads for SPEC CPU2006 and PARSEC 3.0 when
comparing with the execution times of the unmodified
programs: a geometric mean overhead of 28\% on a
Cortex-A9 and of 34\% on a Cortex-A15 for CPU2006, and
between 27\% and 32\%, depending on the number of
threads, for PARSEC on a Cortex-A15.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Theocharis:2016:BSC,
author = "Panagiotis Theocharis and Bjorn {De Sutter}",
title = "A Bimodal Scheduler for Coarse-Grained Reconfigurable
Arrays",
journal = j-TACO,
volume = "13",
number = "2",
pages = "15:1--15:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2893475",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 27 16:18:10 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Compilers for Course-Grained Reconfigurable Array
(CGRA) architectures suffer from long compilation times
and code quality levels far below the theoretical upper
bounds. This article presents a new scheduler, called
the Bimodal Modulo Scheduler (BMS), to map inner loops
onto (heterogeneous) CGRAs of the Architecture for
Dynamically Reconfigurable Embedded Systems (ADRES)
family. BMS significantly outperforms existing
schedulers for similar architectures in terms of
generated code quality and compilation time. This is
achieved by combining new schemes for backtracking with
extended and adapted forms of priority functions and
cost functions, as described in the article. BMS is
evaluated by mapping multimedia and software-defined
radio benchmarks onto tuned ADRES instances.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Anbar:2016:EHL,
author = "Ahmad Anbar and Olivier Serres and Engin Kayraklioglu
and Abdel-Hameed A. Badawy and Tarek El-Ghazawi",
title = "Exploiting Hierarchical Locality in Deep Parallel
Architectures",
journal = j-TACO,
volume = "13",
number = "2",
pages = "16:1--16:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2897783",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 27 16:18:10 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Parallel computers are becoming deeply hierarchical.
Locality-aware programming models allow programmers to
control locality at one level through establishing
affinity between data and executing activities. This,
however, does not enable locality exploitation at other
levels. Therefore, we must conceive an efficient
abstraction of hierarchical locality and develop
techniques to exploit it. Techniques applied directly
by programmers, beyond the first level, burden the
programmer and hinder productivity. In this article, we
propose the Parallel Hierarchical Locality Abstraction
Model for Execution (PHLAME). PHLAME is an execution
model to abstract and exploit machine hierarchical
properties through locality-aware programming and a
runtime that takes into account machine
characteristics, as well as a data sharing and
communication profile of the underlying application.
This article presents and experiments with concepts and
techniques that can drive such runtime system in
support of PHLAME. Our experiments show that our
techniques scale up and achieve performance gains of up
to 88\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gonzalez-alvarez:2016:MEF,
author = "Cecilia Gonz{\'a}lez-{\'a}lvarez and Jennifer B.
Sartor and Carlos {\'A}lvarez and Daniel
Jim{\'e}nez-Gonz{\'a}lez and Lieven Eeckhout",
title = "{MInGLE}: an Efficient Framework for Domain
Acceleration Using Low-Power Specialized Functional
Units",
journal = j-TACO,
volume = "13",
number = "2",
pages = "17:1--17:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2898356",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 27 16:18:10 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The end of Dennard scaling leads to new research
directions that try to cope with the utilization wall
in modern chips, such as the design of specialized
architectures. Processor customization utilizes
transistors more efficiently, optimizing not only for
performance but also for power. However, hardware
specialization for each application is costly and
impractical due to time-to-market constraints.
Domain-specific specialization is an alternative that
can increase hardware reutilization across applications
that share similar computations. This article explores
the specialization of low-power processors with custom
instructions (CIs) that run on a specialized functional
unit. We are the first, to our knowledge, to design CIs
for an application domain and across basic blocks,
selecting CIs that maximize both performance and energy
efficiency improvements. We present the Merged
Instructions Generator for Large Efficiency (MInGLE),
an automated framework that identifies and selects CIs.
Our framework analyzes large sequences of code (across
basic blocks) to maximize acceleration potential while
also performing partial matching across applications to
optimize for reuse of the specialized hardware. To do
this, we convert the code into a new canonical
representation, the Merging Diagram, which represents
the code's functionality instead of its structure. This
is key to being able to find similarities across such
large code sequences from different applications with
different coding styles. Groups of potential CIs are
clustered depending on their similarity score to
effectively reduce the search space. Additionally, we
create new CIs that cover not only whole-body loops but
also fragments of the code to optimize hardware
reutilization further. For a set of 11 applications
from the media domain, our framework generates CIs that
significantly improve the energy-delay product (EDP)
and performance speedup. CIs with the highest
utilization opportunities achieve an average EDP
improvement of 3.8 $ \times $ compared to a baseline
processor modeled after an Intel Atom. We demonstrate
that we can efficiently accelerate a domain with
partially matched CIs, and that their design time, from
identification to selection, stays within tractable
bounds.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Andreetta:2016:FPF,
author = "Christian Andreetta and Vivien B{\'e}got and Jost
Berthold and Martin Elsman and Fritz Henglein and
Troels Henriksen and Maj-Britt Nordfang and Cosmin E.
Oancea",
title = "{FinPar}: a Parallel Financial Benchmark",
journal = j-TACO,
volume = "13",
number = "2",
pages = "18:1--18:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2898354",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 27 16:18:10 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Commodity many-core hardware is now mainstream, but
parallel programming models are still lagging behind in
efficiently utilizing the application parallelism.
There are (at least) two principal reasons for this.
First, real-world programs often take the form of a
deeply nested composition of parallel operators, but
mapping the available parallelism to the hardware
requires a set of transformations that are tedious to
do by hand and beyond the capability of the common
user. Second, the best optimization strategy, such as
what to parallelize and what to efficiently
sequentialize, is often sensitive to the input dataset
and therefore requires multiple code versions that are
optimized differently, which also raises
maintainability problems. This article presents three
array-based applications from the financial domain that
are suitable for gpgpu execution. Common
benchmark-design practice has been to provide the same
code for the sequential and parallel versions that are
optimized for only one class of datasets. In
comparison, we document (1) all available parallelism
via nested map-reduce functional combinators, in a
simple Haskell implementation that closely resembles
the original code structure, (2) the invariants and
code transformations that govern the main trade-offs of
a data-sensitive optimization space, and (3) report
target cpu and multiversion gpgpu code together with an
evaluation that demonstrates optimization trade-offs
and other difficulties. We believe that this work
provides useful insight into the language constructs
and compiler infrastructure capable of expressing and
optimizing such applications, and we report in-progress
work in this direction.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Dardaillon:2016:NCF,
author = "Micka{\"e}l Dardaillon and Kevin Marquet and Tanguy
Risset and J{\'e}r{\^o}me Martin and Henri-Pierre
Charles",
title = "A New Compilation Flow for Software-Defined Radio
Applications on Heterogeneous {MPSoCs}",
journal = j-TACO,
volume = "13",
number = "2",
pages = "19:1--19:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2910583",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 27 16:18:10 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The advent of portable software-defined radio ( sdr)
technology is tightly linked to the resolution of a
difficult problem: efficient compilation of signal
processing applications on embedded computing devices.
Modern wireless communication protocols use packet
processing rather than infinite stream processing and
also introduce dependencies between data value and
computation behavior leading to dynamic dataflow
behavior. Recently, parametric dataflow has been
proposed to support dynamicity while maintaining the
high level of analyzability needed for efficient
real-life implementations of signal processing
computations. This article presents a new compilation
flow that is able to compile parametric dataflow
graphs. Built on the llvm compiler infrastructure, the
compiler offers an actor-based C++ programming model to
describe parametric graphs, a compilation front end for
graph analysis, and a back end that currently matches
the Magali platform: a prototype heterogeneous MPSoC
dedicated to LTE-Advanced. We also introduce an
innovative scheduling technique, called
microscheduling, allowing one to adapt the mapping of
parametric dataflow programs to the specificities of
the different possible MPSoCs targeted. A specific
focus on fifo sizing on the target architecture is
presented. The experimental results show compilation of
3gpp lte-advanced demodulation on Magali with tight
memory size constraints. The compiled programs achieve
performance similar to handwritten code.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Liao:2016:DPM,
author = "Jianwei Liao and Fran{\c{c}}ois Trahay and Guoqiang
Xiao",
title = "Dynamic Process Migration Based on Block Access
Patterns Occurring in Storage Servers",
journal = j-TACO,
volume = "13",
number = "2",
pages = "20:1--20:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2899002",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 27 16:18:10 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "An emerging trend in developing large and complex
applications on today's high-performance computers is
to couple independent components into a comprehensive
application. The components may employ the global file
system to exchange their data when executing the
application. In order to reduce the time required for
input/output (I/O) data exchange and data transfer in
the coupled systems or other applications, this article
proposes a dynamic process migration mechanism on the
basis of block access pattern similarity for utilizing
the local file cache to exchange the data. We first
introduce the scheme of the block access counting
diagram to profile the process access pattern during a
time period on the storage server. Next, we propose an
algorithm that compares the access patterns of
processes running on different computing nodes. Last,
processes are migrated in order to group processes with
similar access patterns. Consequently, the processes on
the computing node can exchange their data by accessing
the local file cache, instead of the global file
system. The experimental results show that the proposed
process migration mechanism can reduce the execution
time required by the application because of the shorter
I/O time, as well as yield attractive I/O throughput.
In summary, this dynamic process migration technique
can work fairly well for distributed applications whose
data dependency rely on distributed file systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ashouri:2016:CCA,
author = "Amir Hossein Ashouri and Giovanni Mariani and Gianluca
Palermo and Eunjung Park and John Cavazos and Cristina
Silvano",
title = "{COBAYN}: Compiler Autotuning Framework Using
{Bayesian} Networks",
journal = j-TACO,
volume = "13",
number = "2",
pages = "21:1--21:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2928270",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 27 16:18:10 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The variety of today's architectures forces
programmers to spend a great deal of time porting and
tuning application codes across different platforms.
Compilers themselves need additional tuning, which has
considerable complexity as the standard optimization
levels, usually designed for the average case and the
specific target architecture, often fail to bring the
best results. This article proposes COBAYN: Compiler
autotuning framework using BAYesian Networks, an
approach for a compiler autotuning methodology using
machine learning to speed up application performance
and to reduce the cost of the compiler optimization
phases. The proposed framework is based on the
application characterization done dynamically by using
independent microarchitecture features and Bayesian
networks. The article also presents an evaluation based
on using static analysis and hybrid feature collection
approaches. In addition, the article compares Bayesian
networks with respect to several state-of-the-art
machine-learning models. Experiments were carried out
on an ARM embedded platform and GCC compiler by
considering two benchmark suites with 39 applications.
The set of compiler configurations, selected by the
model (less than 7\% of the search space), demonstrated
an application performance speedup of up to 4.6 $
\times $ on Polybench (1.85 $ \times $ on average) and
3.1 $ \times $ on cBench (1.54 $ \times $ on average)
with respect to standard optimization levels. Moreover,
the comparison of the proposed technique with (i)
random iterative compilation, (ii) machine
learning--based iterative compilation, and (iii)
noniterative predictive modeling techniques shows, on
average, 1.2 $ \times $ , 1.37 $ \times $ , and 1.48 $
\times $ speedup, respectively. Finally, the proposed
method demonstrates 4 $ \times $ and 3 $ \times $
speedup, respectively, on cBench and Polybench in terms
of exploration efficiency given the same quality of the
solutions generated by the random iterative compilation
model.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chrysanthou:2016:ORT,
author = "Kypros Chrysanthou and Panayiotis Englezakis and
Andreas Prodromou and Andreas Panteli and Chrysostomos
Nicopoulos and Yiannakis Sazeides and Giorgos
Dimitrakopoulos",
title = "An Online and Real-Time Fault Detection and
Localization Mechanism for Network-on-Chip
Architectures",
journal = j-TACO,
volume = "13",
number = "2",
pages = "22:1--22:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2930670",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 27 16:18:10 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Networks-on-Chip (NoC) are becoming increasingly
susceptible to emerging reliability threats. The need
to detect and localize the occurrence of faults at
runtime is steadily becoming imperative. In this work,
we propose NoCAlert, a comprehensive online and
real-time fault detection and localization mechanism
that demonstrates 0\% false negatives within the
interconnect for the fault models and stimulus set used
in this study. Based on the concept of invariance
checking, NoCAlert employs a group of lightweight
microchecker modules that collectively implement
real-time hardware assertions. The checkers operate
concurrently with normal NoC operation, thus
eliminating the need for periodic, or triggered-based,
self-testing. Based on the pattern/signature of
asserted checkers, NoCAlert can pinpoint the location
of the fault at various granularity levels. Most
important, 97\% of the transient and 90\% of the
permanent faults are detected instantaneously, within a
single clock cycle upon fault manifestation. The fault
localization accuracy ranges from 90\% to 100\%,
depending on the desired localization granularity.
Extensive cycle-accurate simulations in a 64-node CMP
and analysis at the RTL netlist-level demonstrate the
efficacy of the proposed technique.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "22",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mehta:2016:VL,
author = "Sanyam Mehta and Pen-Chung Yew",
title = "Variable Liberalization",
journal = j-TACO,
volume = "13",
number = "3",
pages = "23:1--23:??",
month = sep,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2963101",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 17 16:20:58 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In the wake of the current trend of increasing the
number of cores on a chip, compiler optimizations for
improving the memory performance have assumed increased
importance. Loop fusion is one such key optimization
that can alleviate memory and bandwidth wall and thus
improve parallel performance. However, we find that
loop fusion in interesting memory-intensive
applications is prevented by the existence of
dependences between temporary variables that appear in
different loop nests. Furthermore, known techniques of
allowing useful transformations in the presence of
temporary variables, such as privatization and
expansion, prove insufficient in such cases. In this
work, we introduce variable liberalization, a technique
that selectively removes dependences on temporary
variables in different loop nests to achieve loop
fusion while preserving the semantical correctness of
the optimized program. This removal of extra-stringent
dependences effectively amounts to variable expansion,
thus achieving the benefit of an increased degree of
freedom for program transformation but without an
actual expansion. Hence, there is no corresponding
increase in the memory footprint incurred. We implement
liberalization in the Pluto polyhedral compiler and
evaluate its performance on nine hot regions in five
real applications. Results demonstrate parallel
performance improvement of 1.92 $ \times $ over the
Intel compiler, averaged over the nine hot regions, and
an overall improvement of as much as 2.17 $ \times $
for an entire application, on an eight-core Intel Xeon
processor.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2016:RER,
author = "Hsing-Min Chen and Carole-Jean Wu and Trevor Mudge and
Chaitali Chakrabarti",
title = "{RATT-ECC}: Rate Adaptive Two-Tiered Error Correction
Codes for Reliable {$3$D} Die-Stacked Memory",
journal = j-TACO,
volume = "13",
number = "3",
pages = "24:1--24:??",
month = sep,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2957758",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 17 16:20:58 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article proposes a rate-adaptive, two-tiered
error-correction scheme (RATT-ECC) that provides strong
reliability (10$^{10}$ x reduction in raw FIT rate) for
an HBM-like 3D DRAM system. The tier-1 code is a strong
symbol-based code that can correct errors due to small
granularity faults and detect errors caused by large
granularity faults; the tier-2 code is an XOR-based
code that corrects errors detected by the tier-1 code.
The rate-adaptive feature of RATT-ECC enables permanent
bank failures to be handled through sparing. It can
also be used to significantly reduce the refresh power
consumption without decreasing reliability and timing
performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2016:IDO,
author = "Wenjie Chen and Zhibin Wang and Qin Wu and Jiuzhen
Liang and Zhilei Chai",
title = "Implementing Dense Optical Flow Computation on a
Heterogeneous {FPGA SoC} in {C}",
journal = j-TACO,
volume = "13",
number = "3",
pages = "25:1--25:??",
month = sep,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2948976",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 17 16:20:58 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "High-quality optical flow computation algorithms are
computationally intensive. The low computational speed
of such algorithms causes difficulties for real-world
applications. In this article, we propose an optimized
implementation of the classical
Combine-Brightness-Gradient (CBG) model on the Xilinx
ZYNQ FPGA-SoC, by taking advantage of the inherent
algorithmic parallelism and ZYNQ architecture. The
execution time decreases to 0.82 second with a lower
power consumption (1.881W). It is better than software
implementation on PC (Intel i7-3520M, 2.9GHz), which
costs 2.635 seconds and 35W. We use C rather than HDLs
to describe the algorithm for rapid prototyping.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "25",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Vaish:2016:OMT,
author = "Nilay Vaish and Michael C. Ferris and David A. Wood",
title = "Optimization Models for Three On-Chip Network
Problems",
journal = j-TACO,
volume = "13",
number = "3",
pages = "26:1--26:??",
month = sep,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2943781",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 17 16:20:58 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We model three on-chip network design problems-memory
controller placement, resource allocation in
heterogeneous on-chip networks, and their
combination-as mathematical optimization problems. We
model the first two problems as mixed integer linear
programs. We model the third problem as a mixed integer
nonlinear program, which we then linearize exactly.
Sophisticated optimization algorithms enable solutions
to be obtained much more efficiently. Detailed
simulations using synthetic traffic and benchmark
applications validate that our designs provide better
performance than solutions proposed previously. Our
work provides further evidence toward suitability of
optimization models in searching/pruning architectural
design space.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "26",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sardashti:2016:YAC,
author = "Somayeh Sardashti and Andre Seznec and David A. Wood",
title = "Yet Another Compressed Cache: a Low-Cost Yet Effective
Compressed Cache",
journal = j-TACO,
volume = "13",
number = "3",
pages = "27:1--27:??",
month = sep,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2976740",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 17 16:20:58 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Cache memories play a critical role in bridging the
latency, bandwidth, and energy gaps between cores and
off-chip memory. However, caches frequently consume a
significant fraction of a multicore chip's area and
thus account for a significant fraction of its cost.
Compression has the potential to improve the effective
capacity of a cache, providing the performance and
energy benefits of a larger cache while using less
area. The design of a compressed cache must address two
important issues: (i) a low-latency, low-overhead
compression algorithm that can represent a fixed-size
cache block using fewer bits and (ii) a cache
organization that can efficiently store the resulting
variable-size compressed blocks. This article focuses
on the latter issue. Here, we propose Yet Another
Compressed Cache (YACC), a new compressed cache design
that targets improving effective cache capacity with a
simple design. YACC uses super-blocks to reduce tag
overheads while packing variable-size compressed blocks
to reduce internal fragmentation. YACC achieves the
benefits of two state-of-the art compressed
caches-Decoupled Compressed Cache (DCC) [Sardashti and
Wood 2013a, 2013b] and Skewed Compressed Cache (SCC)
[Sardashti et al. 2014]-with a more practical and
simpler design. YACC's cache layout is similar to
conventional caches, with a largely unmodified tag
array and unmodified data array. Compared to DCC and
SCC, YACC requires neither the significant extra
metadata (i.e., back pointers) needed by DCC to track
blocks nor the complexity and overhead of skewed
associativity (i.e., indexing ways differently) needed
by SCC. An additional advantage over previous work is
that YACC enables modern replacement mechanisms, such
as RRIP. For our benchmark set, compared to a
conventional uncompressed 8MB LLC, YACC improves
performance by 8\% on average and up to 26\%, and
reduces total energy by 6\% on average and up to 20\%.
An 8MB YACC achieves approximately the same performance
and energy improvements as a 16MB conventional cache at
a much smaller silicon footprint, with only 1.6\%
greater area than an 8MB conventional cache. YACC
performs comparably to DCC and SCC but is much simpler
to implement.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cruz:2016:HAT,
author = "Eduardo H. M. Cruz and Matthias Diener and La{\'e}rcio
L. Pilla and Philippe O. A. Navaux",
title = "Hardware-Assisted Thread and Data Mapping in
Hierarchical Multicore Architectures",
journal = j-TACO,
volume = "13",
number = "3",
pages = "28:1--28:??",
month = sep,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2975587",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 17 16:20:58 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The performance and energy efficiency of modern
architectures depend on memory locality, which can be
improved by thread and data mappings considering the
memory access behavior of parallel applications. In
this article, we propose intense pages mapping, a
mechanism that analyzes the memory access behavior
using information about the time the entry of each page
resides in the translation lookaside buffer. It
provides accurate information with a very low overhead.
We present experimental results with simulation and
real machines, with average performance improvements of
13.7\% and energy savings of 4.4\%, which come from
reductions in cache misses and interconnection
traffic.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "28",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Adileh:2016:MHP,
author = "Almutaz Adileh and Stijn Eyerman and Aamer Jaleel and
Lieven Eeckhout",
title = "Maximizing Heterogeneous Processor Performance Under
Power Constraints",
journal = j-TACO,
volume = "13",
number = "3",
pages = "29:1--29:??",
month = sep,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2976739",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 17 16:20:58 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Heterogeneous processors (e.g., ARM's big.LITTLE)
improve performance in power-constrained environments
by executing applications on the `little' low-power
core and move them to the `big' high-performance core
when there is available power budget. The total time
spent on the big core depends on the rate at which the
application dissipates the available power budget. When
applications with different big-core power consumption
characteristics concurrently execute on a heterogeneous
processor, it is best to give a larger share of the
power budget to applications that can run longer on the
big core, and a smaller share to applications that run
for a very short duration on the big core. This article
investigates mechanisms to manage the available power
budget on power-constrained heterogeneous processors.
We show that existing proposals that schedule
applications onto a big core based on various
performance metrics are not high performing, as these
strategies do not optimize over an entire power period
and are unaware of the applications' power/performance
characteristics. We use linear programming to design
the DPDP power management technique, which guarantees
optimal performance on heterogeneous processors. We
mathematically derive a metric (Delta Performance by
Delta Power) that takes into account the
power/performance characteristics of each running
application and allows our power-management technique
to decide how best to distribute the available power
budget among the co-running applications at minimal
overhead. Our evaluations with a 4-core heterogeneous
processor consisting of big.LITTLE pairs show that DPDP
improves performance by 16\% on average and up to 40\%
compared to a strategy that globally and greedily
optimizes the power budget. We also show that DPDP
outperforms existing heterogeneous scheduling policies
that use performance metrics to decide how best to
schedule applications on the big core.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "29",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wibowo:2016:ACL,
author = "Bagus Wibowo and Abhinav Agrawal and Thomas Stanton
and James Tuck",
title = "An Accurate Cross-Layer Approach for Online
Architectural Vulnerability Estimation",
journal = j-TACO,
volume = "13",
number = "3",
pages = "30:1--30:??",
month = sep,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2975588",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 17 16:20:58 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Processor soft-error rates are projected to increase
as feature sizes scale down, necessitating the adoption
of reliability-enhancing techniques, but power and
performance overhead remain a concern of such
techniques. Dynamic cross-layer techniques are a
promising way to improve the cost-effectiveness of
resilient systems. As a foundation for making such a
system, we propose a cross-layer approach for
estimating the architectural vulnerability of a
processor core online that works by combining
information from software, compiler, and
microarchitectural layers at runtime. The hardware
layer combines the metadata from software and compiler
layers with microarchitectural measurements to estimate
architectural vulnerability online. We describe our
design and evaluate it in detail on a set of SPEC CPU
2006 applications. We find that our online AVF estimate
is highly accurate with respect to a postmortem AVF
analysis, with only 0.46\% average absolute error.
Also, our design incurs negligible performance impact
for SPEC2006 applications and about 1.2\% for a Monte
Carlo application, requires approximately 1.4\% area
overhead, and costs about 3.3\% more power on average.
We compare our technique against two prior online AVF
estimation techniques, one using a linear regression to
estimate AVF and another based on PVF-HVF; our
evaluation finds that our approach, on average, is more
accurate. Our case study of a Monte Carlo simulation
shows that our AVF estimate can adapt to the inherent
resiliency of the algorithm. Finally, we demonstrate
the effectiveness of our approach using a dynamic
protection scheme that limits vulnerability to soft
errors while reducing the energy consumption by an
average of 4.8\%, and with a target normalized SER of
10\%, compared to enabling a simple parity+ECC
protection at all times.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "30",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Acacio:2016:LDR,
author = "Manuel Acacio",
title = "List of Distinguished Reviewers {ACM TACO 2014}",
journal = j-TACO,
volume = "13",
number = "3",
pages = "31:1--31:??",
month = sep,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2989990",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 17 16:20:58 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "31",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Vora:2016:SAE,
author = "Keval Vora and Rajiv Gupta and Guoqing Xu",
title = "Synergistic Analysis of Evolving Graphs",
journal = j-TACO,
volume = "13",
number = "4",
pages = "32:1--32:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2992784",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Evolving graph processing involves repeating analyses,
which are often iterative, over multiple snapshots of
the graph corresponding to different points in time.
Since the snapshots of an evolving graph share a great
number of vertices and edges, traditional approaches
that process these snapshots one at a time without
exploiting this overlap contain much wasted effort on
both data loading and computation, making them
extremely inefficient. In this article, we identify
major sources of inefficiencies and present two
optimization techniques to address them. First, we
propose a technique for amortizing the fetch cost by
merging fetching of values for different snapshots of
the same vertex. Second, we propose a technique for
amortizing the processing cost by feeding values
computed by earlier snapshots into later snapshots. We
have implemented these optimizations in two distributed
graph processing systems, namely, GraphLab and ASPIRE.
Our experiments with multiple real evolving graphs and
algorithms show that, on average fetch amortization
speeds up execution of GraphLab and ASPIRE by 5.2$
\times $ and 4.1$ \times $ , respectively. Amortizing
the processing cost yields additional average speedups
of 2$ \times $ and 7.9$ \times $, respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "32",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2016:CPS,
author = "Yunquan Zhang and Shigang Li and Shengen Yan and
Huiyang Zhou",
title = "A Cross-Platform {SpMV} Framework on Many-Core
Architectures",
journal = j-TACO,
volume = "13",
number = "4",
pages = "33:1--33:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2994148",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Sparse Matrix-Vector multiplication (SpMV) is a key
operation in engineering and scientific computing.
Although the previous work has shown impressive
progress in optimizing SpMV on many-core architectures,
load imbalance and high memory bandwidth remain the
critical performance bottlenecks. We present our novel
solutions to these problems, for both GPUs and Intel
MIC many-core architectures. First, we devise a new
SpMV format, called Blocked Compressed Common
Coordinate (BCCOO). BCCOO extends the blocked Common
Coordinate (COO) by using bit flags to store the row
indices to alleviate the bandwidth problem. We further
improve this format by partitioning the matrix into
vertical slices for better data locality. Then, to
address the load imbalance problem, we propose a highly
efficient matrix-based segmented sum/scan algorithm for
SpMV, which eliminates global synchronization. At last,
we introduce an autotuning framework to choose
optimization parameters. Experimental results show that
our proposed framework has a significant advantage over
the existing SpMV libraries. In single precision, our
proposed scheme outperforms clSpMV COCKTAIL format by
255\% on average on AMD FirePro W8000, and outperforms
CUSPARSE V7.0 by 73.7\% on average and outperforms CSR5
by 53.6\% on average on GeForce Titan X; in double
precision, our proposed scheme outperforms CUSPARSE
V7.0 by 34.0\% on average and outperforms CSR5 by
16.2\% on average on Tesla K20, and has equivalent
performance compared with CSR5 on Intel MIC.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "33",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ahn:2016:AEE,
author = "Junwhan Ahn and Sungjoo Yoo and Kiyoung Choi",
title = "{AIM}: Energy-Efficient Aggregation Inside the Memory
Hierarchy",
journal = j-TACO,
volume = "13",
number = "4",
pages = "34:1--34:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2994149",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this article, we propose Aggregation-in-Memory
(AIM), a new processing-in-memory system designed for
energy efficiency and near-term adoption. In order to
efficiently perform aggregation, we implement simple
aggregation operations in main memory and develop a
locality-adaptive host architecture for in-memory
aggregation, called cache-conscious aggregation.
Through this, AIM executes aggregation at the most
energy-efficient location among all levels of the
memory hierarchy. Moreover, AIM minimally changes
existing sequential programming models and provides
fully automated compiler toolchain, thereby allowing
unmodified legacy software to use AIM. Evaluations show
that AIM greatly improves the energy efficiency of main
memory and the system performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "34",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ziabari:2016:UHB,
author = "Amir Kavyan Ziabari and Yifan Sun and Yenai Ma and
Dana Schaa and Jos{\'e} L. Abell{\'a}n and Rafael Ubal
and John Kim and Ajay Joshi and David Kaeli",
title = "{UMH}: a Hardware-Based Unified Memory Hierarchy for
Systems with Multiple Discrete {GPUs}",
journal = j-TACO,
volume = "13",
number = "4",
pages = "35:1--35:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2996190",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this article, we describe how to ease memory
management between a Central Processing Unit (CPU) and
one or multiple discrete Graphic Processing Units
(GPUs) by architecting a novel hardware-based Unified
Memory Hierarchy (UMH). Adopting UMH, a GPU accesses
the CPU memory only if it does not find its required
data in the directories associated with its
high-bandwidth memory, or the NMOESI coherency protocol
limits the access to that data. Using UMH with NMOESI
improves performance of a CPU-multiGPU system by at
least 1.92 $ \times $ in comparison to alternative
software-based approaches. It also allows the CPU to
access GPUs modified data by at least 13 $ \times $
faster.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "35",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Spink:2016:HAC,
author = "Tom Spink and Harry Wagstaff and Bj{\"o}rn Franke",
title = "Hardware-Accelerated Cross-Architecture Full-System
Virtualization",
journal = j-TACO,
volume = "13",
number = "4",
pages = "36:1--36:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2996798",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "Hardware virtualization solutions provide users with
benefits ranging from application isolation through
server consolidation to improved disaster recovery and
faster server provisioning. While hardware assistance
for virtualization is supported by all major processor
architectures, including Intel, ARM, PowerPC, and MIPS,
these extensions are targeted at virtualization of the
same architecture, for example, an x86 guest on an x86
host system. Existing techniques for cross-architecture
virtualization, for example, an ARM guest on an x86
host, still incur a substantial overhead for CPU,
memory, and I/O virtualization due to the necessity for
software emulation of these mismatched system
components. In this article, we present a new
hardware-accelerated hypervisor called C aptive,
employing a range of novel techniques that exploit
existing hardware virtualization extensions for
improving the performance of full-system cross-platform
virtualization. We illustrate how (1) guest memory
management unit (MMU) events and operations can be
mapped onto host memory virtualization extensions,
eliminating the need for costly software MMU emulation,
(2) a block-based dynamic binary translation engine
inside the virtual machine can improve CPU
virtualization performance, (3) memory-mapped guest I/O
can be efficiently translated to fast I/O specific
calls to emulated devices, and (4) the cost for
asynchronous guest interrupts can be reduced. For an
ARM-based Linux guest system running on an x86 host
with Intel VT support, we demonstrate application
performance levels, based on SPEC CPU2006 benchmarks,
of up to 5.88$ \times $ over state-of-the-art Qemu and
2.5$ \times $ on average, achieving a guest dynamic
instruction throughput of up to 1280 MIPS (million
instructions per second) and 915.52 MIPS, on average.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "36",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Shi:2016:LLA,
author = "Qingchuan Shi and George Kurian and Farrukh Hijaz and
Srinivas Devadas and Omer Khan",
title = "{LDAC}: Locality-Aware Data Access Control for
Large-Scale Multicore Cache Hierarchies",
journal = j-TACO,
volume = "13",
number = "4",
pages = "37:1--37:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2983632",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The trend of increasing the number of cores to achieve
higher performance has challenged efficient management
of on-chip data. Moreover, many emerging applications
process massive amounts of data with varying degrees of
locality. Therefore, exploiting locality to improve
on-chip traffic and resource utilization is of
fundamental importance. Conventional multicore cache
management schemes either manage the private cache (L1)
or the Last-Level Cache (LLC), while ignoring the
other. We propose a holistic locality-aware cache
hierarchy management protocol for large-scale
multicores. The proposed scheme improves on-chip data
access latency and energy consumption by intelligently
bypassing cache line replication in the L1 caches,
and/or intelligently replicating cache lines in the
LLC. The approach relies on low overhead yet highly
accurate in-hardware runtime classification of data
locality at both L1 cache and the LLC. The decision to
bypass L1 and/or replicate in LLC is then based on the
measured reuse at the fine granularity of cache lines.
The locality tracking mechanism is decoupled from the
sharer tracking structures that cause scalability
concerns in traditional cache coherence protocols.
Moreover, the complexity of the protocol is low since
no additional coherence states are created. However,
the proposed classifier incurs a 5.6 KB per-core
storage overhead. On a set of parallel benchmarks, the
locality-aware protocol reduces average energy
consumption by 26\% and completion time by 16\%, when
compared to the state-of-the-art Reactive-NUCA
multicore cache management scheme.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "37",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fernandes:2016:EHO,
author = "Fernando Fernandes and Lucas Weigel and Claudio Jung
and Philippe Navaux and Luigi Carro and Paolo Rech",
title = "Evaluation of Histogram of Oriented Gradients Soft
Errors Criticality for Automotive Applications",
journal = j-TACO,
volume = "13",
number = "4",
pages = "38:1--38:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2998573",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Pedestrian detection reliability is a key problem for
autonomous or aided driving, and methods that use
Histogram of Oriented Gradients (HOG) are very popular.
Embedded Graphics Processing Units (GPUs) are exploited
to run HOG in a very efficient manner. Unfortunately,
GPUs architecture has been shown to be particularly
vulnerable to radiation-induced failures. This article
presents an experimental evaluation and analytical
study of HOG reliability. We aim at quantifying and
qualifying the radiation-induced errors on pedestrian
detection applications executed in embedded GPUs. We
analyze experimental results obtained executing HOG on
embedded GPUs from two different vendors, exposed for
about 100 hours to a controlled neutron beam at Los
Alamos National Laboratory. We consider the number and
position of detected objects as well as precision and
recall to discriminate critical erroneous computations.
The reported analysis shows that, while being
intrinsically resilient (65\% to 85\% of output errors
only slightly impact detection), HOG experienced some
particularly critical errors that could result in
undetected pedestrians or unnecessary vehicle stops.
Additionally, we perform a fault-injection campaign to
identify HOG critical procedures. We observe that
Resize and Normalize are the most sensitive and
critical phases, as about 20\% of injections generate
an output error that significantly impacts HOG
detection. With our insights, we are able to find those
limited portions of HOG that, if hardened, are more
likely to increase reliability without introducing
unnecessary overhead.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "38",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Dublish:2016:CCG,
author = "Saumay Dublish and Vijay Nagarajan and Nigel Topham",
title = "Cooperative Caching for {GPUs}",
journal = j-TACO,
volume = "13",
number = "4",
pages = "39:1--39:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3001589",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The rise of general-purpose computing on GPUs has
influenced architectural innovation on them. The
introduction of an on-chip cache hierarchy is one such
innovation. High L1 miss rates on GPUs, however,
indicate inefficient cache usage due to myriad factors,
such as cache thrashing and extensive multithreading.
Such high L1 miss rates in turn place high demands on
the shared L2 bandwidth. Extensive congestion in the L2
access path therefore results in high memory access
latencies. In memory-intensive applications, these
latencies get exposed due to a lack of active compute
threads to mask such high latencies. In this article,
we aim to reduce the pressure on the shared L2
bandwidth, thereby reducing the memory access latencies
that lie in the critical path. We identify significant
replication of data among private L1 caches, presenting
an opportunity to reuse data among L1s. We further show
how this reuse can be exploited via an L1 Cooperative
Caching Network (CCN), thereby reducing the bandwidth
demand on L2. In the proposed architecture, we connect
the L1 caches with a lightweight ring network to
facilitate intercore communication of shared data. We
show that this technique reduces traffic to the L2
cache by an average of 29\%, freeing up the bandwidth
for other accesses. We also show that the CCN reduces
the average memory latency by 24\%, thereby reducing
core stall cycles by 26\% on average. This translates
into an overall performance improvement of 14.7\% on
average (and up to 49\%) for applications that exhibit
reuse across L1 caches. In doing so, the CCN incurs a
nominal area and energy overhead of 1.3\% and 2.5\%,
respectively. Notably, the performance improvement with
our proposed CCN compares favorably to the performance
improvement achieved by simply doubling the number of
L2 banks by up to 34\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tampouratzis:2016:AIH,
author = "Nikolaos Tampouratzis and Pavlos M. Mattheakis and
Ioannis Papaefstathiou",
title = "Accelerating Intercommunication in Highly Parallel
Systems",
journal = j-TACO,
volume = "13",
number = "4",
pages = "40:1--40:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3005717",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Every HPC system consists of numerous processing nodes
interconnect using a number of different inter-process
communication protocols such as Messaging Passing
Interface (MPI) and Global Arrays (GA). Traditionally,
research has focused on optimizing these protocols and
identifying the most suitable ones for each system
and/or application. Recently, there has been a proposal
to unify the primitive operations of the different
inter-processor communication protocols through the
Portals library. Portals offer a set of low-level
communication routines which can be composed in order
to implement the functionality of different
intercommunication protocols. However, Portals
modularity comes at a performance cost, since it adds
one more layer in the actual protocol implementation.
This work aims at closing the performance gap between a
generic and reusable intercommunication layer, such as
Portals, and the several monolithic and highly
optimized intercommunication protocols. This is
achieved through the development of a novel hardware
offload engine efficiently implementing the basic
Portals' modules. Our innovative system is up to two2
orders of magnitude faster than the conventional
software implementation of Portals' while the speedup
achieved over the conventional monolithic software
implementations of MPI and GAs is more than an order of
magnitude. The power consumption of our hardware system
is less than 1/100th of what a low-power CPU consumes
when executing the Portal's software while its silicon
cost is less than 1/10th of that of a very simple RISC
CPU. Moreover, our design process is also innovative
since we have first modeled the hardware within an
untimed virtual prototype which allowed for rapid
design space exploration; then we applied a novel
methodology to transform the untimed description into
an efficient timed hardware description, which was then
transformed into a hardware netlist through a
High-Level Synthesis (HLS) tool.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "40",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Park:2016:CJP,
author = "Hyukwoo Park and Myungsu Cha and Soo-Mook Moon",
title = "Concurrent {JavaScript} Parsing for Faster Loading of
{Web} Apps",
journal = j-TACO,
volume = "13",
number = "4",
pages = "41:1--41:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3004281",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "JavaScript is a dynamic language mainly used as a
client-side web script. Nowadays, web is evolving into
an application platform with its web apps, and
JavaScript increasingly undertakes complex computations
and interactive user interfaces, requiring a
high-performance JavaScript engine. There have been
many optimizations for efficient JavaScript engines,
but one component that has not been optimized much is
JavaScript parsing. A JavaScript function needs to be
parsed before being executed, and the parsing overhead
takes a substantial portion of JavaScript execution
time for web apps, especially during app loading. This
article proposes concurrent parsing of JavaScript,
which performs the parsing of JavaScript functions in
advance on different threads, while the main thread is
executing the parsed JavaScript functions. This can
hide the parsing overhead from the main execution
thread, reducing the JavaScript execution time, thus
reducing the overall app loading time. More
specifically, we separated JavaScript parsing and made
it run on different threads without violating the
execution semantics of JavaScript. We also designed an
efficient multi-threaded parsing architecture, which
reduces the synchronization overhead and schedules the
parsing requests appropriately. Finally, we explored
two methods of choosing the target functions for
concurrent parsing: one based on profiled information
and the other based on speculative heuristics. We
performed experiments on the WebKit browser with the
JSC engine for real web apps. The result shows that the
proposed concurrent parsing can improve the JavaScript
performance during app loading by as much as 64\% and
by 39.7\% on average. This improves the whole app
loading performance tangibly, by as much as 32.7\% and
by 18.2\%, on average.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "41",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xiong:2016:MAS,
author = "Dongliang Xiong and Kai Huang and Xiaowen Jiang and
Xiaolang Yan",
title = "Memory Access Scheduling Based on Dynamic Multilevel
Priority in Shared {DRAM} Systems",
journal = j-TACO,
volume = "13",
number = "4",
pages = "42:1--42:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3007647",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Interapplication interference at shared main memory
severely degrades performance and increasing DRAM
frequency calls for simple memory schedulers. Previous
memory schedulers employ a per-application ranking
scheme for high system performance or a per-group
ranking scheme for low hardware cost, but few provide a
balance. We propose DMPS, a memory scheduler based on
dynamic multilevel priority. First, DMPS uses ``memory
occupancy'' to measure interference quantitatively.
Second, DMPS groups applications, favors
latency-sensitive groups, and dynamically prioritizes
applications by employing a per-level ranking scheme.
The simulation results show that DMPS has 7.2\% better
system performance and 22\% better fairness over FRFCFS
at low hardware complexity and cost.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "42",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{DeSensi:2016:RAP,
author = "Daniele {De Sensi} and Massimo Torquati and Marco
Danelutto",
title = "A Reconfiguration Algorithm for Power-Aware Parallel
Applications",
journal = j-TACO,
volume = "13",
number = "4",
pages = "43:1--43:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3004054",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In current computing systems, many applications
require guarantees on their maximum power consumption
to not exceed the available power budget. On the other
hand, for some applications, it could be possible to
decrease their performance, yet maintain an acceptable
level, in order to reduce their power consumption. To
provide such guarantees, a possible solution consists
in changing the number of cores assigned to the
application, their clock frequency, and the placement
of application threads over the cores. However, power
consumption and performance have different trends
depending on the application considered and on its
input. Finding a configuration of resources satisfying
user requirements is, in the general case, a
challenging task. In this article, we propose Nornir,
an algorithm to automatically derive, without relying
on historical data about previous executions,
performance and power consumption models of an
application in different configurations. By using these
models, we are able to select a close-to-optimal
configuration for the given user requirement, either
performance or power consumption. The configuration of
the application will be changed on-the-fly throughout
the execution to adapt to workload fluctuations,
external interferences, and/or application's phase
changes. We validate the algorithm by simulating it
over the applications of the Parsec benchmark suit.
Then, we implement our algorithm and we analyse its
accuracy and overhead over some of these applications
on a real execution environment. Eventually, we compare
the quality of our proposal with that of the optimal
algorithm and of some state-of-the-art solutions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "43",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jantz:2016:IIP,
author = "Michael R. Jantz and Forrest J. Robinson and Prasad A.
Kulkarni",
title = "Impact of Intrinsic Profiling Limitations on
Effectiveness of Adaptive Optimizations",
journal = j-TACO,
volume = "13",
number = "4",
pages = "44:1--44:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3008661",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Many performance optimizations rely on or are enhanced
by runtime profile information. However, both offline
and online profiling techniques suffer from intrinsic
and practical limitations that affect the quality of
delivered profile data. The quality of profile data is
its ability to accurately predict (relevant aspects of)
future program behavior. While these limitations are
known, their impact on the effectiveness of
profile-guided optimizations, compared to the ideal
performance, is not as well understood. We define ideal
performance for adaptive optimizations as that achieved
with a precise profile of future program behavior. In
this work, we study and quantify the performance impact
of fundamental profiling limitations by comparing the
effectiveness of typical adaptive optimizations when
using the best profiles generated by offline and online
schemes against a baseline where the adaptive
optimization is given access to profile information
about the future execution of the program. We model and
compare the behavior of three adaptive JVM
optimizations-heap memory management using object usage
profiles, code cache management using method usage
profiles, and selective just-in-time compilation using
method hotness profiles-for the Java DaCapo benchmarks.
Our results provide insight into the advantages and
drawbacks of current profiling strategies and shed
light on directions for future profiling research.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "44",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Damschen:2016:EWP,
author = "Marvin Damschen and Lars Bauer and J{\"o}rg Henkel",
title = "Extending the {WCET} Problem to Optimize for
Runtime-Reconfigurable Processors",
journal = j-TACO,
volume = "13",
number = "4",
pages = "45:1--45:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3014059",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The correctness of a real-time system does not depend
on the correctness of its calculations alone but also
on the non-functional requirement of adhering to
deadlines. Guaranteeing these deadlines by static
timing analysis, however, is practically infeasible for
current microarchitectures with out-of-order scheduling
pipelines, several hardware threads, and multiple
(shared) cache layers. Novel timing-analyzable features
are required to sustain the strongly increasing demand
for processing power in real-time systems. Recent
advances in timing analysis have shown that
runtime-reconfigurable instruction set processors are
one way to escape the scarcity of analyzable processing
power while preserving the flexibility of the system.
When moving calculations from software to hardware by
means of reconfigurable custom instructions
(CIs)-additional to a considerable speedup-the
overestimation of a task's worst-case execution time
(WCET) can be reduced. CIs typically implement
functionality that corresponds to several hundred
instructions on the central processing unit (CPU)
pipeline. While analyzing instructions for worst-case
latency may introduce pessimism, the latency of
CIs-executed on the reconfigurable fabric-is precisely
known. In this work, we introduce the problem of
selecting reconfigurable CIs to optimize the WCET of an
application. We model this problem as an extension to
state-of-the-art integer linear programming (ILP)-based
program path analysis. This way, we enable optimization
based on accurate WCET estimates with integration of
information about global program flow, for example,
infeasible paths. We present an optimal solution with
effective techniques to prune the search space and a
greedy heuristic that performs a maximum number of
steps linear in the number of partitions of
reconfigurable area available. Finally, we show the
effectiveness of optimizing the WCET on a
reconfigurable processor by evaluating a complex
multimedia application with multiple reconfigurable CIs
for several hardware parameters.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "45",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2016:MAP,
author = "Zheng Li and Fang Wang and Dan Feng and Yu Hua and
Jingning Liu and Wei Tong",
title = "{MaxPB}: Accelerating {PCM} Write by Maximizing the
Power Budget Utilization",
journal = j-TACO,
volume = "13",
number = "4",
pages = "46:1--46:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3012007",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Phase Change Memory (PCM) is one of the promising
memory technologies but suffers from some critical
problems such as poor write performance and high write
energy consumption. Due to the high write energy
consumption and limited power supply, the size of
concurrent bit-write is restricted inside one PCM chip.
Typically, the size of concurrent bit-write is much
less than the cache line size and it is normal that
many serially executed write units are consumed to
write down the data block to PCM when using it as the
main memory. Existing state-of-the-art PCM write
schemes, such as FNW (Flip-N-Write) and
two-stage-write, address the problem of poor
performance by improving the write parallelism under
the power constraints. The parallelism is obtained via
reducing the data amount and leveraging power as well
as time asymmetries, respectively. However, due to the
extremely pessimistic assumptions of current
utilization (FNW) and optimistic assumptions of
asymmetries (two-stage-write), these schemes fail to
maximize the power supply utilization and hence improve
the write parallelism. In this article, we propose a
novel PCM write scheme, called MaxPB (Maximize the
Power Budget utilization) to maximize the power budget
utilization with minimum changes about the circuits
design. MaxPB is a ``think before acting'' method. The
main idea of MaxPB is to monitor the actual power needs
of all data units first and then effectively package
them into the least number of write units under the
power constraints. Experimental results show the
efficiency and performance improvements on MaxPB. For
example, four-core PARSEC and SPEC experimental results
show that MaxPB gets 32.0\% and 20.3\% more read
latency reduction, 26.5\% and 16.1\% more write latency
reduction, 24.3\% and 15.6\% more running time
decrease, 1.32$ \times $ and 0.92$ \times $ more
speedup, as well as 30.6\% and 18.4\% more energy
consumption reduction on average compared with the
state-of-the-art FNW and two-stage-write write schemes,
respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "46",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Muralidharan:2016:DTN,
author = "Saurav Muralidharan and Michael Garland and Albert
Sidelnik and Mary Hall",
title = "Designing a Tunable Nested Data-Parallel Programming
System",
journal = j-TACO,
volume = "13",
number = "4",
pages = "47:1--47:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3012011",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article describes Surge, a nested data-parallel
programming system designed to simplify the porting and
tuning of parallel applications to multiple target
architectures. Surge decouples high-level specification
of computations, expressed using a C++ programming
interface, from low-level implementation details using
two first-class constructs: schedules and policies.
Schedules describe the valid ways in which
data-parallel operators may be implemented, while
policies encapsulate a set of parameters that govern
platform-specific code generation. These two mechanisms
are used to implement a code generation system that
analyzes computations and automatically generates a
search space of valid platform-specific
implementations. An input and architecture-adaptive
autotuning system then explores this search space to
find optimized implementations. We express in Surge
five real-world benchmarks from domains such as machine
learning and sparse linear algebra and from the
high-level specifications, Surge automatically
generates CPU and GPU implementations that perform on
par with or better than manually optimized versions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "47",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Akturk:2016:ABN,
author = "Ismail Akturk and Riad Akram and Mohammad Majharul
Islam and Abdullah Muzahid and Ulya R. Karpuzcu",
title = "Accuracy Bugs: a New Class of Concurrency Bugs to
Exploit Algorithmic Noise Tolerance",
journal = j-TACO,
volume = "13",
number = "4",
pages = "48:1--48:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3017991",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Parallel programming introduces notoriously difficult
bugs, usually referred to as concurrency bugs. This
article investigates the potential for deviating from
the conventional wisdom of writing concurrency
bug-free, parallel programs. It explores the benefit of
accepting buggy but approximately correct parallel
programs by leveraging the inherent tolerance of
emerging parallel applications to inaccuracy in
computations. Under algorithmic noise tolerance, a new
class of concurrency bugs, accuracy bugs, degrade the
accuracy of computation (often at acceptable levels)
rather than causing catastrophic termination. This
study demonstrates how embracing accuracy bugs affects
the application output quality and performance and
analyzes the impact on execution semantics.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "48",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tomusk:2016:SHC,
author = "Erik Tomusk and Christophe Dubach and Michael
O'Boyle",
title = "Selecting Heterogeneous Cores for Diversity",
journal = j-TACO,
volume = "13",
number = "4",
pages = "49:1--49:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3014165",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Mobile devices with heterogeneous processors are
becoming mainstream. With a heterogeneous processor,
the runtime scheduler can pick the best CPU core for a
given task based on program characteristics,
performance requirements, and power limitations. For a
heterogeneous processor to be effective, it must
contain a diverse set of cores to match a range of
runtime requirements and program behaviors. Selecting a
diverse set of cores is, however, a non-trivial
problem. Power and performance are dependent on both
program features and the microarchitectural features of
cores, and a selection of cores must satisfy the
competing demands of different types of programs. We
present a method of core selection that chooses cores
at a range of power-performance points. Our algorithm
is based on the observation that it is not necessary
for a core to consistently have high performance or low
power; one type of core can fulfill different roles for
different types of programs. Given a power budget,
cores selected with our method provide an average
speedup of 6\% on EEMBC mobile benchmarks and a 24\%
speedup on SPEC 2006 integer benchmarks over the
state-of-the-art core selection method.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "49",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Michaud:2016:SMF,
author = "Pierre Michaud",
title = "Some Mathematical Facts About Optimal Cache
Replacement",
journal = j-TACO,
volume = "13",
number = "4",
pages = "50:1--50:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3017992",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article exposes and proves some mathematical
facts about optimal cache replacement that were
previously unknown or not proved rigorously. An
explicit formula is obtained, giving OPT hits and
misses as a function of past references. Several
mathematical facts are derived from this formula,
including a proof that OPT miss curves are always
convex, and a new algorithm called OPT tokens, for
reasoning about optimal replacement.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "50",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bao:2016:SDF,
author = "Wenlei Bao and Changwan Hong and Sudheer Chunduri and
Sriram Krishnamoorthy and Louis-No{\"e}l Pouchet and
Fabrice Rastello and P. Sadayappan",
title = "Static and Dynamic Frequency Scaling on Multicore
{CPUs}",
journal = j-TACO,
volume = "13",
number = "4",
pages = "51:1--51:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3011017",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Dynamic Voltage and Frequency Scaling (DVFS) typically
adapts CPU power consumption by modifying a processor's
operating frequency (and the associated voltage).
Typical DVFS approaches include using default
strategies such as running at the lowest or the highest
frequency or reacting to the CPU's runtime load to
reduce or increase frequency based on the CPU usage. In
this article, we argue that a compile-time approach to
CPU frequency selection is achievable for affine
program regions and can significantly outperform
runtime-based approaches. We first propose a
lightweight runtime approach that can exploit the
properties of the power profile specific to a
processor, outperforming classical Linux governors such
as powersave or on-demand for computational kernels. We
then demonstrate that, for affine kernels in the
application, a purely compile-time approach to CPU
frequency and core count selection is achievable,
providing significant additional benefits over the
runtime approach. Our framework relies on a one-time
profiling of the target CPU, along with a compile-time
categorization of loop-based code segments in the
application. These are combined to determine at
compile-time the frequency and the number of cores to
use to execute each affine region to optimize energy or
energy-delay product. Extensive evaluation on 60
benchmarks and 5 multi-core CPUs show that our approach
systematically outperforms the powersave Linux governor
while also improving overall performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "51",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Vale:2016:PDT,
author = "Tiago M. Vale and Jo{\~a}o A. Silva and Ricardo J.
Dias and Jo{\~a}o M. Louren{\c{c}}o",
title = "{Pot}: Deterministic Transactional Execution",
journal = j-TACO,
volume = "13",
number = "4",
pages = "52:1--52:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3017993",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article presents Pot, a system that leverages the
concept of preordered transactions to achieve
deterministic multithreaded execution of programs that
use Transactional Memory. Preordered transactions
eliminate the root cause of nondeterminism in
transactional execution: they provide the illusion of
executing in a deterministic serial order, unlike
traditional transactions that appear to execute in a
nondeterministic order that can change from execution
to execution. Pot uses a new concurrency control
protocol that exploits the serialization order to
distinguish between fast and speculative transaction
execution modes in order to mitigate the overhead of
imposing a deterministic order. We build two Pot
prototypes: one using STM and another using
off-the-shelf HTM. To the best of our knowledge, Pot
enables deterministic execution of programs using
off-the-shelf HTM for the first time. An experimental
evaluation shows that Pot achieves deterministic
execution of TM programs with low overhead, sometimes
even outperforming nondeterministic executions, and
clearly outperforming the state of the art.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lu:2016:AFB,
author = "Zhonghai Lu and Yuan Yao",
title = "Aggregate Flow-Based Performance Fairness in {CMPs}",
journal = j-TACO,
volume = "13",
number = "4",
pages = "53:1--53:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3014429",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In CMPs, multiple co-executing applications create
mutual interference when sharing the underlying
network-on-chip architecture. Such interference causes
different performance slowdowns to different
applications. To mitigate the unfairness problem, we
treat traffic initiated from the same thread as an
aggregate flow such that causal request/reply packet
sequences can be allocated to resources consistently
and fairly according to online profiled traffic
injection rates. Our solution comprises three coherent
mechanisms from rate profiling, rate inheritance, and
rate-proportional channel scheduling to facilitate and
realize unbiased workload-adaptive resource allocation.
Full-system evaluations in GEM5 demonstrate that,
compared to classic packet-centric and latest
application-prioritization approaches, our approach
significantly improves weighted speed-up for all
multi-application mixtures and achieves nearly ideal
performance fairness.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "53",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Demir:2016:EPP,
author = "Yigit Demir and Nikos Hardavellas",
title = "Energy-Proportional Photonic Interconnects",
journal = j-TACO,
volume = "13",
number = "4",
pages = "54:1--54:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3018110",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Photonic interconnects have emerged as the prime
candidate technology for efficient networks on chip at
future process nodes. However, the high optical loss of
many nanophotonic components coupled with the low
efficiency of current laser sources results in
exceedingly high total power requirements for the
laser. As optical interconnects stay on even during
periods of system inactivity, most of this power is
wasted, which has prompted research on laser gating.
Unfortunately, prior work has been complicated by the
long laser turn-on delays and has failed to deliver the
full savings. In this article, we propose ProLaser, a
laser control mechanism that monitors the requests sent
on the interconnect, the cache, and the coherence
directory to detect highly correlated events and turn
on proactively the lasers of a photonic interconnect.
While ProLaser requires fast lasers with a turn-on
delay of a few nanoseconds, a technology that is still
experimental, several types of such lasers that are
suitable for power gating have already been
manufactured over the last decade. Overall, ProLaser
saves 42\% to 85\% of the laser power, outperforms the
current state of the art by 2$ \times $ on average, and
closely tracks (within 2\%--6\%) a perfect prediction
scheme with full knowledge of future interconnect
requests. Moreover, the power savings of ProLaser allow
the cores to exploit a higher-power budget and run
faster, achieving speedups of 1.5 to 1.7$ \times $
(1.6$ \times $ on average).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "54",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kurt:2016:UAS,
author = "Mehmet Can Kurt and Sriram Krishnamoorthy and Gagan
Agrawal and Bin Ren",
title = "User-Assisted Store Recycling for Dynamic Task Graph
Schedulers",
journal = j-TACO,
volume = "13",
number = "4",
pages = "55:1--55:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3018111",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The emergence of the multi-core era has led to
increased interest in designing effective yet practical
parallel programming models. Models based on task
graphs that operate on single-assignment data are
attractive in several ways. Notably, they can support
dynamic applications and precisely represent the
available concurrency. However, for efficient
execution, they also require nuanced algorithms for
scheduling and memory management. In this article, we
consider memory-efficient dynamic scheduling of task
graphs. Specifically, we present a novel approach for
dynamically recycling the memory locations assigned to
data items as they are produced by tasks. We develop
algorithms to identify memory-efficient store recycling
functions by systematically evaluating the validity of
a set of user-provided or automatically generated
alternatives. Because recycling functions can be input
data-dependent, we have also developed support for
continued correct execution of a task graph in the
presence of a potentially incorrect store recycling
function. Experimental evaluation demonstrates that
this approach to automatic store recycling incurs
little to no overheads, achieves memory usage
comparable to the best manually derived solutions,
often produces recycling functions valid across problem
sizes and input parameters, and efficiently recovers
from an incorrect choice of store recycling
functions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Haj-Yihia:2016:FGP,
author = "Jawad Haj-Yihia and Ahmad Yasin and Yosi {Ben Asher}
and Avi Mendelson",
title = "Fine-Grain Power Breakdown of Modern Out-of-Order
Cores and Its Implications on {Skylake}-Based Systems",
journal = j-TACO,
volume = "13",
number = "4",
pages = "56:1--56:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3018112",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A detailed analysis of power consumption at low system
levels becomes important as a means for reducing the
overall power consumption of a system and its thermal
hot spots. This work presents a new power estimation
method that allows understanding the power breakdown of
an application when running on modern processor
architecture such as the newly released Intel Skylake
processor. This work also provides a detailed power and
performance characterization report for the SPEC
CPU2006 benchmarks, analysis of the data using
side-by-side power and performance breakdowns, as well
as few interesting case studies.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "56",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Scolari:2016:SCP,
author = "Alberto Scolari and Davide Basilio Bartolini and Marco
Domenico Santambrogio",
title = "A Software Cache Partitioning System for Hash-Based
Caches",
journal = j-TACO,
volume = "13",
number = "4",
pages = "57:1--57:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3018113",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Contention on the shared Last-Level Cache (LLC) can
have a fundamental negative impact on the performance
of applications executed on modern multicores. An
interesting software approach to address LLC contention
issues is based on page coloring, which is a software
technique that attempts to achieve performance
isolation by partitioning a shared cache through
careful memory management. The key assumption of
traditional page coloring is that the cache is
physically addressed. However, recent multicore
architectures (e.g., Intel Sandy Bridge and later)
switched from a physical addressing scheme to a more
complex scheme that involves a hash function.
Traditional page coloring is ineffective on these
recent architectures. In this article, we extend page
coloring to work on these recent architectures by
proposing a mechanism able to handle their hash-based
LLC addressing scheme. Just as for traditional page
coloring, the goal of this new mechanism is to deliver
performance isolation by avoiding contention on the
LLC, thus enabling predictable performance. We
implement this mechanism in the Linux kernel, and
evaluate it using several benchmarks from the SPEC
CPU2006 and PARSEC 3.0 suites. Our results show that
our solution is able to deliver performance isolation
to concurrently running applications by enforcing
partitioning of a Sandy Bridge LLC, which traditional
page coloring techniques are not able to handle.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "57",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mukhanov:2017:AFG,
author = "Lev Mukhanov and Pavlos Petoumenos and Zheng Wang and
Nikos Parasyris and Dimitrios S. Nikolopoulos and
Bronis R. {De Supinski} and Hugh Leather",
title = "{ALEA}: a Fine-Grained Energy Profiling Tool",
journal = j-TACO,
volume = "14",
number = "1",
pages = "1:1--1:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3050436",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:58 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Energy efficiency is becoming increasingly important,
yet few developers understand how source code changes
affect the energy and power consumption of their
programs. To enable them to achieve energy savings, we
must associate energy consumption with software
structures, especially at the fine-grained level of
functions and loops. Most research in the field relies
on direct power/energy measurements taken from on-board
sensors or performance counters. However, this coarse
granularity does not directly provide the needed
fine-grained measurements. This article presents ALEA,
a novel fine-grained energy profiling tool based on
probabilistic analysis for fine-grained energy
accounting. ALEA overcomes the limitations of
coarse-grained power-sensing instruments to associate
energy information effectively with source code at a
fine-grained level. We demonstrate and validate that
ALEA can perform accurate energy profiling at various
granularity levels on two different architectures:
Intel Sandy Bridge and ARM big.LITTLE. ALEA achieves a
worst-case error of only 2\% for coarse-grained code
structures and 6\% for fine-grained ones, with less
than 1\% runtime overhead. Our use cases demonstrate
that ALEA supports energy optimizations, with energy
savings of up to 2.87 times for a latency-critical
option pricing workload under a given power budget.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pathania:2017:DTM,
author = "Anuj Pathania and Vanchinathan Venkataramani and
Muhammad Shafique and Tulika Mitra and J{\"o}rg
Henkel",
title = "Defragmentation of Tasks in Many-Core Architecture",
journal = j-TACO,
volume = "14",
number = "1",
pages = "2:1--2:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3050437",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:58 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Many-cores can execute multiple multithreaded tasks in
parallel. A task performs most efficiently when it is
executed over a spatially connected and compact subset
of cores so that performance loss due to communication
overhead imposed by the task's threads spread across
the allocated cores is minimal. Over a span of time,
unallocated cores can get scattered all over the
many-core, creating fragments in the task mapping.
These fragments can prevent efficient contiguous
mapping of incoming new tasks leading to loss of
performance. This problem can be alleviated by using a
task defragmenter, which consolidates smaller fragments
into larger fragments wherein the incoming tasks can be
efficiently executed. Optimal defragmentation of a
many-core is an NP-hard problem in the general case.
Therefore, we simplify the original problem to a
problem that can be solved optimally in polynomial
time. In this work, we introduce a concept of
exponentially separable mapping (ESM), which defines a
set of task mapping constraints on a many-core. We
prove that an ESM enforcing many-core can be
defragmented optimally in polynomial time.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zivanovic:2017:MMH,
author = "Darko Zivanovic and Milan Pavlovic and Milan Radulovic
and Hyunsung Shin and Jongpil Son and Sally A. Mckee
and Paul M. Carpenter and Petar Radojkovi{\'c} and
Eduard Ayguad{\'e}",
title = "Main Memory in {HPC}: Do We Need More or Could We Live
with Less?",
journal = j-TACO,
volume = "14",
number = "1",
pages = "3:1--3:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3023362",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:58 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "An important aspect of High-Performance Computing
(HPC) system design is the choice of main memory
capacity. This choice becomes increasingly important
now that 3D-stacked memories are entering the market.
Compared with conventional Dual In-line Memory Modules
(DIMMs), 3D memory chiplets provide better performance
and energy efficiency but lower memory capacities.
Therefore, the adoption of 3D-stacked memories in the
HPC domain depends on whether we can find use cases
that require much less memory than is available now.
This study analyzes the memory capacity requirements of
important HPC benchmarks and applications. We find that
the High-Performance Conjugate Gradients (HPCG)
benchmark could be an important success story for
3D-stacked memories in HPC, but High-Performance
Linpack (HPL) is likely to be constrained by 3D memory
capacity. The study also emphasizes that the analysis
of memory footprints of production HPC applications is
complex and that it requires an understanding of
application scalability and target category, i.e.,
whether the users target capability or capacity
computing. The results show that most of the HPC
applications under study have per-core memory
footprints in the range of hundreds of megabytes, but
we also detect applications and use cases that require
gigabytes per core. Overall, the study identifies the
HPC applications and use cases with memory footprints
that could be provided by 3D-stacked memory chiplets,
making a first step toward adoption of this novel
technology in the HPC domain.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zheng:2017:WAD,
author = "Wenguang Zheng and Hui Wu and Qing Yang",
title = "{WCET}-Aware Dynamic {I}-Cache Locking for a Single
Task",
journal = j-TACO,
volume = "14",
number = "1",
pages = "4:1--4:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3046683",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:58 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Caches are widely used in embedded systems to bridge
the increasing speed gap between processors and
off-chip memory. However, caches make it significantly
harder to compute the worst-case execution time (WCET)
of a task. To alleviate this problem, cache locking has
been proposed. We investigate the WCET-aware I-cache
locking problem and propose a novel dynamic I-cache
locking heuristic approach for reducing the WCET of a
task. For a nonnested loop, our approach aims at
selecting a minimum set of memory blocks of the loop as
locked cache contents by using the min-cut algorithm.
For a loop nest, our approach not only aims at
selecting a minimum set of memory blocks of the loop
nest as locked cache contents but also finds a good
loading point for each selected memory block. We
propose two algorithms for finding a good loading point
for each selected memory block, a polynomial-time
heuristic algorithm and an integer linear programming
(ILP)-based algorithm, further reducing the WCET of
each loop nest. We have implemented our approach and
compared it to two state-of-the-art I-cache locking
approaches by using a set of benchmarks from the MRTC
benchmark suite. The experimental results show that the
polynomial-time heuristic algorithm for finding a good
loading point for each selected memory block performs
almost equally as well as the ILP-based algorithm.
Compared to the partial locking approach proposed in
Ding et al. [2012], our approach using the heuristic
algorithm achieves the average improvements of 33\%,
15\%, 9\%, 3\%, 8\%, and 11\% for the 256B, 512B, 1KB,
4KB, 8KB, and 16KB caches, respectively. Compared to
the dynamic locking approach proposed in Puaut [2006],
it achieves the average improvements of 9\%, 19\%,
18\%, 5\%, 11\%, and 16\% for the 256B, 512B, 1KB, 4KB,
8KB, and 16KB caches, respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yang:2017:EJV,
author = "Byung-Sun Yang and Jae-Yun Kim and Soo-Mook Moon",
title = "Exceptionization: a {Java} {VM} Optimization for
Non-{Java} Languages",
journal = j-TACO,
volume = "14",
number = "1",
pages = "5:1--5:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3046681",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:58 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "Java virtual machine (JVM) has recently evolved into a
general-purpose language runtime environment to execute
popular programming languages such as JavaScript, Ruby,
Python, and Scala. These languages have complex
non-Java features, including dynamic typing and
first-class function, so additional language runtimes
(engines) are provided on top of the JVM to support
them with bytecode extensions. Although there are
high-performance JVMs with powerful just-in-time (JIT)
compilers, running these languages efficiently on the
JVM is still a challenge. This article introduces a
simple and novel technique for the JVM JIT compiler
called exceptionization to improve the performance of
JVM-based language runtimes. We observed that the JVM
executing some non-Java languages encounters at least 2
times more branch bytecodes than Java, most of which
are highly biased to take only one target.
Exceptionization treats such a highly biased branch as
some implicit exception-throwing instruction. This
allows the JVM JIT compiler to prune the infrequent
target of the branch from the frequent control flow,
thus compiling the frequent control flow more
aggressively with better optimization. If a pruned path
were taken, then it would run like a Java exception
handler, that is, a catch block. We also devised
de-exceptionization, a mechanism to cope with the case
when a pruned path is executed more often than
expected. Since exceptionization is a generic JVM
optimization, independent of any specific language
runtime, it would be generally applicable to other
language runtimes on the JVM. Our experimental result
shows that exceptionization accelerates the performance
of several non-Java languages. For example,
JavaScript-on-JVM runs faster by as much as 60\% and by
6\% on average, when experimented with the Octane
benchmark suite on Oracle's latest Nashorn JavaScript
engine and HotSpot 1.9 JVM. Furthermore, the
performance of Ruby-on-JVM shows an improvement by as
much as 60\% and by 6\% on average, while Python-on-JVM
improves by as much as 6\% and by 2\% on average. We
found that exceptionization is more effective to apply
to the branch bytecode of the language runtime itself
than the bytecode corresponding to the application code
or the bytecode of the Java class libraries. This
implies that the performance benefit of
exceptionization comes from better JIT compilation of
the language runtime of non-Java languages.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sen:2017:PGE,
author = "Rathijit Sen and David A. Wood",
title = "{Pareto} Governors for Energy-Optimal Computing",
journal = j-TACO,
volume = "14",
number = "1",
pages = "6:1--6:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3046682",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:58 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The original definition of energy-proportional
computing does not characterize the energy efficiency
of recent reconfigurable computers, resulting in
nonintuitive ``super-proportional'' behavior. This
article introduces a new definition of ideal
energy-proportional computing, new metrics to quantify
computational energy waste, and new SLA-aware OS
governors that seek Pareto optimality to achieve
power-efficient performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chaudhuri:2017:MSC,
author = "Mainak Chaudhuri and Mukesh Agrawal and Jayesh Gaur
and Sreenivas Subramoney",
title = "Micro-Sector Cache: Improving Space Utilization in
Sectored {DRAM} Caches",
journal = j-TACO,
volume = "14",
number = "1",
pages = "7:1--7:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3046680",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:58 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Recent research proposals on DRAM caches with
conventional allocation units (64 or 128 bytes) as well
as large allocation units (512 bytes to 4KB) have
explored ways to minimize the space/latency impact of
the tag store and maximize the effective utilization of
the bandwidth. In this article, we study sectored DRAM
caches that exercise large allocation units called
sectors, invest reasonably small storage to maintain
tag/state, enable space- and bandwidth-efficient
tag/state caching due to low tag working set size and
large data coverage per tag element, and minimize main
memory bandwidth wastage by fetching only the useful
portions of an allocated sector. However, the sectored
caches suffer from poor space utilization, since a
large sector is always allocated even if the sector
utilization is low. The recently proposed Unison cache
addresses only a special case of this problem by not
allocating the sectors that have only one active block.
We propose Micro-sector cache, a locality-aware
sectored DRAM cache architecture that features a
flexible mechanism to allocate cache blocks within a
sector and a locality-aware sector replacement
algorithm. Simulation studies on a set of 30 16-way
multi-programmed workloads show that our proposal, when
incorporated in an optimized Unison cache baseline,
improves performance (weighted speedup) by 8\%, 14\%,
and 16\% on average, respectively, for 1KB, 2KB, and
4KB sectors at 128MB capacity. These performance
improvements result from significantly better cache
space utilization, leading to 18\%, 21\%, and 22\%
average reduction in DRAM cache read misses,
respectively, for 1KB, 2KB, and 4KB sectors at 128MB
capacity. We evaluate our proposal for DRAM cache
capacities ranging from 128MB to 1GB.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Georgiou:2017:ETD,
author = "Kyriakos Georgiou and Steve Kerrison and Zbigniew
Chamski and Kerstin Eder",
title = "Energy Transparency for Deeply Embedded Programs",
journal = j-TACO,
volume = "14",
number = "1",
pages = "8:1--8:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3046679",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:58 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Energy transparency is a concept that makes a
program's energy consumption visible, from hardware up
to software, through the different system layers. Such
transparency can enable energy optimizations at each
layer and between layers, as well as help both
programmers and operating systems make energy-aware
decisions. In this article, we focus on deeply embedded
devices, typically used for Internet of Things (IoT)
applications, and demonstrate how to enable energy
transparency through existing static resource analysis
(SRA) techniques and a new target-agnostic profiling
technique, without hardware energy measurements. Our
novel mapping technique enables software energy
consumption estimations at a higher level than the
Instruction Set Architecture (ISA), namely the LLVM
intermediate representation (IR) level, and therefore
introduces energy transparency directly to the LLVM
optimizer. We apply our energy estimation techniques to
a comprehensive set of benchmarks, including single-
and multithreaded embedded programs from two commonly
used concurrency patterns: task farms and pipelines.
Using SRA, our LLVM IR results demonstrate a high
accuracy with a deviation in the range of 1\% from the
ISA SRA. Our profiling technique captures the actual
energy consumption at the LLVM IR level with an average
error of 3\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2017:LLO,
author = "Pengcheng Li and Xiaoyu Hu and Dong Chen and Jacob
Brock and Hao Luo and Eddy Z. Zhang and Chen Ding",
title = "{LD}: Low-Overhead {GPU} Race Detection Without Access
Monitoring",
journal = j-TACO,
volume = "14",
number = "1",
pages = "9:1--9:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3046678",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:58 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Data race detection has become an important problem in
GPU programming. Previous designs of CPU race-checking
tools are mainly task parallel and incur high overhead
on GPUs due to access instrumentation, especially when
monitoring many thousands of threads routinely used by
GPU programs. This article presents a novel
data-parallel solution designed and optimized for the
GPU architecture. It includes compiler support and a
set of runtime techniques. It uses value-based
checking, which detects the races reported in previous
work, finds new races, and supports race-free
deterministic GPU execution. More important, race
checking is massively data parallel and does not
introduce divergent branching or atomic
synchronization. Its slowdown is less than $ 5 \times $
for over half of the tests and $ 10 \times $ on
average, which is orders of magnitude more efficient
than the cuda-memcheck tool by Nvidia and the methods
that use fine-grained access instrumentation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Palangappa:2017:CCE,
author = "Poovaiah M. Palangappa and Kartik Mohanram",
title = "{CompEx++}: Compression-Expansion Coding for Energy,
Latency, and Lifetime Improvements in {MLC\slash TLC
NVMs}",
journal = j-TACO,
volume = "14",
number = "1",
pages = "10:1--10:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3050440",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:58 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Multilevel/triple-level cell nonvolatile memories
(MLC/TLC NVMs) such as phase-change memory (PCM) and
resistive RAM (RRAM) are the subject of active research
and development as replacement candidates for DRAM,
which is limited by its high refresh power and poor
scaling potential. In addition to the benefits of
nonvolatility (low refresh power) and improved
scalability, MLC/TLC NVMs offer high data density and
memory capacity over DRAM. However, the viability of
MLC/TLC NVMs is limited primarily due to the high
programming energy and latency as well as the low
endurance of NVM cells; these are primarily attributed
to the iterative program-and-verify procedure necessary
for programming the NVM cells. This article proposes
compression-expansion (CompEx) coding, a low overhead
scheme that synergistically integrates pattern-based
compression with expansion coding to realize
simultaneous energy, latency, and lifetime improvements
in MLC/TLC NVMs. CompEx coding is agnostic to the
choice of compression technique; in this work, we
evaluate CompEx coding using both frequent pattern
compression (FPC) and base-delta-immediate $ (B \Delta
I) $ compression. CompEx coding integrates FPC/$ B
\Delta I $ with $ (k, m)_q $ ``expansion'' coding;
expansion codes are a class of $q$-ary linear block
codes that encode data using only the low energy states
of a $q$-ary NVM cell. CompEx coding simultaneously
reduces energy and latency and improves lifetime for
negligible-to-no memory overhead and negligible logic
overhead ( \approx 10k gates, which is $ < 0.1 \% $ per
NVM module). Furthermore, we also propose CompEx++
coding, which extends CompEx coding by leveraging the
variable compressibility of pattern-based compression
techniques. CompEx++ coding integrates custom expansion
codes to each of the compression patterns to exploit
maximum energy/latency benefits of CompEx coding. Our
full-system simulations using TLC RRAM show that
CompEx/CompEx++ coding reduces total memory energy by
57\%/61\% and write latency by 23.5\%/26\%; these
improvements translate to a 5.7\%/10.6\% improvement in
IPC, a 11.8\%/19.9\% improvement in main memory
bandwidth, and $ 1.8 \times $ improvement in lifetime
over classical binary coding using data-comparison
write. CompEx/CompEx++ coding thus addresses the
programming energy/latency and lifetime challenges of
MLC/TLC NVMs that pose a serious technological
roadblock to their adoption in high-performance
computing systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lee:2017:DBT,
author = "Dongwoo Lee and Sangheon Lee and Soojung Ryu and
Kiyoung Choi",
title = "Dirty-Block Tracking in a Direct-Mapped {DRAM} Cache
with Self-Balancing Dispatch",
journal = j-TACO,
volume = "14",
number = "2",
pages = "11:1--11:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3068460",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:59 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Recently, processors have begun integrating 3D stacked
DRAMs with the cores on the same package, and there
have been several approaches to effectively utilizing
the on-package DRAMs as caches. This article presents
an approach that combines the previous approaches in a
synergistic way by devising a module called the
dirty-block tracker to maintain the dirtiness of each
block in a dirty region. The approach avoids
unnecessary tag checking for a write operation if the
corresponding block in the cache is not dirty. Our
simulation results show that the proposed technique
achieves a 10.3\% performance improvement on average
over the state-of-the-art DRAM cache technique.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Parasyris:2017:SAP,
author = "Konstantinos Parasyris and Vassilis Vassiliadis and
Christos D. Antonopoulos and Spyros Lalis and Nikolaos
Bellas",
title = "Significance-Aware Program Execution on Unreliable
Hardware",
journal = j-TACO,
volume = "14",
number = "2",
pages = "12:1--12:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3058980",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:59 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article introduces a significance-centric
programming model and runtime support that sets the
supply voltage in a multicore CPU to sub-nominal values
to reduce the energy footprint and provide mechanisms
to control output quality. The developers specify the
significance of application tasks respecting their
contribution to the output quality and provide check
and repair functions for handling faults. On a
multicore system, we evaluate five benchmarks using an
energy model that quantifies the energy reduction. When
executing the least-significant tasks unreliably, our
approach leads to 20\% CPU energy reduction with
respect to a reliable execution and has minimal quality
degradation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mendonca:2017:DAA,
author = "Gleison Mendon{\c{c}}a and Breno Guimar{\~a}es and
P{\'e}ricles Alves and M{\'a}rcio Pereira and Guido
Ara{\'u}jo and Fernando Magno Quint{\~a}o Pereira",
title = "{DawnCC}: Automatic Annotation for Data Parallelism
and Offloading",
journal = j-TACO,
volume = "14",
number = "2",
pages = "13:1--13:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3084540",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:59 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Directive-based programming models, such as OpenACC
and OpenMP, allow developers to convert a sequential
program into a parallel one with minimum human
intervention. However, inserting pragmas into
production code is a difficult and error-prone task,
often requiring familiarity with the target program.
This difficulty restricts the ability of developers to
annotate code that they have not written themselves.
This article provides a suite of compiler-related
methods to mitigate this problem. Such techniques rely
on symbolic range analysis, a well-known static
technique, to achieve two purposes: populate source
code with data transfer primitives and to disambiguate
pointers that could hinder automatic parallelization
due to aliasing. We have materialized our ideas into a
tool, DawnCC, which can be used stand-alone or through
an online interface. To demonstrate its effectiveness,
we show how DawnCC can annotate the programs available
in PolyBench without any intervention from users. Such
annotations lead to speedups of over $ 100 \times $ in
an Nvidia architecture and over $ 50 \times $ in an ARM
architecture.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Balasubramonian:2017:CNT,
author = "Rajeev Balasubramonian and Andrew B. Kahng and Naveen
Muralimanohar and Ali Shafiee and Vaishnav Srinivas",
title = "{CACTI 7}: New Tools for Interconnect Exploration in
Innovative Off-Chip Memories",
journal = j-TACO,
volume = "14",
number = "2",
pages = "14:1--14:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3085572",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:59 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Historically, server designers have opted for simple
memory systems by picking one of a few commoditized DDR
memory products. We are already witnessing a major
upheaval in the off-chip memory hierarchy, with the
introduction of many new memory
products-buffer-on-board, LRDIMM, HMC, HBM, and NVMs,
to name a few. Given the plethora of choices, it is
expected that different vendors will adopt different
strategies for their high-capacity memory systems,
often deviating from DDR standards and/or integrating
new functionality within memory systems. These
strategies will likely differ in their choice of
interconnect and topology, with a significant fraction
of memory energy being dissipated in I/O and data
movement. To make the case for memory interconnect
specialization, this paper makes three contributions.
First, we design a tool that carefully models I/O power
in the memory system, explores the design space, and
gives the user the ability to define new types of
memory interconnects/topologies. The tool is validated
against SPICE models, and is integrated into version 7
of the popular CACTI package. Our analysis with the
tool shows that several design parameters have a
significant impact on I/O power. We then use the tool
to help craft novel specialized memory system channels.
We introduce a new relay-on-board chip that partitions
a DDR channel into multiple cascaded channels. We show
that this simple change to the channel topology can
improve performance by 22\% for DDR DRAM and lower cost
by up to 65\% for DDR DRAM. This new architecture does
not require any changes to DIMMs, and it efficiently
supports hybrid DRAM/NVM systems. Finally, as an
example of a more disruptive architecture, we design a
custom DIMM and parallel bus that moves away from the
DDR3/DDR4 standards. To reduce energy and improve
performance, the baseline data channel is split into
three narrow parallel channels and the on-DIMM
interconnects are operated at a lower frequency. In
addition, this allows us to design a two-tier error
protection strategy that reduces data transfers on the
interconnect. This architecture yields a performance
improvement of 18\% and a memory power reduction of
23\%. The cascaded channel and narrow channel
architectures serve as case studies for the new tool
and show the potential for benefit from re-organizing
basic memory interconnects.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jatala:2017:SSG,
author = "Vishwesh Jatala and Jayvant Anantpur and Amey
Karkare",
title = "Scratchpad Sharing in {GPUs}",
journal = j-TACO,
volume = "14",
number = "2",
pages = "15:1--15:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3075619",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:59 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "General-Purpose Graphics Processing Unit (GPGPU)
applications exploit on-chip scratchpad memory
available in the Graphics Processing Units (GPUs) to
improve performance. The amount of thread level
parallelism (TLP) present in the GPU is limited by the
number of resident threads, which in turn depends on
the availability of scratchpad memory in its streaming
multiprocessor (SM). Since the scratchpad memory is
allocated at thread block granularity, part of the
memory may remain unutilized. In this article, we
propose architectural and compiler optimizations to
improve the scratchpad memory utilization. Our
approach, called Scratchpad Sharing, addresses
scratchpad under-utilization by launching additional
thread blocks in each SM. These thread blocks use
unutilized scratchpad memory and also share scratchpad
memory with other resident blocks. To improve the
performance of scratchpad sharing, we propose Owner
Warp First (OWF) scheduling that schedules warps from
the additional thread blocks effectively. The
performance of this approach, however, is limited by
the availability of the part of scratchpad memory that
is shared among thread blocks. We propose compiler
optimizations to improve the availability of shared
scratchpad memory. We describe an allocation scheme
that helps in allocating scratchpad variables such that
shared scratchpad is accessed for short duration. We
introduce a new hardware instruction, relssp, that when
executed releases the shared scratchpad memory.
Finally, we describe an analysis for optimal placement
of relssp instructions, such that shared scratchpad
memory is released as early as possible, but only after
its last use, along every execution path. We
implemented the hardware changes required for
scratchpad sharing and the relssp instruction using the
GPGPU-Sim simulator and implemented the compiler
optimizations in Ocelot framework. We evaluated the
effectiveness of our approach on 19 kernels from 3
benchmarks suites: CUDA-SDK, GPGPU-Sim, and Rodinia.
The kernels that under-utilize scratchpad memory show
an average improvement of 19\% and maximum improvement
of 92.17\% in terms of the number of instruction
executed per cycle when compared to the baseline
approach, without affecting the performance of the
kernels that are not limited by scratchpad memory.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ham:2017:DDS,
author = "Tae Jun Ham and Juan L. Arag{\'o}n and Margaret
Martonosi",
title = "Decoupling Data Supply from Computation for
Latency-Tolerant Communication in Heterogeneous
Architectures",
journal = j-TACO,
volume = "14",
number = "2",
pages = "16:1--16:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3075620",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:59 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In today's computers, heterogeneous processing is used
to meet performance targets at manageable power. In
adopting increased compute specialization, however, the
relative amount of time spent on communication
increases. System and software optimizations for
communication often come at the costs of increased
complexity and reduced portability. The Decoupled
Supply-Compute (DeSC) approach offers a way to attack
communication latency bottlenecks automatically, while
maintaining good portability and low complexity. Our
work expands prior Decoupled Access Execute techniques
with hardware/software specialization. For a range of
workloads, DeSC offers roughly 2 $ \times $ speedup,
and additional specialized compression optimizations
reduce traffic between decoupled units by 40\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Stanic:2017:IVS,
author = "Milan Stanic and Oscar Palomar and Timothy Hayes and
Ivan Ratkovic and Adrian Cristal and Osman Unsal and
Mateo Valero",
title = "An Integrated Vector-Scalar Design on an In-Order
{ARM} Core",
journal = j-TACO,
volume = "14",
number = "2",
pages = "17:1--17:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3075618",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:59 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In the low-end mobile processor market, power, energy,
and area budgets are significantly lower than in the
server/desktop/laptop/high-end mobile markets. It has
been shown that vector processors are a highly
energy-efficient way to increase performance; however,
adding support for them incurs area and power overheads
that would not be acceptable for low-end mobile
processors. In this work, we propose an integrated
vector-scalar design for the ARM architecture that
mostly reuses scalar hardware to support the execution
of vector instructions. The key element of the design
is our proposed block-based model of execution that
groups vector computational instructions together to
execute them in a coordinated manner. We implemented a
classic vector unit and compare its results against our
integrated design. Our integrated design improves the
performance (more than $ 6 \times $) and energy
consumption (up to $ 5 \times $) of a scalar in-order
core with negligible area overhead (only 4.7\% when
using a vector register with 32 elements). In contrast,
the area overhead of the classic vector unit can be
significant (around 44\%) if a dedicated vector
floating-point unit is incorporated. Our block-based
vector execution outperforms the classic vector unit
for all kernels with floating-point data and also
consumes less energy. We also complement the integrated
design with three energy/performance-efficient
techniques that further reduce power and increase
performance. The first proposal covers the design and
implementation of chaining logic that is optimized to
work with the cache hierarchy through vector memory
instructions, the second proposal reduces the number of
reads/writes from/to the vector register file, and the
third idea optimizes complex memory access patterns
with the memory shape instruction and unified indexed
vector load.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Endo:2017:IBV,
author = "Fernando A. Endo and Arthur Perais and Andr{\'e}
Seznec",
title = "On the Interactions Between Value Prediction and
Compiler Optimizations in the Context of {EOLE}",
journal = j-TACO,
volume = "14",
number = "2",
pages = "18:1--18:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3090634",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:59 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Increasing instruction-level parallelism is regaining
attractiveness within the microprocessor industry. The
{Early | Out-of-order | Late} Execution (EOLE)
microarchitecture and Differential Value TAgged
GEometric (D-VTAGE) value predictor were recently
introduced to solve practical issues of Value
Prediction (VP). In particular, they remove the most
significant difficulties that forbade an effective VP
hardware. In this study, we present a detailed
evaluation of the potential of VP in the context of
EOLE/D-VTAGE and different compiler options. Our study
shows that if no single general rule always
applies-more optimization might sometimes lead to more
performance-unoptimized codes often get a large benefit
from the prediction of redundant loads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sridharan:2017:BPP,
author = "Aswinkumar Sridharan and Biswabandan Panda and Andre
Seznec",
title = "Band-Pass Prefetching: an Effective Prefetch
Management Mechanism Using Prefetch-Fraction Metric in
Multi-Core Systems",
journal = j-TACO,
volume = "14",
number = "2",
pages = "19:1--19:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3090635",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:59 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In multi-core systems, an application's prefetcher can
interfere with the memory requests of other
applications using the shared resources, such as last
level cache and memory bandwidth. In order to minimize
prefetcher-caused interference, prior mechanisms have
been proposed to dynamically control prefetcher
aggressiveness at runtime. These mechanisms use several
parameters to capture prefetch usefulness as well as
prefetcher-caused interference, performing aggressive
control decisions. However, these mechanisms do not
capture the actual interference at the shared resources
and most often lead to incorrect aggressiveness control
decisions. Therefore, prior works leave scope for
performance improvement. Toward this end, we propose a
solution to manage prefetching in multicore systems. In
particular, we make two fundamental observations:
First, a positive correlation exists between the
accuracy of a prefetcher and the amount of prefetch
requests it generates relative to an application's
total (demand and prefetch) requests. Second, a strong
positive correlation exists between the ratio of total
prefetch to demand requests and the ratio of average
last level cache miss service times of demand to
prefetch requests. In this article, we propose
Band-pass prefetching that builds on those two
observations, a simple and low-overhead mechanism to
effectively manage prefetchers in multicore systems.
Our solution consists of local and global prefetcher
aggressiveness control components, which altogether,
control the flow of prefetch requests between a range
of prefetch to demand requests ratios. From our
experiments on 16-core multi-programmed workloads, on
systems using stream prefetching, we observe that
Band-pass prefetching achieves 12.4\% (geometric-mean)
improvement on harmonic speedup over the baseline that
implements no prefetching, while aggressive prefetching
without prefetcher aggressiveness control and
state-of-the-art HPAC, P-FST, and CAFFEINE achieve
8.2\%, 8.4\%, 1.4\%, and 9.7\%, respectively. Further
evaluation of the proposed Band-pass prefetching
mechanism on systems using AMPM prefetcher shows
similar performance trends. For a 16-core system,
Band-pass prefetching requires only a modest hardware
cost of 239 bytes.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Goens:2017:SSS,
author = "Andr{\'e}s Goens and Sergio Siccha and Jeronimo
Castrillon",
title = "Symmetry in Software Synthesis",
journal = j-TACO,
volume = "14",
number = "2",
pages = "20:1--20:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3095747",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:59 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With the surge of multi- and many-core systems, much
research has focused on algorithms for mapping and
scheduling on these complex platforms. Large classes of
these algorithms face scalability problems. This is why
diverse methods are commonly used for reducing the
search space. While most such approaches leverage the
inherent symmetry of architectures and applications,
they do it in a problem-specific and intuitive way.
However, intuitive approaches become impractical with
growing hardware complexity, like Network-on-Chip
interconnect or heterogeneous cores. In this article,
we present a formal framework that can determine the
inherent local and global symmetry of architectures and
applications algorithmically and leverage these for
problems in software synthesis. Our approach is based
on the mathematical theory of groups and a
generalization called inverse semigroups. We evaluate
our approach in two state-of-the-art mapping
frameworks. Even for the platforms with a handful of
cores of today and moderate-sized benchmarks, our
approach consistently yields reductions of the overall
execution time of algorithms. We obtain a speedup of
more than $ 10 \times $ for one use-case and saved 10\%
of time in another.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Vocke:2017:EHI,
author = "Sander Vocke and Henk Corporaal and Roel Jordans and
Rosilde Corvino and Rick Nas",
title = "Extending {Halide} to Improve Software Development for
Imaging {DSPs}",
journal = j-TACO,
volume = "14",
number = "3",
pages = "21:1--21:??",
month = sep,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3106343",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Sep 6 17:12:05 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Specialized Digital Signal Processors (DSPs), which
can be found in a wide range of modern devices, play an
important role in power-efficient, high-performance
image processing. Applications including camera sensor
post-processing and computer vision benefit from being
(partially) mapped onto such DSPs. However, due to
their specialized instruction sets and dependence on
low-level code optimization, developing applications
for DSPs is more time-consuming and error-prone than
for general-purpose processors. Halide is a
domain-specific language (DSL) that enables low-effort
development of portable, high-performance imaging
pipelines-a combination of qualities that is hard, if
not impossible, to find among DSP programming models.
We propose a set of extensions and modifications to
Halide to generate code for DSP C compilers, focusing
specifically on diverse SIMD target instruction sets
and heterogeneous scratchpad memory hierarchies. We
implement said techniques for a commercial DSP found in
an Intel Image Processing Unit (IPU), demonstrating
that this solution can be used to achieve performance
within 20\% of highly tuned, manually written C code,
while leading to a reduction in code complexity. By
comparing performance of Halide algorithms using our
solution to results on CPU and GPU targets, we confirm
the value of using DSP targets with Halide.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jensen:2017:ILD,
author = "Nicklas Bo Jensen and Sven Karlsson",
title = "Improving Loop Dependence Analysis",
journal = j-TACO,
volume = "14",
number = "3",
pages = "22:1--22:??",
month = sep,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3095754",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Sep 6 17:12:05 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Programmers can no longer depend on new processors to
have significantly improved single-thread performance.
Instead, gains have to come from other sources such as
the compiler and its optimization passes. Advanced
passes make use of information on the dependencies
related to loops. We improve the quality of that
information by reusing the information given by the
programmer for parallelization. We have implemented a
prototype based on GCC into which we also add a new
optimization pass. Our approach improves the amount of
correctly classified dependencies resulting in 46\%
average improvement in single-thread performance for
kernel benchmarks compared to GCC 6.1.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "22",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ganser:2017:ISO,
author = "Stefan Ganser and Armin Gr{\"o}sslinger and Norbert
Siegmund and Sven Apel and Christian Lengauer",
title = "Iterative Schedule Optimization for Parallelization in
the Polyhedron Model",
journal = j-TACO,
volume = "14",
number = "3",
pages = "23:1--23:??",
month = sep,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3109482",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Sep 6 17:12:05 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The polyhedron model is a powerful model to identify
and apply systematically loop transformations that
improve data locality (e.g., via tiling) and enable
parallelization. In the polyhedron model, a loop
transformation is, essentially, represented as an
affine function. Well-established algorithms for the
discovery of promising transformations are based on
performance models. These algorithms have the drawback
of not being easily adaptable to the characteristics of
a specific program or target hardware. An iterative
search for promising loop transformations is more
easily adaptable and can help to learn better models.
We present an iterative optimization method in the
polyhedron model that targets tiling and
parallelization. The method enables either a sampling
of the search space of legal loop transformations at
random or a more directed search via a genetic
algorithm. For the latter, we propose a set of novel,
tailored reproduction operators. We evaluate our
approach against existing iterative and model-driven
optimization strategies. We compare the convergence
rate of our genetic algorithm to that of random
exploration. Our approach of iterative optimization
outperforms existing optimization techniques in that it
finds loop transformations that yield significantly
higher performance. If well configured, then random
exploration turns out to be very effective and reduces
the need for a genetic algorithm.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wei:2017:HHM,
author = "Wei Wei and Dejun Jiang and Jin Xiong and Mingyu
Chen",
title = "{HAP}: Hybrid-Memory-Aware Partition in Shared
Last-Level Cache",
journal = j-TACO,
volume = "14",
number = "3",
pages = "24:1--24:??",
month = sep,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3106340",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Sep 6 17:12:05 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Data-center servers benefit from large-capacity memory
systems to run multiple processes simultaneously.
Hybrid DRAM-NVM memory is attractive for increasing
memory capacity by exploiting the scalability of
Non-Volatile Memory (NVM). However, current LLC
policies are unaware of hybrid memory. Cache misses to
NVM introduce high cost due to long NVM latency.
Moreover, evicting dirty NVM data suffer from long
write latency. We propose hybrid memory aware cache
partitioning to dynamically adjust cache spaces and
give NVM dirty data more chances to reside in LLC.
Experimental results show Hybrid-memory-Aware Partition
(HAP) improves performance by 46.7\% and reduces energy
consumption by 21.9\% on average against LRU
management. Moreover, HAP averagely improves
performance by 9.3\% and reduces energy consumption by
6.4\% against a state-of-the-art cache mechanism.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xiong:2017:PPP,
author = "Dongliang Xiong and Kai Huang and Xiaowen Jiang and
Xiaolang Yan",
title = "Providing Predictable Performance via a Slowdown
Estimation Model",
journal = j-TACO,
volume = "14",
number = "3",
pages = "25:1--25:??",
month = sep,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3124451",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Sep 6 17:12:05 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Interapplication interference at shared main memory
slows down different applications differently. A few
slowdown estimation models have been proposed to
provide predictable performance by quantifying memory
interference, but they have relatively low accuracy.
Thus, we propose a more accurate slowdown estimation
model called SEM at main memory. First, SEM unifies the
slowdown estimation model by measuring IPC directly.
Second, SEM uses the per-bank structure to monitor
memory interference and improves estimation accuracy by
considering write interference, row-buffer
interference, and data bus interference. The evaluation
results show that SEM has significantly lower slowdown
estimation error (4.06\%) compared to STFM (30.15\%)
and MISE (10.1\%).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "25",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pu:2017:PHS,
author = "Jing Pu and Steven Bell and Xuan Yang and Jeff Setter
and Stephen Richardson and Jonathan Ragan-Kelley and
Mark Horowitz",
title = "Programming Heterogeneous Systems from an Image
Processing {DSL}",
journal = j-TACO,
volume = "14",
number = "3",
pages = "26:1--26:??",
month = sep,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3107953",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Sep 6 17:12:05 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Specialized image processing accelerators are
necessary to deliver the performance and energy
efficiency required by important applications in
computer vision, computational photography, and
augmented reality. But creating, ``programming,'' and
integrating this hardware into a hardware/software
system is difficult. We address this problem by
extending the image processing language Halide so users
can specify which portions of their applications should
become hardware accelerators, and then we provide a
compiler that uses this code to automatically create
the accelerator along with the ``glue'' code needed for
the user's application to access this hardware.
Starting with Halide not only provides a very
high-level functional description of the hardware but
also allows our compiler to generate a complete
software application, which accesses the hardware for
acceleration when appropriate. Our system also provides
high-level semantics to explore different mappings of
applications to a heterogeneous system, including the
flexibility of being able to change the throughput rate
of the generated hardware. We demonstrate our approach
by mapping applications to a commercial Xilinx Zynq
system. Using its FPGA with two low-power ARM cores,
our design achieves up to 6$ \times $ higher
performance and 38$ \times $ lower energy compared to
the quad-core ARM CPU on an NVIDIA Tegra K1, and 3.5$
\times $ higher performance with 12$ \times $ lower
energy compared to the K1's 192-core GPU.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "26",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hroub:2017:EGC,
author = "Ayman Hroub and M. E. S. Elrabaa and M. F. Mudawar and
A. Khayyat",
title = "Efficient Generation of Compact Execution Traces for
Multicore Architectural Simulations",
journal = j-TACO,
volume = "14",
number = "3",
pages = "27:1--27:??",
month = sep,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3106342",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Sep 6 17:12:05 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Requiring no functional simulation, trace-driven
simulation has the potential of achieving faster
simulation speeds than execution-driven simulation of
multicore architectures. An efficient, on-the-fly,
high-fidelity trace generation method for multithreaded
applications is reported. The generated trace is
encoded in an instruction-like binary format that can
be directly ``interpreted'' by a timing simulator to
simulate a general load/store or x8-like architecture.
A complete tool suite that has been developed and used
for evaluation of the proposed method showed that it
produces smaller traces over existing trace compression
methods while retaining good fidelity including all
threading- and synchronization-related events.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Weber:2017:MAL,
author = "Nicolas Weber and Michael Goesele",
title = "{MATOG}: Array Layout Auto-Tuning for {CUDA}",
journal = j-TACO,
volume = "14",
number = "3",
pages = "28:1--28:??",
month = sep,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3106341",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Sep 6 17:12:05 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Optimal code performance is (besides correctness and
accuracy) the most important objective in compute
intensive applications. In many of these applications,
Graphic Processing Units (GPUs) are used because of
their high amount of compute power. However, caused by
their massively parallel architecture, the code has to
be specifically adjusted to the underlying hardware to
achieve optimal performance and therefore has to be
reoptimized for each new generation. In reality, this
is usually not the case as productive code is normally
at least several years old and nobody has the time to
continuously adjust existing code to new hardware. In
recent years more and more approaches have emerged that
automatically tune the performance of applications
toward the underlying hardware. In this article, we
present the MATOG auto-tuner and its concepts. It
abstracts the array memory access in CUDA applications
and automatically optimizes the code according to the
used GPUs. MATOG only requires few profiling runs to
analyze even complex applications, while achieving
significant speedups over non-optimized code,
independent of the used GPU generation and without the
need to manually tune the code.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "28",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ashouri:2017:MMC,
author = "Amir H. Ashouri and Andrea Bignoli and Gianluca
Palermo and Cristina Silvano and Sameer Kulkarni and
John Cavazos",
title = "{MiCOMP}: Mitigating the Compiler Phase-Ordering
Problem Using Optimization Sub-Sequences and Machine
Learning",
journal = j-TACO,
volume = "14",
number = "3",
pages = "29:1--29:??",
month = sep,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3124452",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Sep 6 17:12:05 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Recent compilers offer a vast number of multilayered
optimizations targeting different code segments of an
application. Choosing among these optimizations can
significantly impact the performance of the code being
optimized. The selection of the right set of compiler
optimizations for a particular code segment is a very
hard problem, but finding the best ordering of these
optimizations adds further complexity. Finding the best
ordering represents a long standing problem in
compilation research, named the phase-ordering problem.
The traditional approach of constructing compiler
heuristics to solve this problem simply cannot cope
with the enormous complexity of choosing the right
ordering of optimizations for every code segment in an
application. This article proposes an automatic
optimization framework we call MiCOMP, which Mitigates
the COMpiler Phase-ordering problem. We perform phase
ordering of the optimizations in LLVM's highest
optimization level using optimization sub-sequences and
machine learning. The idea is to cluster the
optimization passes of LLVM's O3 setting into different
clusters to predict the speedup of a complete sequence
of all the optimization clusters instead of having to
deal with the ordering of more than 60 different
individual optimizations. The predictive model uses (1)
dynamic features, (2) an encoded version of the
compiler sequence, and (3) an exploration heuristic to
tackle the problem. Experimental results using the LLVM
compiler framework and the Cbench suite show the
effectiveness of the proposed clustering and encoding
techniques to application-based reordering of passes,
while using a number of predictive models. We perform
statistical analysis on the results and compare against
(1) random iterative compilation, (2) standard
optimization levels, and (3) two recent prediction
approaches. We show that MiCOMP's iterative compilation
using its sub-sequences can reach an average
performance speedup of 1.31 (up to 1.51). Additionally,
we demonstrate that MiCOMP's prediction model
outperforms the -O1, -O2, and -O3 optimization levels
within using just a few predictions and reduces the
prediction error rate down to only 5\%. Overall, it
achieves 90\% of the available speedup by exploring
less than 0.001\% of the optimization space.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "29",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Vermij:2017:AIN,
author = "Erik Vermij and Leandro Fiorin and Rik Jongerius and
Christoph Hagleitner and Jan {Van Lunteren} and Koen
Bertels",
title = "An Architecture for Integrated Near-Data Processors",
journal = j-TACO,
volume = "14",
number = "3",
pages = "30:1--30:??",
month = sep,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3127069",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Sep 6 17:12:05 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "To increase the performance of data-intensive
applications, we present an extension to a CPU
architecture that enables arbitrary near-data
processing capabilities close to the main memory. This
is realized by introducing a component attached to the
CPU system-bus and a component at the memory side.
Together they support hardware-managed coherence and
virtual memory support to integrate the near-data
processors in a shared-memory environment. We present
an implementation of the components, as well as a
system-simulator, providing detailed performance
estimations. With a variety of synthetic workloads we
demonstrate the performance of the memory accesses, the
mixed fine- and coarse-grained coherence mechanisms,
and the near-data processor communication mechanism.
Furthermore, we quantify the inevitable start-up
penalty regarding coherence and data writeback, and
argue that near-data processing workloads should access
data several times to offset this penalty. A case study
based on the Graph500 benchmark confirms the small
overhead for the proposed coherence mechanisms and
shows the ability to outperform a real CPU by a factor
of two.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "30",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Diavastos:2017:SLR,
author = "Andreas Diavastos and Pedro Trancoso",
title = "{SWITCHES}: a Lightweight Runtime for Dataflow
Execution of Tasks on Many-Cores",
journal = j-TACO,
volume = "14",
number = "3",
pages = "31:1--31:??",
month = sep,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3127068",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Sep 6 17:12:05 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "SWITCHES is a task-based dataflow runtime that
implements a lightweight distributed triggering system
for runtime dependence resolution and uses static
scheduling and compile-time assignment policies to
reduce runtime overheads. Unlike other systems, the
granularity of loop-tasks can be increased to favor
data-locality, even when having dependences across
different loops. SWITCHES introduces explicit task
resource allocation mechanisms for efficient allocation
of resources and adopts the latest OpenMP Application
Programming Interface (API), as to maintain high levels
of programming productivity. It provides a
source-to-source tool that automatically produces
thread-based code. Performance on an Intel Xeon-Phi
shows good scalability and surpasses OpenMP by an
average of 32\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "31",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jain:2017:CMA,
author = "Rahul Jain and Preeti Ranjan Panda and Sreenivas
Subramoney",
title = "Cooperative Multi-Agent Reinforcement Learning-Based
Co-optimization of Cores, Caches, and On-chip Network",
journal = j-TACO,
volume = "14",
number = "4",
pages = "32:1--32:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3132170",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Modern multi-core systems provide huge computational
capabilities, which can be used to run multiple
processes concurrently. To achieve the best possible
performance within limited power budgets, the various
system resources need to be allocated effectively. Any
mismatch between runtime resource requirement and
allocation leads to a sub-optimal energy-delay product
(EDP). Different optimization techniques exist for
addressing the problem of mismatch between the dynamic
requirement and runtime allocation of the system
resources. Choosing between multiple optimizations at
runtime is complex due to the non-additive effects,
making the scenario suitable for the application of
machine learning techniques. We present a novel method,
Machine Learned Machines (MLM), by using online
reinforcement learning (RL) to perform dynamic
partitioning of the last level cache (LLC), along with
dynamic voltage and frequency scaling (DVFS) of the
core and uncore (interconnection network and LLC). We
have proposed and evaluated three different MLM
co-optimization techniques based on independent and
cooperative multi-agent learners. We show that the
co-optimization results in a much lower system EDP than
any of the techniques applied individually. We explore
various RL models targeted toward optimization of
different system metrics and study their effects on a
system EDP, system throughput (STP), and Fairness. The
various proposed techniques have been extensively
evaluated with a mix of 20 workloads on a 4-core system
using Spec2006 benchmarks. We have further evaluated
our cooperative MLM techniques on a 16-core system. The
results show an average of 20.5\% and 19.1\% system EDP
improvement on a 4-core and 16-core system,
respectively, with limited degradation of STP and
Fairness.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "32",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{DeSensi:2017:BPP,
author = "Daniele {De Sensi} and Tiziano {De Matteis} and
Massimo Torquati and Gabriele Mencagli and Marco
Danelutto",
title = "Bringing Parallel Patterns Out of the Corner: The
{P$^3$ARSEC} Benchmark Suite",
journal = j-TACO,
volume = "14",
number = "4",
pages = "33:1--33:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3132710",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "High-level parallel programming is an active research
topic aimed at promoting parallel programming
methodologies that provide the programmer with
high-level abstractions to develop complex parallel
software with reduced time to solution. Pattern-based
parallel programming is based on a set of composable
and customizable parallel patterns used as basic
building blocks in parallel applications. In recent
years, a considerable effort has been made in
empowering this programming model with features able to
overcome shortcomings of early approaches concerning
flexibility and performance. In this article, we
demonstrate that the approach is flexible and efficient
enough by applying it on 12 out of 13 PARSEC
applications. Our analysis, conducted on three
different multicore architectures, demonstrates that
pattern-based parallel programming has reached a good
level of maturity, providing comparable results in
terms of performance with respect to both other
parallel programming methodologies based on
pragma-based annotations (i.e., Open mp and OmpSs) and
native implementations (i.e., Pthreads). Regarding the
programming effort, we also demonstrate a considerable
reduction in lines of code and code churn compared to
Pthreads and comparable results with respect to other
existing implementations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "33",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ye:2017:CES,
author = "Chencheng Ye and Chen Ding and Hao Luo and Jacob Brock
and Dong Chen and Hai Jin",
title = "Cache Exclusivity and Sharing: Theory and
Optimization",
journal = j-TACO,
volume = "14",
number = "4",
pages = "34:1--34:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3134437",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A problem on multicore systems is cache sharing, where
the cache occupancy of a program depends on the cache
usage of peer programs. Exclusive cache hierarchy as
used on AMD processors is an effective solution to
allow processor cores to have a large private cache
while still benefitting from shared cache. The shared
cache stores the ``victims'' (i.e., data evicted from
private caches). The performance depends on how victims
of co-run programs interact in shared cache. This
article presents a new metric called the victim
footprint (VFP). It is measured once per program in its
solo execution and can then be combined to compute the
performance of any exclusive cache hierarchy, replacing
parallel testing with theoretical analysis. The work
evaluates the VFP by using it to analyze cache sharing
by parallel mixes of sequential programs, comparing the
accuracy of the theory to hardware counter results, and
measuring the benefit of exclusivity-aware analysis and
optimization.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "34",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Shrivastava:2017:EEC,
author = "Rahul Shrivastava and V. Krishna Nandivada",
title = "Energy-Efficient Compilation of Irregular
Task-Parallel Loops",
journal = j-TACO,
volume = "14",
number = "4",
pages = "35:1--35:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3136063",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Energy-efficient compilation is an important problem
for multi-core systems. In this context, irregular
programs with task-parallel loops present interesting
challenges: the threads with lesser work-loads (
non-critical -threads) wait at the join-points for the
thread with maximum work-load ( critical -thread); this
leads to significant energy wastage. This problem
becomes more interesting in the context of
multi-socket-multi-core (MSMC) systems, where different
sockets may run at different frequencies, but all the
cores connected to a socket run at a single frequency.
In such a configuration, even though the load-imbalance
among the cores may be significant, an MSMC-oblivious
technique may miss the opportunities to reduce energy
consumption, if the load-imbalance across the sockets
is minimal. This problem becomes further challenging in
the presence of mutual-exclusion, where scaling the
frequencies of a socket executing the
non-critical-threads can impact the execution time of
the critical-threads. In this article, we propose a
scheme (X10Ergy) to obtain energy gains with minimal
impact on the execution time, for task-parallel
languages, such as X10, HJ, and so on. X10Ergy takes as
input a loop-chunked program (parallel-loop iterations
divided into chunks and each chunk is executed by a
unique thread). X10Ergy follows a mixed compile-time +
runtime approach that (i) uses static analysis to
efficiently compute the work-load of each chunk at
runtime, (ii) computes the ``remaining'' work-load of
the chunks running on the cores of each socket at
regular intervals and tunes the frequency of the
sockets accordingly, (iii) groups the threads into
different sockets (based on the remaining work-load of
their respective chunks), and (iv) in the presence of
atomic-blocks, models the effect of frequency-scaling
on the critical-thread. We implemented X10Ergy for X10
and have obtained encouraging results for the IMSuite
kernels.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "35",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Proy:2017:CAL,
author = "Julien Proy and Karine Heydemann and Alexandre Berzati
and Albert Cohen",
title = "Compiler-Assisted Loop Hardening Against Fault
Attacks",
journal = j-TACO,
volume = "14",
number = "4",
pages = "36:1--36:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3141234",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Secure elements widely used in smartphones, digital
consumer electronics, and payment systems are subject
to fault attacks. To thwart such attacks, software
protections are manually inserted requiring experts and
time. The explosion of the Internet of Things (IoT) in
home, business, and public spaces motivates the
hardening of a wider class of applications and the need
to offer security solutions to non-experts. This
article addresses the automated protection of loops at
compilation time, covering the widest range of control-
and data-flow patterns, in both shape and complexity.
The security property we consider is that a sensitive
loop must always perform the expected number of
iterations; otherwise, an attack must be reported. We
propose a generic compile-time loop hardening scheme
based on the duplication of termination conditions and
of the computations involved in the evaluation of such
conditions. We also investigate how to preserve the
security property along the compilation flow while
enabling aggressive optimizations. We implemented this
algorithm in LLVM 4.0 at the Intermediate
Representation (IR) level in the backend. On average,
the compiler automatically hardens 95\% of the
sensitive loops of typical security benchmarks, and
98\% of these loops are shown to be robust to simulated
faults. Performance and code size overhead remain quite
affordable, at 12.5\% and 14\%, respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "36",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Peterson:2017:TCT,
author = "Christina Peterson and Damian Dechev",
title = "A Transactional Correctness Tool for Abstract Data
Types",
journal = j-TACO,
volume = "14",
number = "4",
pages = "37:1--37:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3148964",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Transactional memory simplifies multiprocessor
programming by providing the guarantee that a
sequential block of code in the form of a transaction
will exhibit atomicity and isolation. Transactional
data structures offer the same guarantee to concurrent
data structures by enabling the atomic execution of a
composition of operations. The concurrency control of
transactional memory systems preserves atomicity and
isolation by detecting read/write conflicts among
multiple concurrent transactions. State-of-the-art
transactional data structures improve on this
concurrency control protocol by providing explicit
transaction-level synchronization for only
non-commutative operations. Since read/write conflicts
are handled by thread-level concurrency control, the
correctness of transactional data structures cannot be
evaluated according to the read/write histories. This
presents a challenge for existing correctness
verification techniques for transactional memory,
because correctness is determined according to the
transitions taken by the transactions in the presence
of read/write conflicts. In this article, we present
Transactional Correctness tool for Abstract Data Types
(TxC-ADT), the first tool that can check the
correctness of transactional data structures. TxC-ADT
elevates the standard definitions of transactional
correctness to be in terms of an abstract data type, an
essential aspect for checking correctness of
transactions that synchronize only for high-level
semantic conflicts. To accommodate a diverse assortment
of transactional correctness conditions, we present a
technique for defining correctness as a happens-before
relation. Defining a correctness condition in this
manner enables an automated approach in which
correctness is evaluated by generating and analyzing a
transactional happens-before graph during model
checking. A transactional happens-before graph is
maintained on a per-thread basis, making our approach
applicable to transactional correctness conditions that
do not enforce a total order on a transactional
execution. We demonstrate the practical applications of
TxC-ADT by checking Lock Free Transactional
Transformation and Transactional Data Structure
Libraries for serializability, strict serializability,
opacity, and causal consistency.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "37",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ferroni:2017:PCM,
author = "Matteo Ferroni and Andrea Corna and Andrea Damiani and
Rolando Brondolin and Juan A. Colmenares and Steven
Hofmeyr and John D. Kubiatowicz and Marco D.
Santambrogio",
title = "Power Consumption Models for Multi-Tenant Server
Infrastructures",
journal = j-TACO,
volume = "14",
number = "4",
pages = "38:1--38:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3148965",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Multi-tenant virtualized infrastructures allow cloud
providers to minimize costs through workload
consolidation. One of the largest costs is power
consumption, which is challenging to understand in
heterogeneous environments. We propose a power modeling
methodology that tackles this complexity using a
divide-and-conquer approach. Our results outperform
previous research work, achieving a relative error of
2\% on average and under 4\% in almost all cases.
Models are portable across similar architectures,
enabling predictions of power consumption before
migrating a tenant to a different hardware platform.
Moreover, we show the models allow us to evaluate
colocations of tenants to reduce overall consumption.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "38",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mohammadi:2017:COE,
author = "Milad Mohammadi and Tor M. Aamodt and William J.
Dally",
title = "{CG-OoO}: Energy-Efficient Coarse-Grain Out-of-Order
Execution Near In-Order Energy with Near Out-of-Order
Performance",
journal = j-TACO,
volume = "14",
number = "4",
pages = "39:1--39:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3151034",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We introduce the Coarse-Grain Out-of-Order (CG-OoO)
general-purpose processor designed to achieve close to
In-Order (InO) processor energy while maintaining
Out-of-Order (OoO) performance. CG-OoO is an
energy-performance-proportional architecture.
Block-level code processing is at the heart of this
architecture; CG-OoO speculates, fetches, schedules,
and commits code at block-level granularity. It
eliminates unnecessary accesses to energy-consuming
tables and turns large tables into smaller, distributed
tables that are cheaper to access. CG-OoO leverages
compiler-level code optimizations to deliver efficient
static code and exploits dynamic block-level and
instruction-level parallelism. CG-OoO introduces
Skipahead, a complexity effective, limited out-of-order
instruction scheduling model. Through the energy
efficiency techniques applied to the compiler and
processor pipeline stages, CG-OoO closes 62\% of the
average energy gap between the InO and OoO baseline
processors at the same area and nearly the same
performance as the OoO. This makes CG-OoO 1.8$ \times $
more efficient than the OoO on the energy-delay product
inverse metric. CG-OoO meets the OoO nominal
performance while trading off the peak scheduling
performance for superior energy efficiency.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Swami:2017:EEC,
author = "Shivam Swami and Poovaiah M. Palangappa and Kartik
Mohanram",
title = "{ECS}: Error-Correcting Strings for Lifetime
Improvements in Nonvolatile Memories",
journal = j-TACO,
volume = "14",
number = "4",
pages = "40:1--40:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3151083",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Emerging nonvolatile memories (NVMs) suffer from low
write endurance, resulting in early cell failures (hard
errors), which reduce memory lifetime. It was
recognized early on that conventional error-correcting
codes (ECCs), which are designed for soft errors, are a
poor choice for addressing hard errors in NVMs. This
led to the evolution of hard error correction schemes
like dynamically replicated memory (DRM),
error-correcting pointers (ECPs), SAFER, FREE-p, PAYG,
and Zombie memory to improve NVM lifetime. Whereas
these approaches made significant inroads in addressing
hard errors and low memory lifetime in NVMs, overcoming
the challenges of underutilization of error-correcting
resources and/or implementation overhead (e.g., codec
latency, hardware support) remain areas of active
research and development. This article proposes
error-correcting strings (ECSs) as a high-utilization,
low-latency solution for hard error correction in
single-/multi-/triple-level cell (SLC/MLC/TLC) NVMs. At
its core, ECS adopts a base-offset approach to store
pointers to the failed memory cells; in this work, base
is the address of the first failed cell in a memory
block and offsets are the distances between successive
failed cells in that memory block. Unlike ECP, which
uses fixed-length pointers, ECS uses variable-length
offsets to point to the failed cells, thereby realizing
more pointers to tolerate more hard errors per memory
block. Further, this article proposes eXtended-ECS
(XECS), a page-level error correction architecture,
which employs dynamic on-demand ECS allocation and
opportunistic pattern-based data compression to improve
NVM lifetime by 2$ \times $ over ECP-6 for comparable
overhead and negligible impact to system performance.
Finally, this article demonstrates that ECS is a
drop-in replacement for ECP to extend the lifetime of
state-of-the-art ECP-based techniques like PAYG and
Zombie memory; ECS is also compatible with MLC/TLC
NVMs, where it complements drift-induced soft error
reduction techniques like ECC and incomplete data
mapping to simultaneously extend NVM lifetime.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "40",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Azhar:2017:SQS,
author = "M. Waqar Azhar and Per Stenstr{\"o}m and Vassilis
Papaefstathiou",
title = "{SLOOP}: {QoS}-Supervised Loop Execution to Reduce
Energy on Heterogeneous Architectures",
journal = j-TACO,
volume = "14",
number = "4",
pages = "41:1--41:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3148053",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Most systems allocate computational resources to each
executing task without any actual knowledge of the
application's Quality-of-Service (QoS) requirements.
Such best-effort policies lead to overprovisioning of
the resources and increase energy loss. This work
assumes applications with soft QoS requirements and
exploits the inherent timing slack to minimize the
allocated computational resources to reduce energy
consumption. We propose a lightweight progress-tracking
methodology based on the outer loops of application
kernels. It builds on online history and uses it to
estimate the total execution time. The prediction of
the execution time and the QoS requirements are then
used to schedule the application on a heterogeneous
architecture with big out-of-order cores and small
(LITTLE) in-order cores and select the minimum
operating frequency, using DVFS, that meets the
deadline. Our scheme is effective in exploiting the
timing slack of each application. We show that it can
reduce the energy consumption by more than 20\% without
missing any computational deadlines.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "41",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kanakagiri:2017:MMD,
author = "Raghavendra Kanakagiri and Biswabandan Panda and Madhu
Mutyam",
title = "{MBZip}: Multiblock Data Compression",
journal = j-TACO,
volume = "14",
number = "4",
pages = "42:1--42:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3151033",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Compression techniques at the last-level cache and the
DRAM play an important role in improving system
performance by increasing their effective capacities. A
compressed block in DRAM also reduces the transfer time
over the memory bus to the caches, reducing the latency
of a LLC cache miss. Usually, compression is achieved
by exploiting data patterns present within a block. But
applications can exhibit data locality that spread
across multiple consecutive data blocks. We observe
that there is significant opportunity available for
compressing multiple consecutive data blocks into one
single block, both at the LLC and DRAM. Our studies
using 21 SPEC CPU applications show that, at the LLC,
around 25\% (on average) of the cache blocks can be
compressed into one single cache block when grouped
together in groups of 2 to 8 blocks. In DRAM, more than
30\% of the columns residing in a single DRAM page can
be compressed into one DRAM column, when grouped
together in groups of 2 to 6. Motivated by these
observations, we propose a mechanism, namely, MBZip,
that compresses multiple data blocks into one single
block (called a zipped block), both at the LLC and
DRAM. At the cache, MBZip includes a simple tag
structure to index into these zipped cache blocks and
the indexing does not incur any redirectional delay. At
the DRAM, MBZip does not need any changes to the
address computation logic and works seamlessly with the
conventional/existing logic. MBZip is a synergistic
mechanism that coordinates these zipped blocks at the
LLC and DRAM. Further, we also explore silent writes at
the DRAM and show that certain writes need not access
the memory when blocks are zipped. MBZip improves the
system performance by 21.9\%, with a maximum of 90.3\%
on a 4-core system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "42",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Neill:2017:FAM,
author = "Richard Neill and Andi Drebes and Antoniu Pop",
title = "Fuse: Accurate Multiplexing of Hardware Performance
Counters Across Executions",
journal = j-TACO,
volume = "14",
number = "4",
pages = "43:1--43:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3148054",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Collecting hardware event counts is essential to
understanding program execution behavior. Contemporary
systems offer few Performance Monitoring Counters
(PMCs), thus only a small fraction of hardware events
can be monitored simultaneously. We present new
techniques to acquire counts for all available hardware
events with high accuracy by multiplexing PMCs across
multiple executions of the same program, then carefully
reconciling and merging the multiple profiles into a
single, coherent profile. We present a new metric for
assessing the similarity of statistical distributions
of event counts and show that our execution profiling
approach performs significantly better than Hardware
Event Multiplexing.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "43",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sardashti:2017:CCG,
author = "Somayeh Sardashti and David A. Wood",
title = "Could Compression Be of General Use? {Evaluating}
Memory Compression across Domains",
journal = j-TACO,
volume = "14",
number = "4",
pages = "44:1--44:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3138805",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Recent proposals present compression as a
cost-effective technique to increase cache and memory
capacity and bandwidth. While these proposals show
potentials of compression, there are several open
questions to adopt these proposals in real systems
including the following: (1) Do these techniques work
for real-world workloads running for long time? (2)
Which application domains would potentially benefit the
most from compression? (3) At which level of memory
hierarchy should we apply compression: caches, main
memory, or both? In this article, our goal is to shed
light on some main questions on applicability of
compression. We evaluate compression in the memory
hierarchy for selected examples from different
application classes. We analyze real applications with
real data and complete runs of several benchmarks.
While simulators provide a pretty accurate framework to
study potential performance/energy impacts of ideas,
they mostly limit us to a small range of workloads with
short runtimes. To enable studying real workloads, we
introduce a fast and simple methodology to get samples
of memory and cache contents of a real machine (a
desktop or a server). Compared to a cycle-accurate
simulator, our methodology allows us to study real
workloads as well as benchmarks. Our toolset is not a
replacement for simulators but mostly complements them.
While we can use a simulator to measure
performance/energy impact of a particular compression
proposal, here with our methodology we can study the
potentials with long running workloads in early stages
of the design. Using our toolset, we evaluate a
collection of workloads from different domains, such as
a web server of CS department of UW-Madison for 24h,
Google Chrome (watching a 1h-long movie on YouTube),
and Linux games (playing for about an hour). We also
use several benchmarks from different domains,
including SPEC, mobile, and big data. We run these
benchmarks to completion. Using these workloads and our
toolset, we analyze different compression properties
for both real applications and benchmarks. We focus on
eight main hypotheses on compression, derived from
previous work on compression. These properties (Table
2) act as foundation of several proposals on
compression, so performance of those proposals depends
very much on these basic properties. Overall, our
results suggest that compression could be of general
use both in main memory and caches. On average, the
compression ratio is {$>$}=2 for 64\% and 54\% of
workloads, respectively, for memory and cache data. Our
evaluation indicates significant potential for both
cache and memory compression, with higher
compressibility in memory due to abundance of zero
blocks. Among application domains we studied, servers
show on average the highest compressibility, while our
mobile benchmarks show the lowest compressibility. For
comparing benchmarks with real workloads, we show that
(1) it is critical to run benchmarks to completion or
considerably long runtimes to avoid biased conclusions,
and (2) SPEC benchmarks are good representative of real
Desktop applications in terms of compressibility of
their datasets. However, this does not hold for all
compression properties. For example, SPEC benchmarks
have much better compression locality (i.e.,
neighboring blocks have similar compressibility) than
real workloads. Thus, it is critical for designers to
consider wider range of workloads, including real
applications, to evaluate their compression
techniques.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "44",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Huang:2017:IEG,
author = "Libo Huang and Yashuai L{\"u} and Li Shen and Zhiying
Wang",
title = "Improving the Efficiency of {GPGPU} Work-Queue Through
Data Awareness",
journal = j-TACO,
volume = "14",
number = "4",
pages = "45:1--45:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3151035",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The architecture and programming model of current
GPGPUs are best suited for applications that are
dominated by structured control and data flows across
large regular datasets. Parallel workloads with
irregular control and data structures cannot easily
harness the processing power of the GPGPU. One approach
for mapping these irregular-parallel workloads to
GPGPUs is using work-queues. The work-queue approach
improves the utilization of SIMD units by only
processing useful works that are dynamically generated
during execution. As current GPGPUs lack necessary
supports for work-queues, a software-based work-queue
implementation often suffers from memory contention and
load balancing issues. In this article, we present a
novel hardware work-queue design named DaQueue, which
incorporates three data-aware features to improve the
efficiency of work-queues on GPGPUs. We evaluate our
proposal on the irregular-parallel workloads and carry
out a case study on a path tracing pipeline with a
cycle-level simulator. Experimental results show that
for the tested workloads, DaQueue improves performance
by 1.53$ \times $ on average and up to 1.91$ \times $.
Compared to a hardware worklist approach that is the
state-of-the-art prior work, DaQueue can achieve an
average of 33.92\% extra speedup with less hardware
area cost.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "45",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Angerd:2017:FAC,
author = "Alexandra Angerd and Erik Sintorn and Per
Stenstr{\"o}m",
title = "A Framework for Automated and Controlled
Floating-Point Accuracy Reduction in Graphics
Applications on {GPUs}",
journal = j-TACO,
volume = "14",
number = "4",
pages = "46:1--46:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3151032",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Reducing the precision of floating-point values can
improve performance and/or reduce energy expenditure in
computer graphics, among other, applications. However,
reducing the precision level of floating-point values
in a controlled fashion needs support both at the
compiler and at the microarchitecture level. At the
compiler level, a method is needed to automate the
reduction of precision of each floating-point value. At
the microarchitecture level, a lower precision of each
floating-point register can allow more floating-point
values to be packed into a register file. This,
however, calls for new register file organizations.
This article proposes an automated precision-selection
method and a novel GPU register file organization that
can store floating-point register values at arbitrary
precisions densely. The automated precision-selection
method uses a data-driven approach for setting the
precision level of floating-point values, given a
quality threshold and a representative set of input
data. By allowing a small, but acceptable, degradation
in output quality, our method can remove a significant
amount of the bits needed to represent floating-point
values in the investigated kernels (between 28\% and
60\%). Our proposed register file organization exploits
these lower-precision floating-point values by packing
several of them into the same physical register. This
reduces the register pressure per thread by up to 48\%,
and by 27\% on average, for a negligible output-quality
degradation. This can enable GPUs to keep up to twice
as many threads in flight simultaneously.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "46",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Arteaga:2017:GFG,
author = "Jaime Arteaga and St{\'e}phane Zuckerman and Guang R.
Gao",
title = "Generating Fine-Grain Multithreaded Applications Using
a Multigrain Approach",
journal = j-TACO,
volume = "14",
number = "4",
pages = "47:1--47:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3155288",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The recent evolution in hardware landscape, aimed at
producing high-performance computing systems capable of
reaching extreme-scale performance, has reignited the
interest in fine-grain multithreading, particularly at
the intranode level. Indeed, popular parallel
programming environments, such as OpenMP, which
features a simple interface for the parallelization of
programs, are now incorporating fine-grain constructs.
However, since coarse-grain directives are still
heavily used, the OpenMP runtime is forced to support
both coarse- and fine-grain models of execution,
potentially reducing the advantages obtained when
executing an application in a fully fine-grain
environment. To evaluate the type of applications that
benefit from executing in a unified fine-grain program
execution model, this article presents a multigrain
parallel programming environment for the generation of
fine-grain multithreaded applications from programs
featuring OpenMP's API, allowing OpenMP programs to be
run on top of a fine-grain event-driven program
execution model. Experimental results with five
scientific benchmarks show that fine-grain
applications, generated by and run on our environment
with two runtimes implementing a fine-grain
event-driven program execution model, are competitive
and can outperform their OpenMP counterparts,
especially for data-intensive workloads with irregular
and dynamic parallelism, reaching speedups as high as
2.6$ \times $ for Graph500 and 51$ \times $ for NAS
Data Cube.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "47",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hadidi:2017:CCA,
author = "Ramyad Hadidi and Lifeng Nai and Hyojong Kim and
Hyesoon Kim",
title = "{CAIRO}: a Compiler-Assisted Technique for Enabling
Instruction-Level Offloading of Processing-In-Memory",
journal = j-TACO,
volume = "14",
number = "4",
pages = "48:1--48:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3155287",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Three-dimensional (3D)-stacking technology and the
memory-wall problem have popularized
processing-in-memory (PIM) concepts again, which offers
the benefits of bandwidth and energy savings by
offloading computations to functional units inside the
memory. Several memory vendors have also started to
integrate computation logics into the memory, such as
Hybrid Memory Cube (HMC), the latest version of which
supports up to 18 in-memory atomic instructions.
Although industry prototypes have motivated studies for
investigating efficient methods and architectures for
PIM, researchers have not proposed a systematic way for
identifying the benefits of instruction-level PIM
offloading. As a result, compiler support for
recognizing offloading candidates and utilizing
instruction-level PIM offloading is unavailable. In
this article, we analyze the advantages of
instruction-level PIM offloading in the context of
HMC-atomic instructions for graph-computing
applications and propose CAIRO, a compiler-assisted
technique and decision model for enabling
instruction-level offloading of PIM without any burden
on programmers. To develop CAIRO, we analyzed how
instruction offloading enables performance gain in both
CPU and GPU workloads. Our studies show that
performance gain from bandwidth savings, the ratio of
number of cache misses to total cache accesses, and the
overhead of host atomic instructions are the key
factors in selecting an offloading candidate. Based on
our analytical models, we characterize the properties
of beneficial and nonbeneficial candidates for
offloading. We evaluate CAIRO with 27 multithreaded CPU
and 36 GPU benchmarks. In our evaluation, CAIRO not
only doubles the speedup for a set of PIM-beneficial
workloads by exploiting HMC-atomic instructions but
also prevents slowdown caused by incorrect offloading
decisions for other workloads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "48",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lim:2017:TEP,
author = "Hongyeol Lim and Giho Park",
title = "{Triple Engine Processor (TEP)}: a Heterogeneous
Near-Memory Processor for Diverse Kernel Operations",
journal = j-TACO,
volume = "14",
number = "4",
pages = "49:1--49:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3155920",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The advent of 3D memory stacking technology, which
integrates a logic layer and stacked memories, is
expected to be one of the most promising memory
technologies to mitigate the memory wall problem by
leveraging the concept of near-memory processing (NMP).
With the ability to process data locally within the
logic layer of stacked memory, a variety of emerging
big data applications can achieve significant
performance and energy-efficiency benefits. Various
approaches to the NMP logic layer architecture have
been studied to utilize the advantage of stacked
memory. While significant acceleration of specific
kernel operations has been derived from previous NMP
studies, an NMP-based system using an NMP logic
architecture capable of handling some specific kernel
operations can suffer from performance and energy
efficiency degradation caused by a significant
communication overhead between the host processor and
NMP stack. In this article, we first analyze the kernel
operations that can greatly improve the performance of
NMP-based systems in diverse emerging applications, and
then we analyze the architecture to efficiently process
the extracted kernel operations. This analysis confirms
that three categories of processing engines for NMP
logic are required for efficient processing of a
variety of emerging applications, and thus we propose a
Triple Engine Processor (TEP), a heterogeneous
near-memory processor with three types of computing
engines. These three types of engines are an in-order
core, a coerce-grain reconfigurable processor (CGRA),
and dedicated hardware. The proposed TEP provides about
3.4 times higher performance and 33\% greater energy
savings than the baseline 3D memory system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "49",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Patsilaras:2017:RRD,
author = "George Patsilaras and James Tuck",
title = "{ReDirect}: Reconfigurable Directories for Multicore
Architectures",
journal = j-TACO,
volume = "14",
number = "4",
pages = "50:1--50:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3162015",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As we enter the dark silicon era, architects should
not envision designs in which every transistor remains
turned on permanently but rather ones in which portions
of the chip are judiciously turned on/off depending on
the characteristics of a workload. At the same time,
due to the increasing cost per transistor, architects
should also consider new ways to re-purpose transistors
to increase their architectural value. In this work, we
consider the design of directory-based cache coherence
in light of the dark silicon era and the need to
re-purpose transistors. We point out that directories
are not needed all of the time, and we argue that
directories (and coherence) should be off unless it is
actually needed for correctness. In our design,
directories will be disabled and powered off for
workloads with no sharing. Then only when parallel
workloads need cache coherence will directories be
enabled in proportion to the sharing that is present.
At the same time, we exploit the structural
similarities of directories and cache. If a directory
is idle, then we reconfigure it to be used as extra
capacity in the last-level cache. Since our novel
approach can keep most directories off, we are free to
select sparse overprovisioned directory designs that
are reconfigurable to large amounts of cache that can
significantly boost performance when the directory is
idle. We call these combined features Reconfigured Dark
Directories, since directories are usually dark (off)
and can be reconfigured. Our results for Reconfigurable
Dark Directories running SPEC 2006 applications show a
performance benefit, on average, of 17\% for an 8$
\times $ overprovisioned fully mapped directory on a
64-tile system under low system concurrency (10\% under
heavy concurrency), or a 29\% average speedup for a 2$
\times $ overprovisioned directory on 256-tile system
(10\% under heavy concurrency) to systems with a
conventional sparse directory design using the same
overprovisioning factor.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "50",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Patil:2017:HHA,
author = "Adarsh Patil and Ramaswamy Govindarajan",
title = "{HAShCache}: Heterogeneity-Aware Shared {DRAMCache}
for Integrated Heterogeneous Systems",
journal = j-TACO,
volume = "14",
number = "4",
pages = "51:1--51:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3158641",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Integrated Heterogeneous System (IHS) processors pack
throughput-oriented General-Purpose Graphics Processing
Units (GPGPUs) alongside latency-oriented Central
Processing Units (CPUs) on the same die sharing certain
resources, e.g., shared last-level cache,
Network-on-Chip (NoC), and the main memory. The demands
for memory accesses and other shared resources from GPU
cores can exceed that of CPU cores by two to three
orders of magnitude. This disparity poses significant
problems in exploiting the full potential of these
architectures. In this article, we propose adding a
large-capacity stacked DRAM, used as a shared
last-level cache, for the IHS processors. However,
adding the DRAMCache naively, leaves significant
performance on the table due to the disparate demands
from CPU and GPU cores for DRAMCache and memory
accesses. In particular, the imbalance can
significantly reduce the performance benefits that the
CPU cores would have otherwise enjoyed with the
introduction of the DRAMCache, necessitating a
heterogeneity-aware management of this shared resource
for improved performance. In this article, we propose
three simple techniques to enhance the performance of
CPU application while ensuring very little to no
performance impact to the GPU. Specifically, we propose
(i) PrIS, a prioritization scheme for scheduling CPU
requests at the DRAMCache controller; (ii) ByE, a
selective and temporal bypassing scheme for CPU
requests at the DRAMCache; and (iii) Chaining, an
occupancy controlling mechanism for GPU lines in the
DRAMCache through pseudo-associativity. The resulting
cache, Heterogeneity-Aware Shared DRAMCache
(HAShCache), is heterogeneity-aware and can adapt
dynamically to address the inherent disparity of
demands in an IHS architecture. Experimental evaluation
of the proposed HAShCache results in an average system
performance improvement of 41\% over a naive DRAMCache
and over 200\% improvement over a baseline system with
no stacked DRAMCache.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "51",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Alias:2017:OAC,
author = "Christophe Alias and Alexandru Plesco",
title = "Optimizing Affine Control With Semantic
Factorizations",
journal = j-TACO,
volume = "14",
number = "4",
pages = "52:1--52:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3162017",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Hardware accelerators generated by polyhedral
synthesis techniques make extensive use of affine
expressions (affine functions and convex polyhedra) in
control and steering logic. Since the control is
pipelined, these affine objects must be evaluated at
the same time for different values, which forbids
aggressive reuse of operators. In this article, we
propose a method to factorize a collection of affine
expressions without preventing pipelining. Our key
contributions are (i) to use semantic factorizations
exploiting arithmetic properties of addition and
multiplication and (ii) to rely on a cost function
whose minimization ensures correct usage of FPGA
resources. Our algorithm is totally parameterized by
the cost function, which can be customized to fit a
target FPGA. Experimental results on a large pool of
linear algebra kernels show a significant improvement
compared to traditional low-level RTL optimizations. In
particular, we show how our method reduces resource
consumption by revealing hidden strength reductions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Matheou:2017:DDC,
author = "George Matheou and Paraskevas Evripidou",
title = "Data-Driven Concurrency for High Performance
Computing",
journal = j-TACO,
volume = "14",
number = "4",
pages = "53:1--53:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3162014",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this work, we utilize dynamic dataflow/data-driven
techniques to improve the performance of high
performance computing (HPC) systems. The proposed
techniques are implemented and evaluated through an
efficient, portable, and robust programming framework
that enables data-driven concurrency on HPC systems.
The proposed framework is based on data-driven
multithreading (DDM), a hybrid control-flow/dataflow
model that schedules threads based on data availability
on sequential processors. The proposed framework was
evaluated using several benchmarks, with different
characteristics, on two different systems: a 4-node AMD
system with a total of 128 cores and a 64-node Intel
HPC system with a total of 768 cores. The performance
evaluation shows that the proposed framework scales
well and tolerates scheduling overheads and memory
latencies effectively. We also compare our framework to
MPI, DDM-VM, and OmpSs@Cluster. The comparison results
show that the proposed framework obtains comparable or
better performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "53",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Georgakoudis:2017:SSA,
author = "Giorgis Georgakoudis and Hans Vandierendonck and Peter
Thoman and Bronis R. {De Supinski} and Thomas Fahringer
and Dimitrios S. Nikolopoulos",
title = "{SCALO}: Scalability-Aware Parallelism Orchestration
for Multi-Threaded Workloads",
journal = j-TACO,
volume = "14",
number = "4",
pages = "54:1--54:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3158643",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Shared memory machines continue to increase in scale
by adding more parallelism through additional cores and
complex memory hierarchies. Often, executing multiple
applications concurrently, dividing among them hardware
threads, provides greater efficiency rather than
executing a single application with large thread
counts. However, contention for shared resources can
limit the improvement of concurrent application
execution: orchestrating the number of threads used by
each application and is essential. In this article, we
contribute SCALO, a solution to orchestrate concurrent
application execution to increase throughput. SCALO
monitors co-executing applications at runtime to
evaluate their scalability. Its optimizing thread
allocator analyzes these scalability estimates to adapt
the parallelism of each program. Unlike previous
approaches, SCALO differs by including dynamic
contention effects on scalability and by controlling
the parallelism during the execution of parallel
regions. Thus, it improves throughput when other
state-of-the-art approaches fail and outperforms them
by up to 40\% when they succeed.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "54",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Baroudi:2017:OTB,
author = "Toufik Baroudi and Rachid Seghir and Vincent
Loechner",
title = "Optimization of Triangular and Banded Matrix
Operations Using $2$ d-Packed Layouts",
journal = j-TACO,
volume = "14",
number = "4",
pages = "55:1--55:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3162016",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Over the past few years, multicore systems have become
increasingly powerful and thereby very useful in
high-performance computing. However, many applications,
such as some linear algebra algorithms, still cannot
take full advantage of these systems. This is mainly
due to the shortage of optimization techniques dealing
with irregular control structures. In particular, the
well-known polyhedral model fails to optimize loop
nests whose bounds and/or array references are not
affine functions. This is more likely to occur when
handling sparse matrices in their packed formats. In
this article, we propose using 2d-packed layouts and
simple affine transformations to enable optimization of
triangular and banded matrix operations. The benefit of
our proposal is shown through an experimental study
over a set of linear algebra benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lee:2018:IEE,
author = "Hochan Lee and Mansureh S. Moghaddam and Dongkwan Suh
and Bernhard Egger",
title = "Improving Energy Efficiency of Coarse-Grain
Reconfigurable Arrays Through Modulo Schedule
Compression\slash Decompression",
journal = j-TACO,
volume = "15",
number = "1",
pages = "1:1--1:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3162018",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Modulo-scheduled course-grain reconfigurable array
(CGRA) processors excel at exploiting loop-level
parallelism at a high performance per watt ratio. The
frequent reconfiguration of the array, however, causes
between 25\% and 45\% of the consumed chip energy to be
spent on the instruction memory and fetches therefrom.
This article presents a hardware/software codesign
methodology for such architectures that is able to
reduce both the size required to store the
modulo-scheduled loops and the energy consumed by the
instruction decode logic. The hardware modifications
improve the spatial organization of a CGRA's execution
plan by reorganizing the configuration memory into
separate partitions based on a statistical analysis of
code. A compiler technique optimizes the generated code
in the temporal dimension by minimizing the number of
signal changes. The optimizations achieve, on average,
a reduction in code size of more than 63\% and in
energy consumed by the instruction decode logic by 70\%
for a wide variety of application domains.
Decompression of the compressed loops can be performed
in hardware with no additional latency, rendering the
presented method ideal for low-power CGRAs running at
high frequencies. The presented technique is orthogonal
to dictionary-based compression schemes and can be
combined to achieve a further reduction in code size.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sangaiah:2018:SSA,
author = "Karthik Sangaiah and Michael Lui and Radhika Jagtap
and Stephan Diestelhorst and Siddharth Nilakantan and
Ankit More and Baris Taskin and Mark Hempstead",
title = "{SynchroTrace}: Synchronization-Aware
Architecture-Agnostic Traces for Lightweight Multicore
Simulation of {CMP} and {HPC} Workloads",
journal = j-TACO,
volume = "15",
number = "1",
pages = "2:1--2:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3158642",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Trace-driven simulation of chip multiprocessor (CMP)
systems offers many advantages over execution-driven
simulation, such as reducing simulation time and
complexity, allowing portability, and scalability.
However, trace-based simulation approaches have
difficulty capturing and accurately replaying
multithreaded traces due to the inherent nondeterminism
in the execution of multithreaded programs. In this
work, we present SynchroTrace, a scalable, flexible,
and accurate trace-based multithreaded simulation
methodology. By recording synchronization events
relevant to modern threading libraries (e.g., Pthreads
and OpenMP) and dependencies in the traces, independent
of the host architecture, the methodology is able to
accurately model the nondeterminism of multithreaded
programs for different hardware platforms and threading
paradigms. Through capturing high-level instruction
categories, the SynchroTrace average CPI trace Replay
timing model offers fast and accurate simulation of
many-core in-order CMPs. We perform two case studies to
validate the SynchroTrace simulation flow against the
gem5 full-system simulator: (1) a constraint-based
design space exploration with traditional CMP
benchmarks and (2) a thread-scalability study with
HPC-representative applications. The results from these
case studies show that (1) our trace-based approach
with trace filtering has a peak speedup of up to 18.7$
\times $ over simulation in gem5 full-system with an
average of 9.6$ \times $ speedup, (2) SynchroTrace
maintains the thread-scaling accuracy of gem5 and can
efficiently scale up to 64 threads, and (3)
SynchroTrace can trace in one platform and model any
platform in early stages of design.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zheng:2018:ESG,
author = "Long Zheng and Xiaofei Liao and Hai Jin",
title = "Efficient and Scalable Graph Parallel Processing With
Symbolic Execution",
journal = j-TACO,
volume = "15",
number = "1",
pages = "3:1--3:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3170434",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Existing graph processing essentially relies on the
underlying iterative execution with synchronous (Sync)
and/or asynchronous (Async) engine. Nevertheless, they
both suffer from a wide class of inherent serialization
arising from data interdependencies within a graph. In
this article, we present SymGraph, a judicious graph
engine with symbolic iteration that enables the
parallelism of dependent computation on vertices.
SymGraph allows using abstract symbolic value (instead
of the concrete value) for the computation if the
desired data is unavailable. To maximize the potential
of symbolic iteration, we propose a chain of tailored
sophisticated techniques, enabling SymGraph to scale
out with a new milestone of efficiency for large-scale
graph processing. We evaluate SymGraph in comparison to
Sync, Async, and a hybrid of Sync and Async engines.
Our results on 12 nodes show that SymGraph outperforms
all three graph engines by 1.93x (vs. Sync), 1.98x (vs.
Async), and 1.57x (vs. Hybrid) on average. In
particular, the performance for PageRank on 32 nodes
can be dramatically improved by 16.5x (vs. Sync), 23.3x
(vs. Async), and 12.1x (vs. Hybrid), respectively. The
efficiency of SymGraph is also validated with at least
one order of magnitude improvement in contrast to three
specialized graph systems (Naiad, GraphX, and PGX.D).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jo:2018:DSD,
author = "Jae-Eon Jo and Gyu-Hyeon Lee and Hanhwi Jang and
Jaewon Lee and Mohammadamin Ajdari and Jangwoo Kim",
title = "{DiagSim}: Systematically Diagnosing Simulators for
Healthy Simulations",
journal = j-TACO,
volume = "15",
number = "1",
pages = "4:1--4:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177959",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Simulators are the most popular and useful tool to
study computer architecture and examine new ideas.
However, modern simulators have become prohibitively
complex (e.g., 200K+ lines of code) to fully understand
and utilize. Users therefore end up analyzing and
modifying only the modules of interest (e.g., branch
predictor, register file) when performing simulations.
Unfortunately, hidden details and inter-module
interactions of simulators create discrepancies between
the expected and actual module behaviors. Consequently,
the effect of modifying the target module may be
amplified or masked and the users get inaccurate
insights from expensive simulations. In this article,
we propose DiagSim, an efficient and systematic method
to diagnose simulators. It ensures the target modules
behave as expected to perform simulation in a healthy
(i.e., accurate and correct) way. DiagSim is efficient
in that it quickly pinpoints the modules showing
discrepancies and guides the users to inspect the
behavior without investigating the whole simulator.
DiagSim is systematic in that it hierarchically tests
the modules to guarantee the integrity of individual
diagnosis and always provide reliable results. We
construct DiagSim based on generic category-based
diagnosis ideas to encourage easy expansion of the
diagnosis. We diagnose three popular open source
simulators and discover hidden details including
implicitly reserved resources, un-documented latency
factors, and hard-coded module parameter values. We
observe that these factors have large performance
impacts (up to 156\%) and illustrate that our diagnosis
can correctly detect and eliminate them.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kondguli:2018:CME,
author = "Sushant Kondguli and Michael Huang",
title = "A Case for a More Effective, Power-Efficient Turbo
Boosting",
journal = j-TACO,
volume = "15",
number = "1",
pages = "5:1--5:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3170433",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Single-thread performance and throughput often pose
different design constraints and require compromises.
Mainstream CPUs today incorporate a non-trivial number
of cores, even for mobile devices. For power and
thermal considerations, by default, a single core does
not operate at the maximum performance level. When
operating conditions allow, however, commercial
products often rely on turbo boosting, which
temporarily increases the clock frequency to increase
single-thread performance. However, increasing clock
speed may result in a poor performance return for
invested energy. In this article, we make a case for a
more effective boosting strategy, which invests energy
in activities with the best estimated return. In
addition to running faster clocks, we can also use a
look-ahead thread to overlap the penalties of cache
misses and branch mispredicts. Overall, for similar
power consumptions, the proposed adaptive turbo
boosting strategy can achieve about twice the
performance benefits while halving the energy
overhead.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2018:ESE,
author = "Kuan-Chung Chen and Chung-Ho Chen",
title = "Enabling {SIMT} Execution Model on Homogeneous
Multi-Core System",
journal = j-TACO,
volume = "15",
number = "1",
pages = "6:1--6:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177960",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Single-instruction multiple-thread (SIMT) machine
emerges as a primary computing device in
high-performance computing, since the SIMT execution
paradigm can exploit data-level parallelism
effectively. This article explores the SIMT execution
potential on homogeneous multi-core processors, which
generally run in multiple-instruction multiple-data
(MIMD) mode when utilizing the multi-core resources. We
address three architecture issues in enabling SIMT
execution model on multi-core processor, including
multithreading execution model, kernel thread context
placement, and thread divergence. For the SIMT
execution model, we propose a fine-grained
multithreading mechanism on an ARM-based multi-core
system. Each of the processor cores stores the kernel
thread contexts in its L1 data cache for per-cycle
thread-switching requirement. For divergence-intensive
kernels, an Inner Conditional Statement First
(ICS-First) mechanism helps early re-convergence to
occur and significantly improves the performance. The
experiment results show that effectiveness in
data-parallel processing reduces on average 36\%
dynamic instructions, and boosts the SIMT executions to
achieve on average 1.52$ \times $ and up to 5$ \times $
speedups over the MIMD counterpart for OpenCL
benchmarks for single issue in-order processor cores.
By using the explicit vectorization optimization on the
kernels, the SIMT model gains further benefits from the
SIMD extension and achieves 1.71$ \times $ speedup over
the MIMD approach. The SIMT model using in-order
superscalar processor cores outperforms the MIMD model
that uses superscalar out-of-order processor cores by
40\%. The results show that, to exploit data-level
parallelism, enabling the SIMT model on homogeneous
multi-core processors is important.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2018:SSM,
author = "Mingzhe Zhang and King Tin Lam and Xin Yao and Cho-Li
Wang",
title = "{SIMPO}: a Scalable In-Memory Persistent Object
Framework Using {NVRAM} for Reliable Big Data
Computing",
journal = j-TACO,
volume = "15",
number = "1",
pages = "7:1--7:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3167972",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "While CPU architectures are incorporating many more
cores to meet ever-bigger workloads, advance in
fault-tolerance support is indispensable for sustaining
system performance under reliability constraints.
Emerging non-volatile memory technologies are yielding
fast, dense, and energy-efficient NVRAM that can
dethrone SSD drives for persisting data. Research on
using NVRAM to enable fast in-memory data persistence
is ongoing. In this work, we design and implement a
persistent object framework, dubbed scalable in-memory
persistent object (SIMPO), which exploits NVRAM,
alongside DRAM, to support efficient object persistence
in highly threaded big data applications. Based on
operation logging, we propose a new programming model
that classifies functions into instant and deferrable
groups. SIMPO features a streamlined execution model,
which allows lazy evaluation of deferrable functions
and is well suited to big data computing workloads that
would see improved data locality and concurrency. Our
log recording and checkpointing scheme is effectively
optimized towards NVRAM, mitigating its long write
latency through write-combining and consolidated
flushing techniques. Efficient persistent object
management with features including safe references and
memory leak prevention is also implemented and tailored
to NVRAM. We evaluate a wide range of SIMPO-enabled
applications with machine learning, high-performance
computing, and database workloads on an emulated hybrid
memory architecture and a real hybrid memory machine
with NVDIMM. Compared with native applications without
persistence, experimental results show that SIMPO
incurs less than 5\% runtime overhead on both platforms
and even gains up to 2.5$ \times $ speedup and 84\%
increase in throughput in highly threaded situations on
the two platforms, respectively, thanks to the
streamlined execution model.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Deng:2018:EML,
author = "Bobin Deng and Sriseshan Srikanth and Eric R. Hein and
Thomas M. Conte and Erik Debenedictis and Jeanine Cook
and Michael P. Frank",
title = "Extending {Moore's Law} via Computationally
Error-Tolerant Computing",
journal = j-TACO,
volume = "15",
number = "1",
pages = "8:1--8:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177837",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Dennard scaling has ended. Lowering the voltage supply
(V$_{dd}$) to sub-volt levels causes intermittent
losses in signal integrity, rendering further scaling
(down) no longer acceptable as a means to lower the
power required by a processor core. However, it is
possible to correct the occasional errors caused due to
lower V$_{dd}$ in an efficient manner and effectively
lower power. By deploying the right amount and kind of
redundancy, we can strike a balance between overhead
incurred in achieving reliability and energy savings
realized by permitting lower V$_{dd}$. One promising
approach is the Redundant Residue Number System (RRNS)
representation. Unlike other error correcting codes,
RRNS has the important property of being closed under
addition, subtraction and multiplication, thus enabling
computational error correction at a fraction of an
overhead compared to conventional approaches. We use
the RRNS scheme to design a Computationally-Redundant,
Energy-Efficient core, including the microarchitecture,
Instruction Set Architecture (ISA) and RRNS centered
algorithms. From the simulation results, this RRNS
system can reduce the energy-delay-product by about 3$
\times $ for multiplication intensive workloads and by
about 2$ \times $ in general, when compared to a
non-error-correcting binary core.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Dice:2018:IPH,
author = "Dave Dice and Maurice Herlihy and Alex Kogan",
title = "Improving Parallelism in Hardware Transactional
Memory",
journal = j-TACO,
volume = "15",
number = "1",
pages = "9:1--9:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177962",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Today's hardware transactional memory (HTM) systems
rely on existing coherence protocols, which implement a
requester-wins strategy. This, in turn, leads to poor
performance when transactions frequently conflict,
causing them to resort to a non-speculative fallback
path. Often, such a path severely limits parallelism.
In this article, we propose very simple architectural
changes to the existing requester-wins HTM
implementations that enhance conflict resolution
between hardware transactions and thus improve their
parallelism. Our idea is compatible with existing HTM
systems, requires no changes to target applications
that employ traditional lock synchronization, and is
shown to provide robust performance benefits.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kim:2018:BEE,
author = "Namhyung Kim and Junwhan Ahn and Kiyoung Choi and
Daniel Sanchez and Donghoon Yoo and Soojung Ryu",
title = "{Benzene}: an Energy-Efficient Distributed Hybrid
Cache Architecture for Manycore Systems",
journal = j-TACO,
volume = "15",
number = "1",
pages = "10:1--10:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177963",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article proposes Benzene, an energy-efficient
distributed SRAM/STT-RAM hybrid cache for manycore
systems running multiple applications. It is based on
the observation that a na{\"\i}ve application of hybrid
cache techniques to distributed caches in a manycore
architecture suffers from limited energy reduction due
to uneven utilization of scarce SRAM. We propose
two-level optimization techniques: intra-bank and
inter-bank. Intra-bank optimization leverages highly
associative cache design, achieving more uniform
distribution of writes within a bank. Inter-bank
optimization evenly balances the amount of
write-intensive data across the banks. Our evaluation
results show that Benzene significantly reduces energy
consumption of distributed hybrid caches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ao:2018:POH,
author = "Yulong Ao and Chao Yang and Fangfang Liu and Wanwang
Yin and Lijuan Jiang and Qiao Sun",
title = "Performance Optimization of the {HPCG} Benchmark on
the {Sunway TaihuLight Supercomputer}",
journal = j-TACO,
volume = "15",
number = "1",
pages = "11:1--11:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3182177",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/super.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this article, we present some key techniques for
optimizing HPCG on Sunway TaihuLight and demonstrate
how to achieve high performance in memory-bound
applications by exploiting specific characteristics of
the hardware architecture. In particular, we utilize a
block multicoloring approach for parallelization and
propose methods such as requirement-based data mapping
and customized gather collective to enhance the
effective memory bandwidth. Experiments indicate that
the optimized HPCG code can sustain 77\% of the
theoretical memory bandwidth and scale to the full
system of more than 10 million cores, with an
aggregated performance of 480.8 Tflop/s and a weak
scaling efficiency of 87.3\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Rashidi:2018:IMP,
author = "Saeed Rashidi and Majid Jalili and Hamid
Sarbazi-Azad",
title = "Improving {MLC PCM} Performance through Relaxed Write
and Read for Intermediate Resistance Levels",
journal = j-TACO,
volume = "15",
number = "1",
pages = "12:1--12:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177965",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Phase Change Memory (PCM) is one of the most promising
candidates to be used at the main memory level of the
memory hierarchy due to poor scalability, considerable
leakage power, and high cost/bit of DRAM. PCM is a new
resistive memory that is capable of storing data based
on resistance values. The wide resistance range of PCM
allows for storing multiple bits per cell (MLC) rather
than a single bit per cell (SLC). Unfortunately, higher
density of MLC PCM comes at the expense of longer
read/write latency, higher soft error rate, higher
energy consumption, and earlier wearout compared to the
SLC PCM. Some studies suggest removing the most
error-prone level to mitigate soft error and write
latency of MLC PCM, hence introducing a less dense
memory called Tri-Level memory. Another scheme, called
M-Metric, proposes a new read metric to address the
soft error problem in MLC PCM. In order to deal with
the limited lifetime of PCM, some extra storage per
memory line is required to correct permanent hard
errors (stuck-at faults). Since the extra storage is
used only when permanent faults occur, it has a low
utilization for a long time before hard errors start to
occur. In this article, we utilize the extra storage to
improve the read/write latency in a 2-bit MLC PCM using
a relaxation scheme for reading and writing the cells
for intermediate resistance levels. More specifically,
we combine the most time-consuming levels (intermediate
resistance levels) to reduce the number of resistance
levels (making a Tri-Level PCM) and therefore improve
write latency. We then store some error correction
metadata in the extra storage section to successfully
retrieve the exact data values in the read operation.
We also modify the Tri-Level PCM cell to increase its
read latency when the M-Metric scheme is used.
Evaluation results show that the proposed scheme
improves read latency by 57.2\%, write latency by
56.1\%, and overall system performance (IPC) by 26.9\%
over the baseline. It is noteworthy that combining the
proposed scheme and FPC compression method improves
read latency by 75.2\%, write latency by 67\%, and
overall system performance (IPC) by 37.4\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhao:2018:OCN,
author = "Wenlai Zhao and Haohuan Fu and Jiarui Fang and Weijie
Zheng and Lin Gan and Guangwen Yang",
title = "Optimizing Convolutional Neural Networks on the
{Sunway TaihuLight Supercomputer}",
journal = j-TACO,
volume = "15",
number = "1",
pages = "13:1--13:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177885",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/super.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The Sunway TaihuLight supercomputer is powered by
SW26010, a new 260-core processor designed with on-chip
fusion of heterogeneous cores. In this article, we
present our work on optimizing the training process of
convolutional neural networks (CNNs) on the Sunway
TaihuLight supercomputer. Specifically, a highly
efficient library (swDNN) and a customized Caffe
framework (swCaffe) are proposed. Architecture-oriented
optimization methods targeting the many-core
architecture of SW26010 are introduced and are able to
achieve 48$ \times $ speedup for the convolution
routine in swDNN and 4$ \times $ speedup for the
complete training process of the VGG-16 network using
swCaffe, compared to the unoptimized algorithm and
framework. Compared to the cuDNN library and the Caffe
framework based on the NVIDIA K40m GPU, the proposed
swDNN library and swCaffe framework on SW26010 have
nearly half the performance of K40m in single-precision
and have 3.6$ \times $ and 1.8$ \times $ speedup over
K40m in double precision, respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mbakoyiannis:2018:EPC,
author = "Dimitrios Mbakoyiannis and Othon Tomoutzoglou and
George Kornaros",
title = "Energy-Performance Considerations for Data Offloading
to {FPGA}-Based Accelerators Over {PCIe}",
journal = j-TACO,
volume = "15",
number = "1",
pages = "14:1--14:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3180263",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Modern data centers increasingly employ FPGA-based
heterogeneous acceleration platforms as a result of
their great potential for continued performance and
energy efficiency. Today, FPGAs provide more hardware
parallelism than is possible with GPUs or CPUs, whereas
C-like programming environments facilitate shorter
development time, even close to software cycles. In
this work, we address limitations and overheads in
access and transfer of data to accelerators over common
CPU-accelerator interconnects such as PCIe. We present
three different FPGA accelerator dispatching methods
for streaming applications (e.g., multimedia, vision
computing). The first uses zero-copy data transfers and
on-chip scratchpad memory (SPM) for energy efficiency,
and the second uses also zero-copy but shared copy
engines among different accelerator instances and local
external memory. The third uses the processor's memory
management unit to acquire the physical address of user
pages and uses scatter-gather data transfers with SPM.
Even though all techniques exhibit advantages in terms
of scalability and relieve the processor from control
overheads through using integrated schedulers, the
first method presents the best energy-efficient
acceleration in streaming applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lin:2018:GPV,
author = "Zhen Lin and Michael Mantor and Huiyang Zhou",
title = "{GPU} Performance vs. Thread-Level Parallelism:
Scalability Analysis and a Novel Way to Improve {TLP}",
journal = j-TACO,
volume = "15",
number = "1",
pages = "15:1--15:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177964",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Graphics Processing Units (GPUs) leverage massive
thread-level parallelism (TLP) to achieve high
computation throughput and hide long memory latency.
However, recent studies have shown that the GPU
performance does not scale with the GPU occupancy or
the degrees of TLP that a GPU supports, especially for
memory-intensive workloads. The current understanding
points to L1 D-cache contention or off-chip memory
bandwidth. In this article, we perform a novel
scalability analysis from the perspective of throughput
utilization of various GPU components, including
off-chip DRAM, multiple levels of caches, and the
interconnect between L1 D-caches and L2 partitions. We
show that the interconnect bandwidth is a critical
bound for GPU performance scalability. For the
applications that do not have saturated throughput
utilization on a particular resource, their performance
scales well with increased TLP. To improve TLP for such
applications efficiently, we propose a fast context
switching approach. When a warp/thread block (TB) is
stalled by a long latency operation, the context of the
warp/TB is spilled to spare on-chip resource so that a
new warp/TB can be launched. The switched-out warp/TB
is switched back when another warp/TB is completed or
switched out. With this fine-grain fast context
switching, higher TLP can be supported without
increasing the sizes of critical resources like the
register file. Our experiment shows that the
performance can be improved by up to 47\% and a
geometric mean of 22\% for a set of applications with
unsaturated throughput utilization. Compared to the
state-of-the-art TLP improvement scheme, our proposed
scheme achieves 12\% higher performance on average and
16\% for unsaturated benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zinenko:2018:VPM,
author = "Oleksandr Zinenko and St{\'e}phane Huot and C{\'e}dric
Bastoul",
title = "Visual Program Manipulation in the Polyhedral Model",
journal = j-TACO,
volume = "15",
number = "1",
pages = "16:1--16:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177961",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Parallelism is one of the key performance sources in
modern computer systems. When heuristics-based
automatic parallelization fails to improve performance,
a cumbersome and error-prone manual transformation is
often required. As a solution, we propose an
interactive visual approach building on the polyhedral
model that visualizes exact dependencies and
parallelism; decomposes and replays a complex
automatically computed transformation step by step; and
allows for directly manipulating the visual
representation as a means of transforming the program
with immediate feedback. User studies suggest that our
visualization is understood by experts and nonexperts
alike, and that it may favor an exploratory approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Shihab:2018:RFD,
author = "Mustafa M. Shihab and Jie Zhang and Myoungsoo Jung and
Mahmut Kandemir",
title = "{ReveNAND}: a Fast-Drift-Aware Resilient {$3$D} {NAND}
Flash Design",
journal = j-TACO,
volume = "15",
number = "2",
pages = "17:1--17:??",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3184744",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The paradigm shift from planar (two dimensional (2D))
to vertical (three-dimensional (3D)) models has placed
the NAND flash technology on the verge of a design
evolution that can handle the demands of
next-generation storage applications. However, it also
introduces challenges that may obstruct the realization
of such 3D NAND flash. Specifically, we observed that
the fast threshold drift (fast-drift) in a charge-trap
flash-based 3D NAND cell can make it lose a critical
fraction of the stored charge relatively soon after
programming and generate errors. In this work, we first
present an elastic read reference (V$_{Ref}$) scheme
(ERR) for reducing such errors in ReveNAND-our
fast-drift aware 3D NAND design. To address the
inherent limitation of the adaptive V$_{Ref}$, we
introduce a new intra-block page organization
(hitch-hike) that can enable stronger error correction
for the error-prone pages. In addition, we propose a
novel reinforcement-learning-based smart data refill
scheme (iRefill) to counter the impact of fast-drift
with minimum performance and hardware overhead.
Finally, we present the first analytic model to
characterize fast-drift and evaluate its system-level
impact. Our results show that, compared to conventional
3D NAND design, our ReveNAND can reduce fast-drift
errors by 87\%, on average, and can lower the ECC
latency and energy overheads by 13$ \times $ and 10$
\times $, respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zahedi:2018:MHD,
author = "Seyed Majid Zahedi and Songchun Fan and Benjamin C.
Lee",
title = "Managing Heterogeneous Datacenters with Tokens",
journal = j-TACO,
volume = "15",
number = "2",
pages = "18:1--18:??",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3191821",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Ensuring fairness in a system with scarce, preferred
resources requires time sharing. We consider a
heterogeneous system with a few ``big'' and many
``small'' processors. We allocate heterogeneous
processors using a novel token mechanism, which frames
the allocation problem as a repeated game. At each
round, users request big processors and spend a token
if their request is granted. We analyze the game and
optimize users' strategies to produce an equilibrium.
In equilibrium, allocations balance performance and
fairness. Our mechanism outperforms classical, fair
mechanisms by 1.7$ \times $, on average, in performance
gains, and is competitive with a performance maximizing
mechanism.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pericas:2018:EPA,
author = "Miquel Peric{\`a}s",
title = "{Elastic Places}: an Adaptive Resource Manager for
Scalable and Portable Performance",
journal = j-TACO,
volume = "15",
number = "2",
pages = "19:1--19:??",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3185458",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The diversity and complexity of modern computing
platforms makes the development of high-performance
software challenging. Designing scalable software
requires tuning for a large set of resources, including
cores (parallelism), memory bandwidths, and various
levels of private and shared caches, as well as
developing strategies for optimizing locality. But
highly optimized implementations are often inefficient
when executed on a different platform. This is the
performance portability problem. One approach to
scalability and portability is to tune the amount of
work per task based on runtime overheads and
concurrency. This results in a better balance between
parallelism and scheduling overheads, but it can
neither tune data reuse nor avoid inter-task
interference. We propose a complementary approach that
consists in tuning the amount of resources allocated to
tasks and combine it with software-defined task
topologies to provide portable locality. These ideas
are combined into a low-overhead resource management
scheme called Elastic Places. Elastic Places is
implemented in the XiTAO software framework but the
core ideas are equally applicable to other languages
and runtimes. Experimental results on an AMD-based NUMA
machine and an Intel Knights Landing system show that
elastic places provides both high scalability and
performance portability, with speed-ups of up to 2.3$
\times $ on both platforms compared to state-of-the-art
runtimes.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Olson:2018:CLM,
author = "Matthew Benjamin Olson and Joseph T. Teague and
Divyani Rao and Michael R. JANTZ and Kshitij A. Doshi
and Prasad A. Kulkarni",
title = "Cross-Layer Memory Management to Improve {DRAM} Energy
Efficiency",
journal = j-TACO,
volume = "15",
number = "2",
pages = "20:1--20:??",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3196886",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Controlling the distribution and usage of memory power
is often difficult, because these effects typically
depend on activity across multiple layers of the
vertical execution stack. To address this challenge, we
construct a novel and collaborative framework that
employs object placement, cross-layer communication,
and page-level management to effectively distribute
application objects in the DRAM hardware to achieve
desired power/performance goals. This work describes
the design and implementation of our framework, which
is the first to integrate automatic object profiling
and analysis at the application layer with fine-grained
management of memory hardware resources in the
operating system. We demonstrate the utility of this
framework by employing it to control memory power
consumption more effectively. First, we design a custom
memory-intensive workload to show the potential of this
approach to reduce DRAM energy consumption. Next, we
develop sampling and profiling-based analyses and
modify the code generator in the HotSpot VM to
understand object usage patterns and automatically
control the placement of hot and cold objects in a
partitioned VM heap. This information is communicated
to the operating system, which uses it to map the
logical application pages to the appropriate DRAM
modules according to user-defined provisioning goals.
The evaluation shows that our Java VM-based framework
achieves our goal of significant DRAM energy savings
across a variety of workloads, without any source code
modifications or recompilations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zoni:2018:DEP,
author = "Davide Zoni and Luca Colombo and William Fornaciari",
title = "{DarkCache}: Energy-Performance Optimization of Tiled
Multi-Cores by Adaptively Power-Gating {LLC} Banks",
journal = j-TACO,
volume = "15",
number = "2",
pages = "21:1--21:??",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3186895",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The Last Level Cache (LLC) is a key element to improve
application performance in multi-cores. To handle the
worst case, the main design trend employs tiled
architectures with a large LLC organized in banks,
which goes underutilized in several realistic
scenarios. Our proposal, named DarkCache, aims at
properly powering off such unused banks to optimize the
Energy-Delay Product (EDP) through an adaptive cache
reconfiguration, thus aggressively reducing the leakage
energy. The implemented solution is general and it can
recognize and skip the activation of the DarkCache
policy for the few strong memory intensive applications
that actually require the use of the entire LLC. The
validation has been carried out on 16- and 64-core
architectures also accounting for two state-of-the-art
methodologies. Compared to the baseline solution,
DarkCache exhibits a performance overhead within 2\%
and an average EDP improvement of 32.58\% and 36.41\%
considering 16 and 64 cores, respectively. Moreover,
DarkCache shows an average EDP gain between 16.15\% (16
cores) and 21.05\% (64 cores) compared to the best
state-of-the-art we evaluated, and it confirms a good
scalability since the gain improves with the size of
the architecture.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2018:CNC,
author = "Yang Zhang and Dan Feng and Wei Tong and Yu Hua and
Jingning Liu and Zhipeng Tan and Chengning Wang and
Bing Wu and Zheng Li and Gaoxiang Xu",
title = "{CACF}: a Novel Circuit Architecture Co-optimization
Framework for Improving Performance, Reliability and
Energy of {ReRAM}-based Main Memory System",
journal = j-TACO,
volume = "15",
number = "2",
pages = "22:1--22:??",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3195799",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Emerging Resistive Random Access Memory (ReRAM) is a
promising candidate as the replacement for DRAM due to
its low standby power, high density, high scalability,
and nonvolatility. By employing the unique crossbar
structure, ReRAM can be constructed with extremely high
density. However, the crossbar ReRAM faces some serious
challenges in terms of performance, reliability, and
energy consumption. First, ReRAM's crossbar structure
causes an IR drop problem due to wire resistance and
sneak currents, which results in nonuniform access
latency in ReRAM banks and reduces its reliability.
Second, without access transistors in the crossbar
structure, write disturbance results in serious data
reliability problem. Third, the access latency,
reliability, and energy use of ReRAM arrays are
significantly influenced by the data patterns involved
in a write operation. To overcome the challenges of the
crossbar ReRAM, we propose a novel circuit architecture
co-optimization framework for improving the
performance, reliability, and energy use of ReRAM-based
main memory system, called CACF. The proposed CACF
consists of three levels, including the circuit level,
circuit architecture level, and architecture level. At
the circuit level, to reduce the IR drops along
bitlines, we propose a double-sided write driver design
by applying write drivers along both sides of bitlines
and selectively activating the write drivers. At the
circuit architecture level, to address the write
disturbance with low overheads, we propose a RESET
disturbance detection scheme by adding disturbance
reference cells and conditionally performing refresh
operations. At the architecture level, a region
partition with address remapping method is proposed to
leverage the nonuniform access latency in ReRAM banks,
and two flip schemes are proposed in different regions
to optimize the data patterns involved in a write
operation. The experimental results show that CACF
improves system performance by 26.1\%, decreases memory
access latency by 22.4\%, shortens running time by
20.1\%, and reduces energy consumption by 21.6\% on
average over an aggressive baseline. Meanwhile, CACF
significantly improves the reliability of ReRAM-based
memory systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "22",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Stawinoga:2018:PTC,
author = "Nicolai Stawinoga and Tony Field",
title = "Predictable Thread Coarsening",
journal = j-TACO,
volume = "15",
number = "2",
pages = "23:1--23:??",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3194242",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Thread coarsening on GPUs combines the work of several
threads into one. We show how thread coarsening can be
implemented as a fully automated compile-time
optimisation that estimates the optimal coarsening
factor based on a low-cost, approximate static analysis
of cache line re-use and an occupancy prediction model.
We evaluate two coarsening strategies on three
different NVidia GPU architectures. For NVidia
reduction kernels we achieve a maximum speedup of
5.08x, and for the Rodinia benchmarks we achieve a mean
speedup of 1.30x over 8 of 19 kernels that were
determined safe to coarsen.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Roy:2018:NCN,
author = "Probir Roy and Shuaiwen Leon Song and Sriram
Krishnamoorthy and Abhinav Vishnu and Dipanjan Sengupta
and Xu Liu",
title = "{NUMA-Caffe}: {NUMA}-Aware Deep Learning Neural
Networks",
journal = j-TACO,
volume = "15",
number = "2",
pages = "24:1--24:??",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3199605",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Convolution Neural Networks (CNNs), a special
subcategory of Deep Learning Neural Networks (DNNs),
have become increasingly popular in industry and
academia for their powerful capability in pattern
classification, image processing, and speech
recognition. Recently, they have been widely adopted in
High Performance Computing (HPC) environments for
solving complex problems related to modeling, runtime
prediction, and big data analysis. Current
state-of-the-art designs for DNNs on modern multi- and
many-core CPU architectures, such as variants of Caffe,
have reported promising performance in speedup and
scalability, comparable with the GPU implementations.
However, modern CPU architectures employ Non-Uniform
Memory Access (NUMA) technique to integrate multiple
sockets, which incurs unique challenges for designing
highly efficient CNN frameworks. Without a careful
design, DNN frameworks can easily suffer from long
memory latency due to a large number of memory accesses
to remote NUMA domains, resulting in poor scalability.
To address this challenge, we propose NUMA-aware
multi-solver-based CNN design, named NUMA-Caffe, for
accelerating deep learning neural networks on multi-
and many-core CPU architectures. NUMA-Caffe is
independent of DNN topology, does not impact network
convergence rates, and provides superior scalability to
the existing Caffe variants. Through a thorough
empirical study on four contemporary NUMA-based multi-
and many-core architectures, our experimental results
demonstrate that NUMA-Caffe significantly outperforms
the state-of-the-art Caffe designs in terms of both
throughput and scalability.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ejaz:2018:DDD,
author = "Ahsen Ejaz and Vassilios Papaefstathiou and Ioannis
Sourdis",
title = "{DDRNoC}: Dual Data-Rate Network-on-Chip",
journal = j-TACO,
volume = "15",
number = "2",
pages = "25:1--25:??",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3200201",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article introduces DDRNoC, an on-chip
interconnection network capable of routing packets at
Dual Data Rate. The cycle time of current 2D-mesh
Network-on-Chip routers is limited by their control as
opposed to the datapath (switch and link traversal),
which exhibits significant slack. DDRNoC capitalizes on
this observation, allowing two flits per cycle to share
the same datapath. Thereby, DDRNoC achieves higher
throughput than a Single Data Rate (SDR) network.
Alternatively, using lower voltage circuits, the above
slack can be exploited to reduce power consumption
while matching the SDR network throughput. In addition,
DDRNoC exhibits reduced clock distribution power,
improving energy efficiency, as it needs a slower clock
than a SDR network that routes packets at the same
rate. Post place and route results in 28nm technology
show that, compared to an iso-voltage (1.1V) SDR
network, DDRNoC improves throughput proportionally to
the SDR datapath slack. Moreover, a low-voltage (0.95V)
DDRNoC implementation converts that slack to power
reduction offering the 1.1V SDR throughput at a
substantially lower energy cost.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "25",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cai:2018:ESH,
author = "Ying Cai and Yulong Ao and Chao Yang and Wenjing Ma
and Haitao Zhao",
title = "Extreme-Scale High-Order {WENO} Simulations of {$3$-D}
Detonation Wave with 10 Million Cores",
journal = j-TACO,
volume = "15",
number = "2",
pages = "26:1--26:??",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3209208",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "High-order stencil computations, frequently found in
many applications, pose severe challenges to emerging
many-core platforms due to the complexities of hardware
architectures as well as the sophisticated computing
and data movement patterns. In this article, we tackle
the challenges of high-order WENO computations in
extreme-scale simulations of 3D gaseous waves on Sunway
TaihuLight. We design efficient parallelization
algorithms and present effective optimization
techniques to fully exploit various parallelisms with
reduced memory footprints, enhanced data reuse, and
balanced computation load. Test results show the
optimized code can scale to 9.98 million cores, solving
12.74 trillion unknowns with 23.12 Pflops
double-precision performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "26",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sfakianakis:2018:QPB,
author = "Yannis Sfakianakis and Christos Kozanitis and Christos
Kozyrakis and Angelos Bilas",
title = "{QuMan}: Profile-based Improvement of Cluster
Utilization",
journal = j-TACO,
volume = "15",
number = "3",
pages = "27:1--27:??",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3210560",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Modern data centers consolidate workloads to increase
server utilization and reduce total cost of ownership,
and cope with scaling limitations. However, server
resource sharing introduces performance interference
across applications and, consequently, increases
performance volatility, which negatively affects user
experience. Thus, a challenging problem is to increase
server utilization while maintaining application QoS.
In this article, we present QuMan, a server resource
manager that uses application isolation and profiling
to increase server utilization while controlling
degradation of application QoS. Previous solutions,
either estimate interference across applications and
then restrict colocation to ``compatible''
applications, or assume that application requirements
are known. Instead, QuMan estimates the required
resources of applications. It uses an isolation
mechanism to create properly-sized resource slices for
applications, and arbitrarily colocates applications.
QuMan 's mechanisms can be used with a variety of
admission control policies, and we explore the
potential of two such policies: (1) A policy that
allows users to specify a minimum performance threshold
and (2) an automated policy, which operates without
user input and is based on a new combined
QoS-utilization metric. We implement QuMan on top of
Linux servers, and we evaluate its effectiveness using
containers and real applications. Our single-node
results show that QuMan balances highly effectively the
tradeoff between server utilization and application
performance, as it achieves 80\% server utilization
while the performance of each application does not drop
below 80\% the respective standalone performance. We
also deploy QuMan on a cluster of 100 AWS instances
that are managed by a modified version of the Sparrow
scheduler [37] and, we observe a 48\% increase in
application performance on a highly utilized cluster,
compared to the performance of the same cluster under
the same load when it is managed by native Sparrow or
Apache Mesos.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kayraklioglu:2018:LLA,
author = "Engin Kayraklioglu and Michael P. Ferguson and Tarek
El-Ghazawi",
title = "{LAPPS}: Locality-Aware Productive Prefetching Support
for {PGAS}",
journal = j-TACO,
volume = "15",
number = "3",
pages = "28:1--28:??",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3233299",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Prefetching is a well-known technique to mitigate
scalability challenges in the Partitioned Global
Address Space (PGAS) model. It has been studied as
either an automated compiler optimization or a manual
programmer optimization. Using the PGAS locality
awareness, we define a hybrid tradeoff. Specifically,
we introduce locality-aware productive prefetching
support for PGAS. Our novel, user-driven approach
strikes a balance between the ease-of-use of
compiler-based automated prefetching and the high
performance of the laborious manual prefetching. Our
prototype implementation in Chapel shows that
significant scalability and performance improvements
can be achieved with minimal effort in common
applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "28",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Benatia:2018:BSM,
author = "Akrem Benatia and Weixing Ji and Yizhuo Wang and Feng
Shi",
title = "{BestSF}: a Sparse Meta-Format for Optimizing {SpMV}
on {GPU}",
journal = j-TACO,
volume = "15",
number = "3",
pages = "29:1--29:??",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3226228",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The Sparse Matrix-Vector Multiplication (SpMV) kernel
dominates the computing cost in numerous scientific
applications. Many implementations based on different
sparse formats were proposed to improve this kernel on
the recent GPU architectures. However, it has been
widely observed that there is no ``best-for-all''
sparse format for the SpMV kernel on GPU. Indeed,
serious performance degradation of an order of
magnitude can be observed without a careful selection
of the sparse format to use. To address this problem,
we propose in this article BestSF (Best Sparse Format),
a new learning-based sparse meta-format that
automatically selects the most appropriate sparse
format for a given input matrix. To do so, BestSF
relies on a cost-sensitive classification system
trained using Weighted Support Vector Machines (WSVMs)
to predict the best sparse format for each input sparse
matrix. Our experimental results on two different
NVIDIA GPU architectures using a large number of
real-world sparse matrices show that BestSF achieved a
noticeable overall performance improvement over using a
single sparse format. While BestSF is trained to select
the best sparse format in terms of performance
(GFLOPS), our further experimental investigations
revealed that using BestSF also led, in most of the
test cases, to the best energy efficiency (MFLOPS/W).
To prove its practical effectiveness, we also evaluate
the performance and energy efficiency improvement
achieved when using BestSF as a building block in a
GPU-based Preconditioned Conjugate Gradient (PCG)
iterative solver.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "29",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Michaud:2018:ATL,
author = "Pierre Michaud",
title = "An Alternative {TAGE}-like Conditional Branch
Predictor",
journal = j-TACO,
volume = "15",
number = "3",
pages = "30:1--30:??",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3226098",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "TAGE is one of the most accurate conditional branch
predictors known today. However, TAGE does not exploit
its input information perfectly, as it is possible to
obtain significant prediction accuracy improvements by
complementing TAGE with a statistical corrector using
the same input information. This article proposes an
alternative TAGE-like predictor making statistical
correction practically superfluous.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "30",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Garland:2018:LCM,
author = "James Garland and David Gregg",
title = "Low Complexity Multiply-Accumulate Units for
Convolutional Neural Networks with Weight-Sharing",
journal = j-TACO,
volume = "15",
number = "3",
pages = "31:1--31:??",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3233300",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Convolutional neural networks (CNNs) are one of the
most successful machine-learning techniques for image,
voice, and video processing. CNNs require large amounts
of processing capacity and memory bandwidth. Hardware
accelerators have been proposed for CNNs that typically
contain large numbers of multiply-accumulate (MAC)
units, the multipliers of which are large in integrated
circuit (IC) gate count and power consumption.
``Weight-sharing'' accelerators have been proposed
where the full range of weight values in a trained CNN
are compressed and put into bins, and the bin index is
used to access the weight-shared value. We reduce power
and area of the CNN by implementing parallel accumulate
shared MAC (PASM) in a weight-shared CNN. PASM
re-architects the MAC to instead count the frequency of
each weight and place it in a bin. The accumulated
value is computed in a subsequent multiply phase,
significantly reducing gate count and power consumption
of the CNN. In this article, we implement PASM in a
weight-shared CNN convolution hardware accelerator and
analyze its effectiveness. Experiments show that for a
clock speed 1GHz implemented on a 45nm ASIC process our
approach results in fewer gates, smaller logic, and
reduced power with only a slight increase in latency.
We also show that the same weight-shared-with-PASM CNN
accelerator can be implemented in resource-constrained
FPGAs, where the FPGA has limited numbers of digital
signal processor (DSP) units to accelerate the MAC
operations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "31",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kim:2018:CEC,
author = "Hyojong Kim and Ramyad Hadidi and Lifeng Nai and
Hyesoon Kim and Nuwan Jayasena and Yasuko Eckert and
Onur Kayiran and Gabriel Loh",
title = "{CODA}: Enabling Co-location of Computation and Data
for Multiple {GPU} Systems",
journal = j-TACO,
volume = "15",
number = "3",
pages = "32:1--32:??",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3232521",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "To exploit parallelism and scalability of multiple
GPUs in a system, it is critical to place compute and
data together. However, two key techniques that have
been used to hide memory latency and improve
thread-level parallelism (TLP), memory interleaving,
and thread block scheduling, in traditional GPU systems
are at odds with efficient use of multiple GPUs.
Distributing data across multiple GPUs to improve
overall memory bandwidth utilization incurs high remote
traffic when the data and compute are misaligned.
Nondeterministic thread block scheduling to improve
compute resource utilization impedes co-placement of
compute and data. Our goal in this work is to enable
co-placement of compute and data in the presence of
fine-grained interleaved memory with a low-cost
approach. To this end, we propose a mechanism that
identifies exclusively accessed data and place the data
along with the thread block that accesses it in the
same GPU. The key ideas are (1) the amount of data
exclusively used by a thread block can be estimated,
and that exclusive data (of any size) can be localized
to one GPU with coarse-grained interleaved pages; (2)
using the affinity-based thread block scheduling
policy, we can co-place compute and data together; and
(3) by using dual address mode with lightweight changes
to virtual to physical page mappings, we can
selectively choose different interleaved memory pages
for each data structure. Our evaluations across a wide
range of workloads show that the proposed mechanism
improves performance by 31\% and reduces 38\% remote
traffic over a baseline system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "32",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Manivannan:2018:GDB,
author = "Madhavan Manivannan and Miquel Peric{\'a}s and
Vassilis Papaefstathiou and Per Stenstr{\"o}m",
title = "Global Dead-Block Management for Task-Parallel
Programs",
journal = j-TACO,
volume = "15",
number = "3",
pages = "33:1--33:??",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3234337",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Task-parallel programs inefficiently utilize the cache
hierarchy due to the presence of dead blocks in caches.
Dead blocks may occupy cache space in multiple cache
levels for a long time without providing any utility
until they are finally evicted. Existing dead-block
prediction schemes take decisions locally for each
cache level and do not efficiently manage the entire
cache hierarchy. This article introduces
runtime-orchestrated global dead-block management, in
which static and dynamic information about tasks
available to the runtime system is used to effectively
detect and manage dead blocks across the cache
hierarchy. In the proposed global management schemes,
static information (e.g., when tasks start/finish, and
what data regions tasks produce/consume) is combined
with dynamic information to detect when/where blocks
become dead. When memory regions are deemed dead at
some cache level(s), all the associated cache blocks
are evicted from the corresponding level(s). We extend
the cache controllers at both private and shared cache
levels to use the aforementioned information to evict
dead blocks. The article does an extensive evaluation
of both inclusive and non-inclusive cache hierarchies
and shows that the proposed global schemes outperform
existing local dead-block management schemes.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "33",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gareev:2018:HPG,
author = "Roman Gareev and Tobias Grosser and Michael Kruse",
title = "High-Performance Generalized Tensor Operations: a
Compiler-Oriented Approach",
journal = j-TACO,
volume = "15",
number = "3",
pages = "34:1--34:??",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3235029",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The efficiency of tensor contraction is of great
importance. Compilers cannot optimize it well enough to
come close to the performance of expert-tuned
implementations. All existing approaches that provide
competitive performance require optimized external
code. We introduce a compiler optimization that reaches
the performance of optimized BLAS libraries without the
need for an external implementation or automatic
tuning. Our approach provides competitive performance
across hardware architectures and can be generalized to
deliver the same benefits for algebraic path problems.
By making fast linear algebra kernels available to
everyone, we expect productivity increases when
optimized libraries are not available.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "34",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yviquel:2018:CPU,
author = "Herv{\'e} Yviquel and Lauro Cruz and Guido Araujo",
title = "Cluster Programming using the {OpenMP} Accelerator
Model",
journal = j-TACO,
volume = "15",
number = "3",
pages = "35:1--35:??",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3226112",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Computation offloading is a programming model in which
program fragments (e.g., hot loops) are annotated so
that their execution is performed in dedicated hardware
or accelerator devices. Although offloading has been
extensively used to move computation to GPUs, through
directive-based annotation standards like OpenMP,
offloading computation to very large computer clusters
can become a complex and cumbersome task. It typically
requires mixing programming models (e.g., OpenMP and
MPI) and languages (e.g., C/C++ and Scala), dealing
with various access control mechanisms from different
cloud providers (e.g., AWS and Azure), and integrating
all this into a single application. This article
introduces computer cluster nodes as simple OpenMP
offloading devices that can be used either from a local
computer or from the cluster head-node. It proposes a
methodology that transforms OpenMP directives to Spark
runtime calls with fully integrated communication
management, in a way that a cluster appears to the
programmer as yet another accelerator device.
Experiments using LLVM 3.8, OpenMP 4.5 on well known
cloud infrastructures (Microsoft Azure and Amazon EC2)
show the viability of the proposed approach, enable a
thorough analysis of its performance, and make a
comparison with an MPI implementation. The results show
that although data transfers can impose overheads,
cloud offloading from a local machine can still achieve
promising speedups for larger granularity: up to 115$
\times $ in 256 cores for the 2MM benchmark using 1GB
sparse matrices. In addition, the parallel
implementation of a complex and relevant scientific
application reveals a 80$ \times $ speedup on a 320
core machine when executed directly from the headnode
of the cluster.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "35",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tavana:2018:BCA,
author = "Mohammad Khavari Tavana and Amir Kavyan Ziabari and
David Kaeli",
title = "Block Cooperation: Advancing Lifetime of Resistive
Memories by Increasing Utilization of Error Correcting
Codes",
journal = j-TACO,
volume = "15",
number = "3",
pages = "36:1--36:??",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3243906",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Block-level cooperation is an endurance management
technique that operates on top of error correction
mechanisms to extend memory lifetimes. Once an error
recovery scheme fails to recover from faults in a data
block, the entire physical page associated with that
block is disabled and becomes unavailable to the
physical address space. To reduce the page waste caused
by early block failures, other blocks can be used to
support the failed block, working cooperatively to keep
it alive and extend the faulty page's lifetime. We
combine the proposed technique with existing error
recovery schemes, such as Error Correction Pointers
(ECP) and Aegis, to increase memory lifetimes. Block
cooperation is realized through metadata sharing in
ECP, where one data block shares its unused metadata
with another data block. When combined with Aegis,
block cooperation is realized through reorganizing data
layout, where blocks possessing few faults come to the
aid of failed blocks, bringing them back from the dead.
Our evaluation using Monte Carlo simulation shows that
block cooperation at a single level (or multiple
levels) on top of ECP and Aegis, boosts memory
lifetimes by 28\% (37\%) and 8\% (14\%) on average,
respectively. Furthermore, using trace-driven benchmark
evaluation shows that lifetime boost can reach to 68\%
(30\%) exploiting metadata sharing (or data layout
reorganization).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "36",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jin:2018:LCM,
author = "Hai Jin and Bo Liu and Wenbin Jiang and Yang Ma and
Xuanhua Shi and Bingsheng He and Shaofeng Zhao",
title = "Layer-Centric Memory Reuse and Data Migration for
Extreme-Scale Deep Learning on Many-Core
Architectures",
journal = j-TACO,
volume = "15",
number = "3",
pages = "37:1--37:??",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3243904",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Due to the popularity of Deep Neural Network (DNN)
models, we have witnessed extreme-scale DNN models with
the continued increase of the scale in terms of depth
and width. However, the extremely high memory
requirements for them make it difficult to run the
training processes on single many-core architectures
such as a Graphic Processing Unit (GPU), which compels
researchers to use model parallelism over multiple GPUs
to make it work. However, model parallelism always
brings very heavy additional overhead. Therefore,
running an extreme-scale model in a single GPU is
urgently required. There still exist several challenges
to reduce the memory footprint for extreme-scale deep
learning. To address this tough problem, we first
identify the memory usage characteristics for deep and
wide convolutional networks, and demonstrate the
opportunities for memory reuse at both the intra-layer
and inter-layer levels. We then present Layrub, a
runtime data placement strategy that orchestrates the
execution of the training process. It achieves
layer-centric reuse to reduce memory consumption for
extreme-scale deep learning that could not previously
be run on a single GPU. Experiments show that, compared
to the original Caffe, Layrub can cut down the memory
usage rate by an average of 58.2\% and by up to 98.9\%,
at the moderate cost of 24.1\% higher training
execution time on average. Results also show that
Layrub outperforms some popular deep learning systems
such as GeePS, vDNN, MXNet, and Tensorflow. More
importantly, Layrub can tackle extreme-scale deep
learning tasks. For example, it makes an extra-deep
ResNet with 1,517 layers that can be trained
successfully in one GPU with 12GB memory, while other
existing deep learning systems cannot.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "37",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Voitsechov:2018:SDT,
author = "Dani Voitsechov and Arslan Zulfiqar and Mark
Stephenson and Mark Gebhart and Stephen W. Keckler",
title = "Software-Directed Techniques for Improved {GPU}
Register File Utilization",
journal = j-TACO,
volume = "15",
number = "3",
pages = "38:1--38:??",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3243905",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Throughput architectures such as GPUs require
substantial hardware resources to hold the state of a
massive number of simultaneously executing threads.
While GPU register files are already enormous, reaching
capacities of 256KB per streaming multiprocessor (SM),
we find that nearly half of real-world applications we
examined are register-bound and would benefit from a
larger register file to enable more concurrent threads.
This article seeks to increase the thread occupancy and
improve performance of these register-bound
applications by making more efficient use of the
existing register file capacity. Our first technique
eagerly deallocates register resources during
execution. We show that releasing register resources
based on value liveness as proposed in prior states of
the art leads to unreliable performance and undue
design complexity. To address these deficiencies, our
article presents a novel compiler-driven approach that
identifies and exploits last use of a register name
(instead of the value contained within) to eagerly
release register resources. Furthermore, while previous
works have leveraged ``scalar'' and ``narrow'' operand
properties of a program for various optimizations,
their impact on thread occupancy has been relatively
unexplored. Our article evaluates the effectiveness of
these techniques in improving thread occupancy and
demonstrates that while any one approach may fail to
free very many registers, together they synergistically
free enough registers to launch additional parallel
work. An in-depth evaluation on a large suite of
applications shows that just our early register
technique outperforms previous work on dynamic register
allocation, and together these approaches, on average,
provide 12\% performance speedup (23\% higher thread
occupancy) on register bound applications not already
saturating other GPU resources.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "38",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lin:2018:GTD,
author = "Huanxin Lin and Cho-Li Wang and Hongyuan Liu",
title = "{On-GPU} Thread-Data Remapping for Branch Divergence
Reduction",
journal = j-TACO,
volume = "15",
number = "3",
pages = "39:1--39:??",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3242089",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "General Purpose GPU computing (GPGPU) plays an
increasingly vital role in high performance computing
and other areas like deep learning. However, arising
from the SIMD execution model, the branch divergence
issue lowers efficiency of conditional branching on
GPUs, and hinders the development of GPGPU. To achieve
runtime on-the-spot branch divergence reduction, we
propose the first on-GPU thread-data remapping scheme.
Before kernel launching, our solution inserts codes
into GPU kernels immediately before each target branch
so as to acquire actual runtime divergence information.
GPU software threads can be remapped to datasets
multiple times during single kernel execution. We
propose two thread-data remapping algorithms that are
tailored to the GPU architecture. Effective on two
generations of GPUs from both NVIDIA and AMD, our
solution achieves speedups up to 2.718 with third-party
benchmarks. We also implement three GPGPU frontier
benchmarks from areas including computer vision,
algorithmic trading and data analytics. They are
hindered by more complex divergence coupled with
different memory access patterns, and our solution
works better than the traditional thread-data remapping
scheme in all cases. As a compiler-assisted runtime
solution, it can better reduce divergence for divergent
applications that gain little acceleration on GPUs for
the time being.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kronawitter:2019:PSS,
author = "Stefan Kronawitter and Christian Lengauer",
title = "Polyhedral Search Space Exploration in the
{ExaStencils} Code Generator",
journal = j-TACO,
volume = "15",
number = "4",
pages = "40:1--40:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3274653",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Performance optimization of stencil codes requires
data locality improvements. The polyhedron model for
loop transformation is well suited for such
optimizations with established techniques, such as the
PLuTo algorithm and diamond tiling. However, in the
domain of our project ExaStencils, stencil codes, it
fails to yield optimal results. As an alternative, we
propose a new, optimized, multi-dimensional polyhedral
search space exploration and demonstrate its
effectiveness: we obtain better results than existing
approaches in several cases. We also propose how to
specialize the search for the domain of stencil codes,
which dramatically reduces the exploration effort
without significantly impairing performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "40",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xu:2019:PTA,
author = "Jingheng Xu and Haohuan Fu and Wen Shi and Lin Gan and
Yuxuan Li and Wayne Luk and Guangwen Yang",
title = "Performance Tuning and Analysis for Stencil-Based
Applications on {POWER8} Processor",
journal = j-TACO,
volume = "15",
number = "4",
pages = "41:1--41:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3264422",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article demonstrates an approach for combining
general tuning techniques with the POWER8 hardware
architecture through optimizing three representative
stencil benchmarks. Two typical real-world
applications, with kernels similar to those of the
winning programs of the Gordon Bell Prize 2016 and
2017, are employed to illustrate algorithm
modifications and a combination of hardware-oriented
tuning strategies with the application algorithms. This
work fills the gap between hardware capability and
software performance of the POWER8 processor, and
provides useful guidance for optimizing stencil-based
scientific applications on POWER systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "41",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2019:SSS,
author = "Jiajun Wang and Reena Panda and Lizy K. John",
title = "{SelSMaP}: a Selective Stride Masking Prefetching
Scheme",
journal = j-TACO,
volume = "15",
number = "4",
pages = "42:1--42:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3274650",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Data prefetching, which intelligently loads data
closer to the processor before demands, is a popular
cache performance optimization technique to address the
increasing processor-memory performance gap. Although
prefetching concepts have been proposed for decades,
sophisticated system architecture and emerging
applications introduce new challenges. Large
instruction windows coupled with out-of-order execution
makes the program data access sequence distorted from a
cache perspective. Furthermore, big data applications
stress memory subsystems heavily with their large
working set sizes and complex data access patterns. To
address such challenges, this work proposes a
high-performance hardware prefetching scheme, SelSMaP.
SelSMaP is able to detect both regular and nonuniform
stride patterns by taking the minimum observed address
offset (called a reference stride) as a heuristic. A
stride masking is generated according to the reference
stride and is to filter out history accesses whose
pattern can be rephrased as uniform stride accesses.
Prefetching decision and prefetch degree are determined
based on the masking outcome. As SelSMaP prediction
logic does not rely on the chronological order of data
accesses or program counter information, it is able to
unveil the effect of out-of-order execution and
compiler optimization. We evaluated SelSMaP with
CloudSuite workloads and SPEC CPU2006 benchmarks.
SelSMaP achieves an average CloudSuite performance
improvement of 30\% over nonprefetching systems. With
one to two orders of magnitude less storage and much
less functional logic, SelSMaP outperforms the
highest-performing prefetcher by 8.6\% in CloudSuite
workloads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "42",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Su:2019:SSC,
author = "Xing Su and Xiangke Liao and Hao Jiang and Canqun Yang
and Jingling Xue",
title = "{SCP}: Shared Cache Partitioning for High-Performance
{GEMM}",
journal = j-TACO,
volume = "15",
number = "4",
pages = "43:1--43:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3274654",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "GEneral Matrix Multiply (GEMM) is the most fundamental
computational kernel routine in the BLAS library. To
achieve high performance, in-memory data must be
prefetched into fast on-chip caches before they are
used. Two techniques, software prefetching and data
packing, have been used to effectively exploit the
capability of on-chip least recent used (LRU) caches,
which are popular in traditional high-performance
processors used in high-end servers and supercomputers.
However, the market has recently witnessed a new
diversity in processor design, resulting in
high-performance processors equipped with shared caches
with non-LRU replacement policies. This poses a
challenge to the development of high-performance GEMM
in a multithreaded context. As several threads try to
load data into a shared cache simultaneously,
interthread cache conflicts will increase
significantly. We present a Shared Cache Partitioning
(SCP) method to eliminate interthread cache conflicts
in the GEMM routines, by partitioning a shared cache
into physically disjoint sets and assigning different
sets to different threads. We have implemented SCP in
the OpenBLAS library and evaluated it on Phytium 2000+,
a 64-core AArch64 processor with private LRU L1 caches
and shared pseudo-random L2 caches (per four-core
cluster). Our evaluation shows that SCP has effectively
reduced the conflict misses in both L1 and L2 caches in
a highly optimized GEMM implementation, resulting in an
improvement of its performance by 2.75\% to 6.91\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "43",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pereira:2019:SPS,
author = "Fernando Magno Quint{\~a}o Pereira and Guilherme
Vieira Leobas and Abdoulaye Gamati{\'e}",
title = "Static Prediction of Silent Stores",
journal = j-TACO,
volume = "15",
number = "4",
pages = "44:1--44:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3280848",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A store operation is called ``silent'' if it writes in
memory a value that is already there. The ability to
detect silent stores is important, because they might
indicate performance bugs, might enable code
optimizations, and might reveal opportunities of
automatic parallelization, for instance. Silent stores
are traditionally detected via profiling tools. In this
article, we depart from this methodology and instead
explore the following question: is it possible to
predict silentness by analyzing the syntax of programs?
The process of building an answer to this question is
interesting in itself, given the stochastic nature of
silent stores, which depend on data and coding style.
To build such an answer, we have developed a
methodology to classify store operations in terms of
syntactic features of programs. Based on such features,
we develop different kinds of predictors, some of which
go much beyond what any trivial approach could achieve.
To illustrate how static prediction can be employed in
practice, we use it to optimize programs running on
nonvolatile memory systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "44",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Crago:2019:EMA,
author = "Neal C. Crago and Mark Stephenson and Stephen W.
Keckler",
title = "Exposing Memory Access Patterns to Improve Instruction
and Memory Efficiency in {GPUs}",
journal = j-TACO,
volume = "15",
number = "4",
pages = "45:1--45:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3280851",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Modern computing workloads often have high memory
intensity, requiring high bandwidth access to memory.
The memory request patterns of these workloads vary and
include regular strided accesses and indirect
(pointer-based) accesses. Such applications require a
large number of address generation instructions and a
high degree of memory-level parallelism. This article
proposes new memory instructions that exploit strided
and indirect memory request patterns and improve
efficiency in GPU architectures. The new instructions
reduce address calculation instructions by offloading
addressing to dedicated hardware, and reduce
destructive memory request interference by grouping
related requests together. Our results show that we can
eliminate 33\% of dynamic instructions across 16 GPU
benchmarks. These improvements result in an overall
runtime improvement of 26\%, an energy reduction of
18\%, and a reduction in energy-delay product of
32\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "45",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2019:PPB,
author = "Feng Zhang and Jingling Xue",
title = "{Poker}: Permutation-Based {SIMD} Execution of
Intensive Tree Search by Path Encoding",
journal = j-TACO,
volume = "15",
number = "4",
pages = "46:1--46:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3280850",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We introduce Poker, a permutation-based approach for
vectorizing multiple queries over B$^+$-trees. Our key
insight is to combine vector loads and
path-encoding-based permutations to alleviate memory
latency while keeping the number of key comparisons
needed for a query to a minimum. Implemented as a C++
template library, Poker represents a general-purpose
solution for vectorizing the queries over indexing
trees on multi-core processors equipped with SIMD
units. For a set of five representative benchmarks
evaluated with 24 configurations each, Poker
outperforms the state of the art by 2.11x with one
single thread and 2.28x with eight threads on an Intel
Broadwell processor that supports 256-bit AVX2, on
average. In addition, strip-mining queries will further
improve Poker's performance by 1.21x (with one single
thread) and 1.31x (with eight threads), on average.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "46",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Belleville:2019:ASP,
author = "Nicolas Belleville and Damien Courouss{\'e} and Karine
Heydemann and Henri-Pierre Charles",
title = "Automated Software Protection for the Masses Against
Side-Channel Attacks",
journal = j-TACO,
volume = "15",
number = "4",
pages = "47:1--47:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3281662",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We present an approach and a tool to answer the need
for effective, generic, and easily applicable
protections against side-channel attacks. The
protection mechanism is based on code polymorphism, so
that the observable behaviour of the protected
component is variable and unpredictable to the
attacker. Our approach combines lightweight specialized
runtime code generation with the optimization
capabilities of static compilation. It is extensively
configurable. Experimental results show that programs
secured by our approach present strong security levels
and meet the performance requirements of constrained
systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "47",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yu:2019:ITL,
author = "Chao Yu and Yuebin Bai and Qingxiao Sun and Hailong
Yang",
title = "Improving Thread-level Parallelism in {GPUs} Through
Expanding Register File to Scratchpad Memory",
journal = j-TACO,
volume = "15",
number = "4",
pages = "48:1--48:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3280849",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Modern Graphic Processing Units (GPUs) have become
pervasive computing devices in datacenters due to their
high performance with massive thread level parallelism
(TLP). GPUs are equipped with large register files (RF)
to support fast context switch between massive threads
and scratchpad memory (SPM) to support inter-thread
communication within the cooperative thread array
(CTA). However, the TLP of GPUs is usually limited by
the inefficient resource management of register file
and scratchpad memory. This inefficiency also leads to
register file and scratchpad memory underutilization.
To overcome the above inefficiency, we propose a new
resource management approach EXPARS for GPUs. EXPARS
provides a larger register file logically by expanding
the register file to scratchpad memory. When the
available register file becomes limited, our approach
leverages the underutilized scratchpad memory to
support additional register allocation. Therefore, more
CTAs can be dispatched to SMs, which improves the GPU
utilization. Our experiments on representative
benchmark suites show that the number of CTAs
dispatched to each SM increases by 1.28$ \times $ on
average. In addition, our approach improves the GPU
resource utilization significantly, with the register
file utilization improved by 11.64\% and the scratchpad
memory utilization improved by 48.20\% on average. With
better TLP, our approach achieves 20.01\% performance
improvement on average with negligible energy
overhead.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "48",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Orosa:2019:AAF,
author = "Lois Orosa and Rodolfo Azevedo and Onur Mutlu",
title = "{AVPP}: Address-first Value-next Predictor with Value
Prefetching for Improving the Efficiency of Load Value
Prediction",
journal = j-TACO,
volume = "15",
number = "4",
pages = "49:1--49:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3239567",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Value prediction improves instruction level
parallelism in superscalar processors by breaking true
data dependencies. Although this technique can
significantly improve overall performance, most of the
state-of-the-art value prediction approaches require
high hardware cost, which is the main obstacle for its
wide adoption in current processors. To tackle this
issue, we revisit load value prediction as an efficient
alternative to the classical approaches that predict
all instructions. By speculating only on loads, the
pressure over shared resources (e.g., the Physical
Register File) and the predictor size can be
substantially reduced (e.g., more than 90\% reduction
compared to recent works). We observe that existing
value predictors cannot achieve very high performance
when speculating only on load instructions. To solve
this problem, we propose a new, accurate and low-cost
mechanism for predicting the values of load
instructions: the Address-first Value-next Predictor
with Value Prefetching (AVPP). The key idea of our
predictor is to predict the load address first (which,
we find, is much more predictable than the value) and
to use a small non-speculative Value Table (VT)-indexed
by the predicted address-to predict the value next. To
increase the coverage of AVPP, we aim to increase the
hit rate of the VT by predicting also the load address
of a future instance of the same load instruction and
prefetching its value in the VT. We show that AVPP is
relatively easy to implement, requiring only 2.5\% of
the area of a 32KB L1 data cache. We compare our
mechanism with five state-of-the-art value prediction
techniques, evaluated within the context of load value
prediction, in a relatively narrow out-of-order
processor. On average, our AVPP predictor achieves
11.2\% speedup and 3.7\% of energy savings over the
baseline processor, outperforming all the
state-of-the-art predictors in 16 of the 23 benchmarks
we evaluate. We evaluate AVPP implemented together with
different prefetching techniques, showing additive
performance gains (20\% average speedup). In addition,
we propose a new taxonomy to classify different value
predictor policies regarding predictor update,
predictor availability, and in-flight pending updates.
We evaluate these policies in detail.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "49",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2019:REU,
author = "Jun Zhang and Rui Hou and Wei Song and Sally A. Mckee
and Zhen Jia and Chen Zheng and Mingyu Chen and Lixin
Zhang and Dan Meng",
title = "{RAGuard}: an Efficient and User-Transparent Hardware
Mechanism against {ROP} Attacks",
journal = j-TACO,
volume = "15",
number = "4",
pages = "50:1--50:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3280852",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/prng.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Control-flow integrity (CFI) is a general method for
preventing code-reuse attacks, which utilize benign
code sequences to achieve arbitrary code execution. CFI
ensures that the execution of a program follows the
edges of its predefined static Control-Flow Graph: any
deviation that constitutes a CFI violation terminates
the application. Despite decades of research effort,
there are still several implementation challenges in
efficiently protecting the control flow of function
returns (Return-Oriented Programming attacks). The set
of valid return addresses of frequently called
functions can be large and thus an attacker could bend
the backward-edge CFI by modifying an indirect branch
target to another within the valid return set. This
article proposes RAGuard, an efficient and
user-transparent hardware-based approach to prevent
Return-Oriented Programming attacks. RAGuard binds a
message authentication code (MAC) to each return
address to protect its integrity. To guarantee the
security of the MAC and reduce runtime overhead:
RAGuard (1) computes the MAC by encrypting the
signature of a return address with AES-128, (2)
develops a key management module based on a Physical
Unclonable Function (PUF) and a True Random Number
Generator (TRNG), and (3) uses a dedicated register to
reduce MACs' load and store operations of leaf
functions. We have evaluated our mechanism based on the
open-source LEON3 processor and the results show that
RAGuard incurs acceptable performance overhead and
occupies reasonable area.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "50",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2019:GGC,
author = "Ping Wang and Luke Mchale and Paul V. Gratz and Alex
Sprintson",
title = "{GenMatcher}: a Generic Clustering-Based Arbitrary
Matching Framework",
journal = j-TACO,
volume = "15",
number = "4",
pages = "51:1--51:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3281663",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Packet classification methods rely upon packet
content/header matching against rules. Thus, throughput
of matching operations is critical in many networking
applications. Further, with the advent of Software
Defined Networking (SDN), efficient implementation of
software approaches to matching are critical for the
overall system performance. This article presents$^1$
GenMatcher, a generic, software-only, arbitrary
matching framework for fast, efficient searches. The
key idea of our approach is to represent arbitrary
rules with efficient prefix-based tries. To support
arbitrary wildcards, we rearrange bits within the rules
such that wildcards accumulate to one side of the
bitstring. Since many non-contiguous wildcards often
remain, we use multiple prefix-based tries. The main
challenge in this context is to generate efficient trie
groupings and expansions to support all arbitrary
rules. Finding an optimal mix of grouping and expansion
is an NP-complete problem. Our contribution includes a
novel, clustering-based grouping algorithm to group
rules based upon their bit-level similarities. Our
algorithm generates near-optimal trie groupings with
low configuration times and provides significantly
higher match throughput compared to prior techniques.
Experiments with synthetic traffic show that our method
can achieve a 58.9X speedup compared to the baseline on
a single core processor under a given memory
constraint.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "51",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hong:2019:PTG,
author = "Ding-Yong Hong and Jan-Jan Wu and Yu-Ping Liu and
Sheng-Yu Fu and Wei-Chung Hsu",
title = "Processor-Tracing Guided Region Formation in Dynamic
Binary Translation",
journal = j-TACO,
volume = "15",
number = "4",
pages = "52:1--52:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3281664",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Region formation is an important step in dynamic
binary translation to select hot code regions for
translation and optimization. The quality of the formed
regions determines the extent of optimizations and thus
determines the final execution performance. Moreover,
the overall performance is very sensitive to the
formation overhead, because region formation can have a
non-trivial cost. For addressing the dual issues of
region quality and region formation overhead, this
article presents a lightweight region formation method
guided by processor tracing, e.g., Intel PT. We
leverage the branch history information stored in the
processor to reconstruct the program execution profile
and effectively form high-quality regions with low
cost. Furthermore, we present the designs of
lightweight hardware performance monitoring sampling
and the branch instruction decode cache to minimize
region formation overhead. Using ARM64 to x86-64
translations, the experiment results show that our
method achieves a performance speedup of up to 1.53$
\times $ (1.16$ \times $ on average) for SPEC CPU2006
benchmarks with reference inputs, compared to the
well-known software-based trace formation method, Next
Executing Tail (NET). The performance results of x86-64
to ARM64 translations also show a speedup of up to
1.25$ \times $ over NET for CINT2006 benchmarks with
reference inputs. The comparison with a relaxed NETPlus
region formation method further demonstrates that our
method achieves the best performance and lowest
compilation overhead.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2019:PNW,
author = "Yu Wang and Victor Lee and Gu-Yeon Wei and David
Brooks",
title = "Predicting New Workload or {CPU} Performance by
Analyzing Public Datasets",
journal = j-TACO,
volume = "15",
number = "4",
pages = "53:1--53:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3284127",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The marketplace for general-purpose microprocessors
offers hundreds of functionally similar models,
differing by traits like frequency, core count, cache
size, memory bandwidth, and power consumption. Their
performance depends not only on microarchitecture, but
also on the nature of the workloads being executed.
Given a set of intended workloads, the consumer needs
both performance and price information to make rational
buying decisions. Many benchmark suites have been
developed to measure processor performance, and their
results for large collections of CPUs are often
publicly available. However, repositories of benchmark
results are not always helpful when consumers need
performance data for new processors or new workloads.
Moreover, the aggregate scores for benchmark suites
designed to cover a broad spectrum of workload types
can be misleading. To address these problems, we have
developed a deep neural network (DNN) model, and we
have used it to learn the relationship between the
specifications of Intel CPUs and their performance on
the SPEC CPU2006 and Geekbench 3 benchmark suites. We
show that we can generate useful predictions for new
processors and new workloads. We also cross-predict the
two benchmark suites and compare their performance
scores. The results quantify the self-similarity of
these suites for the first time in the literature. This
work should discourage consumers from basing purchasing
decisions exclusively on Geekbench 3, and it should
encourage academics to evaluate research using more
diverse workloads than the SPEC CPU suites alone.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "53",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Park:2019:ROC,
author = "Hyukwoo Park and Sungkook Kim and Jung-Geun Park and
Soo-Mook Moon",
title = "Reusing the Optimized Code for {JavaScript}
Ahead-of-Time Compilation",
journal = j-TACO,
volume = "15",
number = "4",
pages = "54:1--54:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3291056",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As web pages and web apps increasingly include heavy
JavaScript code, JavaScript performance has been a
critical issue. Modern JavaScript engines achieve a
remarkable performance by employing tiered-execution
architecture based on interpreter, baseline
just-in-time compiler (JITC), and optimizing JITC.
Unfortunately, they suffer from a substantial
compilation overhead, which can take more than 50\% of
the whole running time. A simple idea to reduce the
compilation overhead is ahead-of-time compilation
(AOTC), which reuses the code generated in the previous
run. In fact, existing studies that reuse the bytecode
generated by the interpreter or the machine code
generated by the baseline JITC have shown tangible
performance benefits [12, 31, 41]. However, there has
been no study to reuse the machine code generated by
the optimizing JITC, which heavily uses profile-based
optimizations, thus not easily reusable. We propose a
novel AOTC that can reuse the optimized machine code
for high-performance JavaScript engines. Unlike
previous AOTCs, we need to resolve a few challenging
issues related to reusing profile-based optimized code
and relocating dynamic addresses. Our AOTC improves the
performance of a commercial JavaScript engine by 6.36
times (max) and 1.99 times (average) for Octane
benchmarks, by reducing the compilation overhead and by
running the optimized code from the first invocation of
functions. It also improves the loading time of six web
apps by 1.28 times, on average.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "54",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhao:2019:BLA,
author = "Han Zhao and Quan Chen and Yuxian Qiu and Ming Wu and
Yao Shen and Jingwen Leng and Chao Li and Minyi Guo",
title = "Bandwidth and Locality Aware Task-stealing for
Manycore Architectures with Bandwidth-Asymmetric
Memory",
journal = j-TACO,
volume = "15",
number = "4",
pages = "55:1--55:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3291058",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Parallel computers now start to adopt
Bandwidth-Asymmetric Memory architecture that consists
of traditional DRAM memory and new High Bandwidth
Memory (HBM) for high memory bandwidth. However,
existing task schedulers suffer from low bandwidth
usage and poor data locality problems in
bandwidth-asymmetric memory architectures. To solve the
two problems, we propose a Bandwidth and Locality Aware
Task-stealing (BATS) system, which consists of an
HBM-aware data allocator, a bandwidth-aware traffic
balancer, and a hierarchical task-stealing scheduler.
Leveraging compile-time code transformation and
run-time data distribution, the data allocator enables
HBM usage automatically without user interference.
According to data access hotness, the traffic balancer
migrates data to balance memory traffic across memory
nodes proportional to their bandwidth. The hierarchical
scheduler improves data locality at runtime without a
priori program knowledge. Experiments on an Intel
Knights Landing server that adopts bandwidth-asymmetric
memory show that BATS reduces the execution time of
memory-bound programs up to 83.5\% compared with
traditional task-stealing schedulers.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ganser:2019:SIP,
author = "Stefan Ganser and Armin Gr{\"o}{\ss}linger and Norbert
Siegmund and Sven Apel and Christian Lengauer",
title = "Speeding up Iterative Polyhedral Schedule Optimization
with Surrogate Performance Models",
journal = j-TACO,
volume = "15",
number = "4",
pages = "56:1--56:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3291773",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Iterative program optimization is known to be able to
adapt more easily to particular programs and target
hardware than model-based approaches. An approach is to
generate random program transformations and evaluate
their profitability by applying them and benchmarking
the transformed program on the target hardware. This
procedure's large computational effort impairs its
practicality tremendously, though. To address this
limitation, we pursue the guidance of a genetic
algorithm for program optimization via feedback from
surrogate performance models. We train the models on
program transformations that were evaluated during
previous iterative optimizations. Our representation of
programs and program transformations refers to the
polyhedron model. The representation is particularly
meaningful for an optimization of loop programs that
profit a from coarse-grained parallelization for
execution on modern multicore-CPUs. Our evaluation
reveals that surrogate performance models can be used
to speed up the optimization of loop programs. We
demonstrate that we can reduce the benchmarking effort
required for an iterative optimization and degrade the
resulting speedups by an average of 15\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "56",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wu:2019:DPC,
author = "Song Wu and Fang Zhou and Xiang Gao and Hai Jin and
Jinglei Ren",
title = "Dual-Page Checkpointing: an Architectural Approach to
Efficient Data Persistence for In-Memory Applications",
journal = j-TACO,
volume = "15",
number = "4",
pages = "57:1--57:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3291057",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Data persistence is necessary for many in-memory
applications. However, the disk-based data persistence
largely slows down in-memory applications. Emerging
non-volatile memory (NVM) offers an opportunity to
achieve in-memory data persistence at the DRAM-level
performance. Nevertheless, NVM typically requires a
software library to operate NVM data, which brings
significant overhead. This article demonstrates that a
hardware-based high-frequency checkpointing mechanism
can be used to achieve efficient in-memory data
persistence on NVM. To maintain checkpoint consistency,
traditional logging and copy-on-write techniques incur
excessive NVM writes that impair both performance and
endurance of NVM; recent work attempts to solve the
issue but requires a large amount of metadata in the
memory controller. Hence, we design a new dual-page
checkpointing system, which achieves low metadata cost
and eliminates most excessive NVM writes at the same
time. It breaks the traditional trade-off between
metadata space cost and extra data writes. Our solution
outperforms the state-of-the-art NVM software libraries
by 13.6$ \times $ in throughput, and leads to 34\% less
NVM wear-out and 1.28$ \times $ higher throughput than
state-of-the-art hardware checkpointing solutions,
according to our evaluation with OLTP, graph computing,
and machine-learning workloads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "57",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kiani:2019:ECP,
author = "Mohsen Kiani and Amir Rajabzadeh",
title = "Efficient Cache Performance Modeling in {GPUs} Using
Reuse Distance Analysis",
journal = j-TACO,
volume = "15",
number = "4",
pages = "58:1--58:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3291051",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Reuse distance analysis (RDA) is a popular method for
calculating locality profiles and modeling cache
performance. The present article proposes a framework
to apply the RDA algorithm to obtain reuse distance
profiles in graphics processing unit (GPU) kernels. To
study the implications of hardware-related parameters
in RDA, two RDA algorithms were employed, including a
high-level cache-independent RDA algorithm, called
HLRDA, and a detailed RDA algorithm, called DRDA. DRDA
models the effects of reservation fails in cache blocks
and miss status holding registers to provide accurate
cache-related performance metrics. In this case, the
reuse profiles are cache-specific. In a selection of
GPU kernels, DRDA obtained the L1 miss-rate breakdowns
with an average error of 3.86\% and outperformed the
state-of-the-art RDA in terms of accuracy. In terms of
performance, DRDA is 246,000$ \times $ slower than the
real GPU executions and 11$ \times $ faster than
GPGPU-Sim. HLRDA ignores the cache-related parameters
and its obtained reuse profiles are general, which can
be used to calculate miss rates in all cache sizes.
Moreover, the average error incurred by HLRDA was
16.9\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "58",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Debrunner:2019:AAK,
author = "Thomas Debrunner and Sajad Saeedi and Paul H. J.
Kelly",
title = "{AUKE}: Automatic Kernel Code Generation for an
Analogue {SIMD} Focal-Plane Sensor-Processor Array",
journal = j-TACO,
volume = "15",
number = "4",
pages = "59:1--59:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3291055",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Focal-plane Sensor-Processor Arrays (FPSPs) are new
imaging devices with parallel Single Instruction
Multiple Data (SIMD) computational capabilities built
into every pixel. Compared to traditional imaging
devices, FPSPs allow for massive pixel-parallel
execution of image processing algorithms. This enables
the application of certain algorithms at extreme frame
rates ({$>$10},000 frames per second). By performing
some early-stage processing in-situ, systems
incorporating FPSPs can consume less power compared to
conventional approaches using standard digital cameras.
In this article, we explore code generation for an FPSP
whose 256 $ \times $ 256 processors operate on analogue
signal data, leading to further opportunities for power
reduction-and additional code synthesis challenges.
While rudimentary image processing algorithms have been
demonstrated on FPSPs before, progress with
higher-level computer vision algorithms has been sparse
due to the unique architecture and limits of the
devices. This article presents a code generator for
convolution filters for the SCAMP-5 FPSP, with
applications in many high-level tasks such as
convolutional neural networks, pose estimation, and so
on. The SCAMP-5 FPSP has no effective multiply
operator. Convolutions have to be implemented through
sequences of more primitive operations such as
additions, subtractions, and multiplications/divisions
by two. We present a code generation algorithm to
optimise convolutions by identifying common factors in
the different weights and by determining an optimised
pattern of pixel-to-pixel data movements to exploit
them. We present evaluation in terms of both speed and
energy consumption for a suite of well-known
convolution filters. Furthermore, an application of the
method is shown by the implementation of a Viola-Jones
face detection algorithm.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "59",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhou:2019:SNS,
author = "You Zhou and Fei Wu and Zhonghai Lu and Xubin He and
Ping Huang and Changsheng Xie",
title = "{SCORE}: a Novel Scheme to Efficiently Cache Overlong
{ECCs} in {NAND} Flash Memory",
journal = j-TACO,
volume = "15",
number = "4",
pages = "60:1--60:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3291052",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Technology scaling and program/erase cycling result in
an increasing bit error rate in NAND flash storage.
Some solid state drives (SSDs) adopt overlong error
correction codes (ECCs), whose redundancy size exceeds
the spare area limit of flash pages, to protect user
data for improved reliability and lifetime. However,
the read performance is significantly degraded, because
a logical data page and its ECC redundancy are stored
in two flash pages. In this article, we find that
caching ECCs has a large potential to reduce flash
reads by achieving higher hit rates, compared to
caching data. Then, we propose a novel {$<$
underline$>$ s$<$}/{underline$>$ cheme} to efficiently
{$<$ underline$>$ c$<$}/{underline$>$ ache} {$<$
underline$>$ o$<$}/{underline$>$ ve$<$ underline$>$
r$<$}/{underline$>$ long} {$<$
underline$>$E$<$}/{underline$>$CCs}, called SCORE, to
improve the SSD performance. Exceeding ECC redundancy
(called ECC residues ) of logically consecutive data
pages are grouped into ECC pages. SCORE partitions RAM
to cache both data pages and ECC pages in a
workload-adaptive manner. Finally, we verify SCORE
using extensive trace-driven simulations. The results
show that SCORE obtains high ECC hit rates without
sacrificing data hit rates, thus improving the read
performance by an average of 22\% under various
workloads, compared to the state-of-the-art schemes.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "60",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Andujar:2019:PPA,
author = "Franciso J. And{\'u}jar and Salvador Coll and Marina
Alonso and Pedro L{\'o}pez and Juan-Miguel
Mart{\'\i}nez",
title = "{POWAR}: Power-Aware Routing in {HPC} Networks with
On\slash Off Links",
journal = j-TACO,
volume = "15",
number = "4",
pages = "61:1--61:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3293445",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In order to save energy in HPC interconnection
networks, one usual proposal is to switch idle links
into a low-power mode after a certain time without any
transmission, as IEEE Energy Efficient Ethernet
standard proposes. Extending the low-power mode
mechanism, we propose POWer-Aware Routing (POWAR), a
simple power-aware routing and selection function for
fat-tree and torus networks. POWAR adapts the amount of
network links that can be used, taking into account the
network load, and obtaining great energy savings in the
network (55\%--65\%) and the entire system (9\%--10\%)
with negligible performance overhead.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "61",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mammadli:2019:AGD,
author = "Rahim Mammadli and Felix Wolf and Ali Jannesari",
title = "The Art of Getting Deep Neural Networks in Shape",
journal = j-TACO,
volume = "15",
number = "4",
pages = "62:1--62:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3291053",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Training a deep neural network (DNN) involves
selecting a set of hyperparameters that define the
network topology and influence the accuracy of the
resulting network. Often, the goal is to maximize
prediction accuracy on a given dataset. However,
non-functional requirements of the trained network ---
such as inference speed, size, and energy consumption
--- can be very important as well. In this article, we
aim to automate the process of selecting an appropriate
DNN topology that fulfills both functional and
non-functional requirements of the application.
Specifically, we focus on tuning two important
hyperparameters, depth and width, which together define
the shape of the resulting network and directly affect
its accuracy, speed, size, and energy consumption. To
reduce the time needed to search the design space, we
train a fraction of DNNs and build a model to predict
the performances of the remaining ones. We are able to
produce tuned ResNets, which are up to 4.22 times
faster than original depth-scaled ResNets on a batch of
128 images while matching their accuracy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "62",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tzilis:2019:EER,
author = "Stavros Tzilis and Pedro Trancoso and Ioannis
Sourdis",
title = "Energy-Efficient Runtime Management of Heterogeneous
Multicores using Online Projection",
journal = j-TACO,
volume = "15",
number = "4",
pages = "63:1--63:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3293446",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Heterogeneous multicores offer flexibility in the form
of different core types and Dynamic Voltage and
Frequency Scaling (DVFS), defining a vast configuration
space. The optimal configuration choice is not always
straightforward, even for single applications, and
becomes a very difficult problem for dynamically
changing scenarios of concurrent applications with
unpredictable spawn and termination times and
individual performance requirements. This article
proposes an integrated approach for runtime decision
making for energy efficiency on such systems. The
approach consists of a model that predicts performance
and power for any possible decision and low-complexity
heuristics that use this model to evaluate a subset of
possible decisions to choose the best. The model
predicts performance by projecting standalone
application profiling data to the current status of the
system and power by using a set of platform-specific
parameters that are determined only once for a given
system and are independent of the application mix. Our
approach is evaluated with a plethora of dynamic,
multi-application scenarios. When considering best
effort performance to be adequate, our runtime achieves
on average 3\% higher energy efficiency compared to the
powersave governor and 2$ \times $ better compared to
the other Linux governors. Moreover, when also
considering individual applications' performance
requirements, our runtime is able to satisfy them,
giving away 18\% of the system's energy efficiency
compared to the powersave, which, however, misses the
performance targets by 23\%; at the same time, our
runtime maintains an efficiency advantage of about 55\%
compared to the other governors, which also satisfy the
performance constraints.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "63",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lee:2019:SLS,
author = "Matthew Kay Fei Lee and Yingnan Cui and Thannirmalai
Somu and Tao Luo and Jun Zhou and Wai Teng Tang and
Weng-Fai Wong and Rick Siow Mong Goh",
title = "A System-Level Simulator for {RRAM}-Based Neuromorphic
Computing Chips",
journal = j-TACO,
volume = "15",
number = "4",
pages = "64:1--64:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3291054",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Advances in non-volatile resistive switching random
access memory (RRAM) have made it a promising memory
technology with potential applications in low-power and
embedded in-memory computing devices owing to a number
of advantages such as low-energy consumption, low area
cost and good scaling. There have been proposals to
employ RRAM in architecting chips for neuromorphic
computing and artificial neural networks where
matrix-vector multiplication can be computed in the
analog domain in a single timestep. However, it is
challenging to employ RRAM devices in neuromorphic
chips owing to the non-ideal behavior of RRAM. In this
article, we propose a cycle-accurate and scalable
system-level simulator that can be used to study the
effects of using RRAM devices in neuromorphic computing
chips. The simulator models a spatial neuromorphic chip
architecture containing many neural cores with RRAM
crossbars connected via a Network-on-Chip (NoC). We
focus on system-level simulation and demonstrate the
effectiveness of our simulator in understanding how
non-linear RRAM effects such as stuck-at-faults (SAFs),
write variability, and random telegraph noise (RTN) can
impact an application's behavior. By using our
simulator, we show that RTN and write variability can
have adverse effects on an application. Nevertheless,
we show that these effects can be mitigated through
proper design choices and the implementation of a
write-verify scheme.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "64",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Vasilakis:2019:DFC,
author = "Evangelos Vasilakis and Vassilis Papaefstathiou and
Pedro Trancoso and Ioannis Sourdis",
title = "Decoupled Fused Cache: Fusing a Decoupled {LLC} with a
{DRAM} Cache",
journal = j-TACO,
volume = "15",
number = "4",
pages = "65:1--65:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3293447",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "DRAM caches have shown excellent potential in
capturing the spatial and temporal data locality of
applications capitalizing on advances of 3D-stacking
technology; however, they are still far from their
ideal performance. Besides the unavoidable DRAM access
to fetch the requested data, tag access is in the
critical path, adding significant latency and energy
costs. Existing approaches are not able to remove these
overheads and in some cases limit DRAM cache design
options. For instance, caching DRAM cache tags adds
constant latency to every access; accessing the DRAM
cache using the TLB calls for OS support and DRAM
cachelines as large as a page; reusing the last-level
cache (LLC) tags to access the DRAM cache limits LLC
performance as it requires indexing the LLC using
higher-order address bits. In this article, we
introduce Decoupled Fused Cache, a DRAM cache design
that alleviates the cost of tag accesses by fusing DRAM
cache tags with the tags of the on-chip LLC without
affecting LLC performance. In essence, the Decoupled
Fused Cache relies in most cases on the LLC tag access
to retrieve the required information for accessing the
DRAM cache while avoiding additional overheads.
Compared to current DRAM cache designs of the same
cacheline size, Decoupled Fused Cache improves system
performance by 6\% on average and by 16\% to 18\% for
large cacheline sizes. Finally, Decoupled Fused Cache
reduces DRAM cache traffic by 18\% and DRAM cache
energy consumption by 7\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "65",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pirkelbauer:2019:BTF,
author = "Peter Pirkelbauer and Amalee Wilson and Christina
Peterson and Damian Dechev",
title = "{Blaze-Tasks}: a Framework for Computing Parallel
Reductions over Tasks",
journal = j-TACO,
volume = "15",
number = "4",
pages = "66:1--66:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3293448",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Compared to threads, tasks are a more fine-grained
alternative. The task parallel programming model offers
benefits in terms of better performance portability and
better load-balancing for problems that exhibit
nonuniform workloads. A common scenario of task
parallel programming is that a task is recursively
decomposed into smaller sub-tasks. Depending on the
problem domain, the number of created sub-tasks may be
nonuniform, thereby creating potential for significant
load imbalances in the system. Dynamic load-balancing
mechanisms will distribute the tasks across available
threads. The final result of a computation may be
modeled as a reduction over the results of all
sub-tasks. This article describes a simple, yet
effective prototype framework, Blaze-Tasks, for task
scheduling and task reductions on shared memory
architectures. The framework has been designed with
lock-free techniques and generic programming principles
in mind. Blaze-Tasks is implemented entirely in C++17
and is thus portable. To load-balance the computation,
Blaze-Tasks uses task stealing. To manage contention on
a task pool, the number of lock-free attempts to steal
a task depends on the distance between thief and pool
owner and the estimated number of tasks in a victim's
pool. This article evaluates the Blaze framework on
Intel and IBM dual-socket systems using nine benchmarks
and compares its performance with other task parallel
frameworks. While Cilk outperforms Blaze on Intel on
most benchmarks, the evaluation shows that Blaze is
competitive with OpenMP and other library-based
implementations. On IBM, the experiments show that
Blaze outperforms other approaches on most
benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "66",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sato:2019:AFS,
author = "Yukinori Sato and Tomoya Yuki and Toshio Endo",
title = "An Autotuning Framework for Scalable Execution of
Tiled Code via Iterative Polyhedral Compilation",
journal = j-TACO,
volume = "15",
number = "4",
pages = "67:1--67:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3293449",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "On modern many-core CPUs, performance tuning against
complex memory subsystems and scalability for
parallelism is mandatory to achieve their potential. In
this article, we focus on loop tiling, which plays an
important role in performance tuning, and develop a
novel framework that analytically models the load
balance and empirically autotunes unpredictable cache
behaviors through iterative polyhedral compilation
using LLVM/Polly. From an evaluation on many-core CPUs,
we demonstrate that our autotuner achieves a
performance superior to those that use conventional
static approaches and well-known autotuning heuristics.
Moreover, our autotuner achieves almost the same
performance as a brute-force search-based approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "67",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Shekofteh:2019:MSG,
author = "S.-Kazem Shekofteh and Hamid Noori and Mahmoud
Naghibzadeh and Hadi Sadoghi Yazdi and Holger
Fr{\"o}ning",
title = "Metric Selection for {GPU} Kernel Classification",
journal = j-TACO,
volume = "15",
number = "4",
pages = "68:1--68:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3295690",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Graphics Processing Units (GPUs) are vastly used for
running massively parallel programs. GPU kernels
exhibit different behavior at runtime and can usually
be classified in a simple form as either
``compute-bound'' or ``memory-bound.'' Recent GPUs are
capable of concurrently running multiple kernels, which
raises the question of how to most appropriately
schedule kernels to achieve higher performance. In
particular, co-scheduling of compute-bound and
memory-bound kernels seems promising. However, its
benefits as well as drawbacks must be determined along
with which kernels should be selected for a concurrent
execution. Classifying kernels can be performed online
by instrumentation based on performance counters. This
work conducts a thorough analysis of the metrics
collected from various benchmarks from Rodinia and CUDA
SDK. The goal is to find the minimum number of
effective metrics that enables online classification of
kernels with a low overhead. This study employs a
wrapper-based feature selection method based on the
Fisher feature selection criterion. The results of
experiments show that to classify kernels with a high
accuracy, only three and five metrics are sufficient on
a Kepler and a Pascal GPU, respectively. The proposed
method is then utilized for a runtime scheduler. The
results show an average speedup of 1.18$ \times $ and
1.1$ \times $ compared with a serial and a random
scheduler, respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "68",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bilas:2019:LDR,
author = "Angelos Bilas",
title = "List of 2018 Distinguished Reviewers {ACM TACO}",
journal = j-TACO,
volume = "15",
number = "4",
pages = "69:1--69:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3293444",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "69",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Shobaki:2019:EAC,
author = "Ghassan Shobaki and Austin Kerbow and Christopher
Pulido and William Dobson",
title = "Exploring an Alternative Cost Function for
Combinatorial Register-Pressure-Aware Instruction
Scheduling",
journal = j-TACO,
volume = "16",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3301489",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Mar 11 19:00:20 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Multiple combinatorial algorithms have been proposed
for doing pre-allocation instruction scheduling with
the objective of minimizing register pressure or
balancing register pressure and instruction-level
parallelism. The cost function that is minimized in
most of these algorithms is the peak register pressure
(or the peak excess register pressure). In this work,
we explore an alternative register-pressure cost
function, which is the Sum of Live Interval Lengths
(SLIL). Unlike the peak cost function, which captures
register pressure only at the highest pressure point in
the schedule, the proposed SLIL cost function captures
register pressure at all points in the schedule.
Minimizing register pressure at all points is desirable
in larger scheduling regions with multiple
high-pressure points. This article describes a
Branch-and-Bound (B8B) algorithm for minimizing the
SLIL cost function. The algorithm is based on two
SLIL-specific dynamic lower bounds as well as the
history utilization technique proposed in our previous
work. The proposed algorithm is implemented into the
LLVM Compiler and evaluated experimentally relative to
our previously proposed B8B algorithm for minimizing
the peak excess register pressure. The experimental
results show that the proposed algorithm for minimizing
the SLIL cost function produces substantially less
spilling than the previous algorithm that minimizes the
peak cost function. Execution-time results on various
processors show that the proposed B8B algorithm
significantly improves the performance of many CPU2006
benchmarks by up to 49\% relative to LLVM's default
scheduler. The geometric-mean improvement for FP2006 on
Intel Core i7 is 4.22\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Liu:2019:ESA,
author = "Yu-Ping Liu and Ding-Yong Hong and Jan-Jan Wu and
Sheng-Yu Fu and Wei-Chung Hsu",
title = "Exploiting {SIMD} Asymmetry in {ARM}-to-x86 Dynamic
Binary Translation",
journal = j-TACO,
volume = "16",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3301488",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Mar 11 19:00:20 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Single instruction multiple data (SIMD) has been
adopted for decades because of its superior performance
and power efficiency. The SIMD capability (i.e., width,
number of registers, and advanced instructions) has
diverged rapidly on different SIMD instruction-set
architectures (ISAs). Therefore, migrating existing
applications to another host ISA that has fewer but
longer SIMD registers and more advanced instructions
raises the issues of asymmetric SIMD capability. To
date, this issue has been overlooked and the host SIMD
capability is underutilized, resulting in suboptimal
performance. In this article, we present a novel binary
translation technique called spill-aware superword
level parallelism (saSLP), which combines short ARMv8
instructions and registers in the guest binaries to
exploit the x86 AVX2 host's parallelism, register
capacity, and gather instructions. Our experiment
results show that saSLP improves the performance by
1.6$ \times $ (2.3$ \times $) across a number of
benchmarks and reduces spilling by 97\% (99\%) for
ARMv8 to x86 AVX2 (AVX-512) translation. Furthermore,
with AVX2 (AVX-512) gather instructions, saSLP speeds
up several data-irregular applications that cannot be
vectorized on ARMv8 NEON by up to 3.9$ \times $ (4.2$
\times $).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sadrosadati:2019:IIT,
author = "Mohammad Sadrosadati and Seyed Borna Ehsani and Hajar
Falahati and Rachata Ausavarungnirun and Arash Tavakkol
and Mojtaba Abaee and Lois Orosa and Yaohua Wang and
Hamid Sarbazi-Azad and Onur Mutlu",
title = "{ITAP}: Idle-Time-Aware Power Management for {GPU}
Execution Units",
journal = j-TACO,
volume = "16",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3291606",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Mar 11 19:00:20 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Graphics Processing Units (GPUs) are widely used as
the accelerator of choice for applications with
massively data-parallel tasks. However, recent studies
show that GPUs suffer heavily from resource
underutilization, which, combined with their large
static power consumption, imposes a significant power
overhead. One of the most power-hungry components of a
GPU-the execution units-frequently experience idleness
when (1) an underutilized warp is issued to the
execution units, leading to partial lane idleness, and
(2) there is no active warp to be issued for the
execution due to warp stalls (e.g., waiting for memory
access and synchronization). Although large in total,
the idle time of execution units actually comes from
short but frequent stalls, leaving little potential for
common power saving techniques, such as power-gating.
In this article, we propose ITAP, a novel
idle-time-aware power management technique, which aims
to effectively reduce the static energy consumption of
GPU execution units. By taking advantage of different
power management techniques (i.e., power-gating and
different levels of voltage scaling), ITAP employs
three static power reduction modes with different
overheads and capabilities of static power reduction.
ITAP estimates the idle period length of execution
units using prediction and peek-ahead techniques in a
synergistic way and then applies the most appropriate
static power reduction mode based on the estimated idle
period length. We design ITAP to be power-aggressive or
performance-aggressive, not both at the same time. Our
experimental results on several workloads show that the
power-aggressive design of ITAP outperforms the
state-of-the-art solution by an average of 27.6\% in
terms of static energy savings, with less than 2.1\%
performance overhead. However, the
performance-aggressive design of ITAP improves the
static energy savings by an average of 16.9\%, while
keeping the GPU performance almost unaffected (i.e., up
to 0.4\% performance overhead) compared to the
state-of-the-art static energy savings mechanism.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Dogan:2019:ASU,
author = "Halit Dogan and Masab Ahmad and Brian Kahne and Omer
Khan",
title = "Accelerating Synchronization Using Moving Compute to
Data Model at 1,000-core Multicore Scale",
journal = j-TACO,
volume = "16",
number = "1",
pages = "4:1--4:??",
month = mar,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3300208",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Mar 11 19:00:20 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Thread synchronization using shared memory hardware
cache coherence paradigm is prevalent in multicore
processors. However, as the number of cores increase on
a chip, cache line ping-pong prevents performance
scaling for algorithms that deploy fine-grain
synchronization. This article proposes an in-hardware
moving computation to data model (MC) that pins shared
data at dedicated cores. The critical code sections are
serialized and executed at these cores in a spatial
setting to enable data locality optimizations.
In-hardware messages enable non-blocking and blocking
communication between cores, without involving the
cache coherence protocol. The in-hardware MC model is
implemented on Tilera Tile-Gx72 multicore platform to
evaluate 8- to 64-core count scale. A simulated RISC-V
multicore environment is built to further evaluate the
performance scaling advantages of the MC model at
1,024-cores scale. The evaluation using graph and
machine-learning benchmarks illustrates that atomic
instructions based synchronization scales up to 512
cores, and the MC model at the same core count
outperforms by 27\% in completion time and 39\% in
dynamic energy consumption.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Azriel:2019:MSP,
author = "Leonid Azriel and Lukas Humbel and Reto Achermann and
Alex Richardson and Moritz Hoffmann and Avi Mendelson
and Timothy Roscoe and Robert N. M. Watson and Paolo
Faraboschi and Dejan Milojicic",
title = "Memory-Side Protection With a Capability Enforcement
Co-Processor",
journal = j-TACO,
volume = "16",
number = "1",
pages = "5:1--5:??",
month = mar,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3302257",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Mar 11 19:00:20 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Byte-addressable nonvolatile memory (NVM) blends the
concepts of storage and memory and can radically
improve data-centric applications, from in-memory
databases to graph processing. By enabling
large-capacity devices to be shared across multiple
computing elements, fabric-attached NVM changes the
nature of rack-scale systems and enables short-latency
direct memory access while retaining data persistence
properties and simplifying the software stack. An
adequate protection scheme is paramount when addressing
shared and persistent memory, but mechanisms that rely
on virtual memory paging suffer from the tension
between performance (pushing toward large pages) and
protection granularity (pushing toward small pages). To
address this tension, capabilities are worth revisiting
as a more powerful protection mechanism, but the long
time needed to introduce new CPU features hampers the
adoption of schemes that rely on instruction-set
architecture support. This article proposes the
Capability Enforcement Co-Processor (CEP), a
programmable memory controller that implements
fine-grain protection through the capability model
without requiring instruction-set support in the
application CPU. CEP decouples capabilities from the
application CPU instruction-set architecture, shortens
time to adoption, and can rapidly evolve to embrace new
persistent memory technologies, from NVDIMMs to native
NVM devices, either locally connected or fabric
attached in rack-scale configurations. CEP exposes an
application interface based on memory handles that get
internally converted to extended-pointer capabilities.
This article presents a proof of concept implementation
of a distributed object store (Redis) with CEP. It also
demonstrates a capability-enhanced file system (FUSE)
implementation using CEP. Our proof of concept shows
that CEP provides fine-grain protection while enabling
direct memory access from application clients to the
NVM, and that by doing so opens up important
performance optimization opportunities (up to 4$ \times
$ reduction in latency in comparison to software-based
security enforcement) without compromising security.
Finally, we also sketch how a future hybrid model could
improve the initial implementation by delegating some
CEP functionality to a CHERI-enabled processor.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jaleel:2019:DHP,
author = "Aamer Jaleel and Eiman Ebrahimi and Sam Duncan",
title = "{DUCATI}: High-performance Address Translation by
Extending {TLB} Reach of {GPU}-accelerated Systems",
journal = j-TACO,
volume = "16",
number = "1",
pages = "6:1--6:??",
month = mar,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3309710",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Mar 11 19:00:20 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Conventional on-chip TLB hierarchies are unable to
fully cover the growing application working-set sizes.
To make things worse, Last-Level TLB (LLT) misses
require multiple accesses to the page table even with
the use of page walk caches. Consequently, LLT misses
incur long address translation latency and hurt
performance. This article proposes two low-overhead
hardware mechanisms for reducing the frequency and
penalty of on-die LLT misses. The first, Unified CAche
and TLB (UCAT), enables the conventional on-die
Last-Level Cache to store cache lines and TLB entries
in a single unified structure and increases on-die TLB
capacity significantly. The second, DRAM-TLB, memoizes
virtual to physical address translations in DRAM and
reduces LLT miss penalty when UCAT is unable to fully
cover total application working-set. DRAM-TLB serves as
the next larger level in the TLB hierarchy that
significantly increases TLB coverage relative to
on-chip TLBs. The combination of these two mechanisms,
DUCATI, is an address translation architecture that
improves GPU performance by 81\%; (up to 4.5$ \times $)
while requiring minimal changes to the existing system
design. We show that DUCATI is within 20\%, 5\%, and
2\% the performance of a perfect LLT system when using
4KB, 64KB, and 2MB pages, respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xu:2019:SSD,
author = "Yemao Xu and Dezun Dong and Weixia Xu and Xiangke
Liao",
title = "{SketchDLC}: a Sketch on Distributed Deep Learning
Communication via Trace Capturing",
journal = j-TACO,
volume = "16",
number = "2",
pages = "7:1--7:??",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3312570",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With the fast development of deep learning (DL), the
communication is increasingly a bottleneck for
distributed workloads, and a series of optimization
works have been done to scale out successfully.
Nevertheless, the network behavior has not been
investigated much yet. We intend to analyze the network
behavior and then carry out some research through
network simulation. Under this circumstance, an
accurate communication measurement is necessary, as it
is an effective way to study the network behavior and
the basis for accurate simulation. Therefore, we
propose to capture the deep learning communication
(DLC) trace to achieve the measurement. To the best of
our knowledge, we make the first attempt to capture the
communication trace for DL training. In this article,
we first provide detailed analyses about the
communication mechanism of MXNet, which is a
representative framework for distributed DL. Secondly,
we define the DLC trace format to describe and record
the communication behaviors. Third, we present the
implementation of method for trace capturing. Finally,
we make some statistics and analyses about the
distributed DL training, including communication
pattern, overlap ratio between computation and
communication, computation overhead, synchronization
overhead, update overhead, and so forth. Both the
statistics and analyses are based on the trace files
captured in a cluster with six machines. On the one
hand, our trace files provide a sketch on the DLC,
which contributes to understanding the communication
details. On the other hand, the captured trace files
can be used for figuring out various overheads, as they
record the communication behaviors of each node.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mastoras:2019:ESE,
author = "Aristeidis Mastoras and Thomas R. Gross",
title = "Efficient and Scalable Execution of Fine-Grained
Dynamic Linear Pipelines",
journal = j-TACO,
volume = "16",
number = "2",
pages = "8:1--8:??",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3307411",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We present Pipelite, a dynamic scheduler that exploits
the properties of dynamic linear pipelines to achieve
high performance for fine-grained workloads. The
flexibility of Pipelite allows the stages and their
data dependences to be determined at runtime. Pipelite
unifies communication, scheduling, and synchronization
algorithms with suitable data structures. This unified
design introduces the local suspension mechanism and a
wait-free enqueue operation, which allow efficient
dynamic scheduling. The evaluation on a 44-core
machine, using programs from three widely used
benchmark suites, shows that Pipelite implies low
overhead and significantly outperforms the state of the
art in terms of speedup, scalability, and memory
usage.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ham:2019:EDS,
author = "Tae Jun Ham and Juan L. Arag{\'o}n and Margaret
Martonosi",
title = "Efficient Data Supply for Parallel Heterogeneous
Architectures",
journal = j-TACO,
volume = "16",
number = "2",
pages = "9:1--9:??",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3310332",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Decoupling techniques have been proposed to reduce the
amount of memory latency exposed to high-performance
accelerators as they fetch data. Although decoupled
access-execute (DAE) and more recent decoupled data
supply approaches offer promising single-threaded
performance improvements, little work has considered
how to extend them into parallel scenarios. This
article explores the opportunities and challenges of
designing parallel, high-performance,
resource-efficient decoupled data supply systems. We
propose Mercury, a parallel decoupled data supply
system that utilizes thread-level parallelism for
high-throughput data supply with good portability
attributes. Additionally, we introduce some
microarchitectural improvements for data supply units
to efficiently handle long-latency indirect loads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sioutas:2019:SSH,
author = "Savvas Sioutas and Sander Stuijk and Luc Waeijen and
Twan Basten and Henk Corporaal and Lou Somers",
title = "Schedule Synthesis for {Halide} Pipelines through
Reuse Analysis",
journal = j-TACO,
volume = "16",
number = "2",
pages = "10:1--10:??",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3310248",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Efficient code generation for image processing
applications continues to pose a challenge in a domain
where high performance is often necessary to meet
real-time constraints. The inherently complex structure
found in most image-processing pipelines, the plethora
of transformations that can be applied to optimize the
performance of an implementation, as well as the
interaction of these optimizations with locality,
redundant computation and parallelism, can be
identified as the key reasons behind this issue. Recent
domain-specific languages (DSL) such as the Halide DSL
and compiler attempt to encourage high-level
design-space exploration to facilitate the optimization
process. We propose a novel optimization strategy that
aims to maximize producer-consumer locality by
exploiting reuse in image-processing pipelines. We
implement our analysis as a tool that can be used
alongside the Halide DSL to automatically generate
schedules for pipelines implemented in Halide and test
it on a variety of benchmarks. Experimental results on
three different multi-core architectures show an
average performance improvement of 40\% over the Halide
Auto-Scheduler and 75\% over a state-of-the art
approach that targets the PolyMage DSL.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2019:SSL,
author = "Xiaoyuan Wang and Haikun Liu and Xiaofei Liao and Ji
Chen and Hai Jin and Yu Zhang and Long Zheng and
Bingsheng He and Song Jiang",
title = "Supporting Superpages and Lightweight Page Migration
in Hybrid Memory Systems",
journal = j-TACO,
volume = "16",
number = "2",
pages = "11:1--11:??",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3310133",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Superpages have long been used to mitigate address
translation overhead in large-memory systems. However,
superpages often preclude lightweight page migration,
which is crucial for performance and energy efficiency
in hybrid memory systems composed of DRAM and
non-volatile memory (NVM). In this article, we propose
a novel memory management mechanism called Rainbow to
bridge this fundamental conflict between superpages and
lightweight page migration. Rainbow manages NVM at the
superpage granularity, and uses DRAM to cache
frequently accessed (hot) small pages within each
superpage. Correspondingly, Rainbow utilizes split TLBs
to support different page sizes. By introducing an
efficient hot page identification mechanism and a novel
NVM-to-DRAM address remapping mechanism, Rainbow
supports lightweight page migration without splintering
superpages. Experiment results show that Rainbow can
significantly reduce applications' TLB misses by
99.9\%, and improve application performance (in terms
of IPC) by up to $ 2.9 \times $ (45.3\% on average)
when compared to a state-of-the-art memory migration
policy without a superpage support.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sargaran:2019:SSA,
author = "Sahar Sargaran and Naser Mohammadzadeh",
title = "{SAQIP}: a Scalable Architecture for Quantum
Information Processors",
journal = j-TACO,
volume = "16",
number = "2",
pages = "12:1--12:??",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3311879",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Proposing an architecture that efficiently compensates
for the inefficiencies of physical hardware with extra
resources is one of the key issues in quantum computer
design. Although the demonstration of quantum systems
has been limited to some dozen qubits, scaling the
current small-sized lab quantum systems to large-scale
quantum systems that are capable of solving meaningful
practical problems can be the main goal of much
research. Focusing on this issue, in this article a
scalable architecture for quantum information
processors, called SAQIP, is proposed. Moreover, a flow
is presented to map and schedule a quantum circuit on
this architecture. Experimental results show that the
proposed architecture and design flow decrease the
average latency and the average area of quantum
circuits by about 81\% and 11\%, respectively, for the
attempted benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Budhkar:2019:AMD,
author = "Prerna Budhkar and Ildar Absalyamov and Vasileios Zois
and Skyler Windh and Walid A. Najjar and Vassilis J.
Tsotras",
title = "Accelerating In-Memory Database Selections Using
Latency Masking Hardware Threads",
journal = j-TACO,
volume = "16",
number = "2",
pages = "13:1--13:??",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3310229",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Inexpensive DRAMs have created new opportunities for
in-memory data analytics. However, the major bottleneck
in such systems is high memory access latency.
Traditionally, this problem is solved with large cache
hierarchies that only benefit regular applications.
Alternatively, many data-intensive applications exhibit
irregular behavior. Hardware multithreading can better
cope with high latency seen in such applications. This
article implements a multithreaded prototype (MTP) on
FPGAs for the relational selection operator that
exhibits control flow irregularity. On a standard TPC-H
query evaluation, MTP achieves a bandwidth utilization
of 83\%, while the CPU and the GPU implementations
achieve 61\% and 64\%, respectively. Besides being
bandwidth efficient, MTP is also $ 14.2 \times $ and $
4.2 \times $ more power efficient than CPU and GPU,
respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Riebler:2019:TAH,
author = "Heinrich Riebler and Gavin Vaz and Tobias Kenter and
Christian Plessl",
title = "Transparent Acceleration for Heterogeneous Platforms
With Compilation to {OpenCL}",
journal = j-TACO,
volume = "16",
number = "2",
pages = "14:1--14:??",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3319423",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Multi-accelerator platforms combine CPUs and different
accelerator architectures within a single compute node.
Such systems are capable of processing parallel
workloads very efficiently while being more energy
efficient than regular systems consisting of CPUs only.
However, the architectures of such systems are diverse,
forcing developers to port applications to each
accelerator using different programming languages,
models, tools, and compilers. Developers not only
require domain-specific knowledge but also need to
understand the low-level accelerator details, leading
to an increase in the design effort and costs. To
tackle this challenge, we propose a compilation
approach and a practical realization called HTrOP that
is completely transparent to the user. HTrOP is able to
automatically analyze a sequential CPU application,
detect computational hotspots, and generate parallel
OpenCL host and kernel code. The potential of HTrOP is
demonstrated by offloading hotspots to different
OpenCL-enabled resources (currently the CPU, the
general-purpose GPU, and the manycore Intel Xeon Phi)
for a broad set of benchmark applications. We present
an in-depth evaluation of our approach in terms of
performance gains and energy savings, taking into
account all static and dynamic overheads. We are able
to achieve speedups and energy savings of up to two
orders of magnitude, if an application has sufficient
computational intensity, when compared to a natively
compiled application.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gong:2019:HAG,
author = "Xun Gong and Xiang Gong and Leiming Yu and David
Kaeli",
title = "{HAWS}: Accelerating {GPU} Wavefront Execution through
Selective Out-of-order Execution",
journal = j-TACO,
volume = "16",
number = "2",
pages = "15:1--15:??",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3291050",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Graphics Processing Units (GPUs) have become an
attractive platform for accelerating challenging
applications on a range of platforms, from High
Performance Computing (HPC) to full-featured
smartphones. They can overcome computational barriers
in a wide range of data-parallel kernels. GPUs hide
pipeline stalls and memory latency by utilizing
efficient thread preemption. But given the demands on
the memory hierarchy due to the growth in the number of
computing cores on-chip, it has become increasingly
difficult to hide all of these stalls. In this article,
we propose a novel Hint-Assisted Wavefront Scheduler
(HAWS) to bypass long-latency stalls. HAWS starts by
enhancing a compiler infrastructure to identify
potential opportunities that can bypass memory stalls.
HAWS includes a wavefront scheduler that can continue
to execute instructions in the shadow of a memory
stall, executing instructions speculatively, guided by
compiler-generated hints. HAWS increases utilization of
GPU resources by aggressively fetching/executing
speculatively. Based on our simulation results on the
AMD Southern Islands GPU architecture, at an estimated
cost of 0.4\% total chip area, HAWS can improve
application performance by 14.6\% on average for memory
intensive applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Song:2019:SAR,
author = "Yang Song and Olivier Alavoine and Bill Lin",
title = "A Self-aware Resource Management Framework for
Heterogeneous Multicore {SoCs} with Diverse {QoS}
Targets",
journal = j-TACO,
volume = "16",
number = "2",
pages = "16:1--16:??",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3319804",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In modern heterogeneous MPSoCs, the management of
shared memory resources is crucial in delivering
end-to-end QoS. Previous frameworks have either focused
on singular QoS targets or the allocation of
partitionable resources among CPU applications at
relatively slow timescales. However, heterogeneous
MPSoCs typically require instant response from the
memory system where most resources cannot be
partitioned. Moreover, the health of different cores in
a heterogeneous MPSoC is often measured by diverse
performance objectives. In this work, we propose the
Self-Aware Resource Allocation framework for
heterogeneous MPSoCs. Priority-based adaptation allows
cores to use different target performance and
self-monitor their own intrinsic health. In response,
the system allocates non-partitionable resources based
on priorities. The proposed framework meets a diverse
range of QoS demands from heterogeneous cores.
Moreover, we present a runtime scheme to configure
priority-based adaptation so that distinct
sensitivities of heterogeneous QoS targets with respect
to memory allocation can be accommodated. In addition,
the priority of best-effort cores can also be
regulated.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yebenes:2019:CSA,
author = "Pedro Yebenes and Jose Rocher-Gonzalez and Jesus
Escudero-Sahuquillo and Pedro Javier Garcia and
Francisco J. Alfaro and Francisco J. Quiles and
Crisp{\'\i}n G{\'o}mez and Jose Duato",
title = "Combining Source-adaptive and Oblivious Routing with
Congestion Control in High-performance Interconnects
using Hybrid and Direct Topologies",
journal = j-TACO,
volume = "16",
number = "2",
pages = "17:1--17:??",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3319805",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Hybrid and direct topologies are cost-efficient and
scalable options to interconnect thousands of end nodes
in high-performance computing (HPC) systems. They offer
a rich path diversity, high bisection bandwidth, and a
reduced diameter guaranteeing low latency. In these
topologies, efficient deterministic routing algorithms
can be used to balance smartly the traffic flows among
the available routes. Unfortunately, congestion leads
these networks to saturation, where the HoL blocking
effect degrades their performance dramatically. Among
the proposed solutions to deal with HoL blocking, the
routing algorithms selecting alternative routes, such
as adaptive and oblivious, can mitigate the congestion
effects. Other techniques use queues to separate
congested flows from non-congested ones, thus reducing
the HoL blocking. In this article, we propose a new
approach that reduces HoL blocking in hybrid and direct
topologies using source-adaptive and oblivious routing.
This approach also guarantees deadlock-freedom as it
uses virtual networks to break potential cycles
generated by the routing policy in the topology.
Specifically, we propose two techniques, called
Source-Adaptive Solution for Head-of-Line Blocking
Avoidance (SASHA) and Oblivious Solution for
Head-of-Line Blocking Avoidance (OSHA). Experiment
results, carried out through simulations under
different traffic scenarios, show that SASHA and OSHA
can significantly reduce the HoL blocking.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Alshboul:2019:ECR,
author = "Mohammad Alshboul and Hussein Elnawawy and Reem
Elkhouly and Keiji Kimura and James Tuck and Yan
Solihin",
title = "Efficient Checkpointing with Recompute Scheme for
Non-volatile Main Memory",
journal = j-TACO,
volume = "16",
number = "2",
pages = "18:1--18:??",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3323091",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Future main memory will likely include Non-Volatile
Memory. Non-Volatile Main Memory (NVMM) provides an
opportunity to rethink checkpointing strategies for
providing failure safety to applications. While there
are many checkpointing and logging schemes in the
literature, their use must be revisited as they incur
high execution time overheads as well as a large number
of additional writes to NVMM, which may significantly
impact write endurance. In this article, we propose a
novel recompute-based failure safety approach and
demonstrate its applicability to loop-based code.
Rather than keeping a fully consistent logging state,
we only log enough state to enable recomputation. Upon
a failure, our approach recovers to a consistent state
by determining which parts of the computation were not
completed and recomputing them. Effectively, our
approach removes the need to keep checkpoints or logs,
thus reducing execution time overheads and improving
NVMM write endurance at the expense of more complex
recovery. We compare our new approach against logging
and checkpointing on five scientific workloads,
including tiled matrix multiplication, on a computer
system model that was built on gem5 and supports Intel
PMEM instruction extensions. For tiled matrix
multiplication, our recompute approach incurs an
execution time overhead of only 5\%, in contrast to 8\%
overhead with logging and 207\% overhead with
checkpointing. Furthermore, recompute only adds 7\%
additional NVMM writes, compared to 111\% with logging
and 330\% with checkpointing. We also conduct
experiments on real hardware, allowing us to run our
workloads to completion while varying the number of
threads used for computation. These experiments
substantiate our simulation-based observations and
provide a sensitivity study and performance comparison
between the Recompute Scheme and Naive Checkpointing.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hadjilambrou:2019:CCO,
author = "Zacharias Hadjilambrou and Marios Kleanthous and
Georgia Antoniou and Antoni Portero and Yiannakis
Sazeides",
title = "Comprehensive Characterization of an Open Source
Document Search Engine",
journal = j-TACO,
volume = "16",
number = "2",
pages = "19:1--19:??",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3320346",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/gnu.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This work performs a thorough characterization and
analysis of the open source Lucene search library. The
article describes in detail the architecture,
functionality, and micro-architectural behavior of the
search engine, and investigates prominent online
document search research issues. In particular, we
study how intra-server index partitioning affects the
response time and throughput, explore the potential use
of low power servers for document search, and examine
the sources of performance degradation ands the causes
of tail latencies. Some of our main conclusions are the
following: (a) intra-server index partitioning can
reduce tail latencies but with diminishing benefits as
incoming query traffic increases, (b) low power servers
given enough partitioning can provide same average and
tail response times as conventional high performance
servers, (c) index search is a CPU-intensive
cache-friendly application, and (d) C-states are the
main culprits for performance degradation in document
search.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2019:EGC,
author = "Bingchao Li and Jizeng Wei and Jizhou Sun and Murali
Annavaram and Nam Sung Kim",
title = "An Efficient {GPU} Cache Architecture for Applications
with Irregular Memory Access Patterns",
journal = j-TACO,
volume = "16",
number = "3",
pages = "20:1--20:??",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3322127",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "GPUs provide high-bandwidth/low-latency on-chip shared
memory and L1 cache to efficiently service a large
number of concurrent memory requests. Specifically,
concurrent memory requests accessing contiguous memory
space are coalesced into warp-wide accesses. To support
such large accesses to L1 cache with low latency, the
size of L1 cache line is no smaller than that of
warp-wide accesses. However, such L1 cache architecture
cannot always be efficiently utilized when applications
generate many memory requests with irregular access
patterns especially due to branch and memory
divergences that make requests uncoalesced and small.
Furthermore, unlike L1 cache, the shared memory of GPUs
is not often used in many applications, which
essentially depends on programmers. In this article, we
propose Elastic-Cache, which can efficiently support
both fine- and coarse-grained L1 cache line management
for applications with both regular and irregular memory
access patterns to improve the L1 cache efficiency.
Specifically, it can store 32- or 64-byte words in
non-contiguous memory space to a single 128-byte cache
line. Furthermore, it neither requires an extra memory
structure nor reduces the capacity of L1 cache for tag
storage, since it stores auxiliary tags for
fine-grained L1 cache line managements in the shared
memory space that is not fully used in many
applications. To improve the bandwidth utilization of
L1 cache with Elastic-Cache for fine-grained accesses,
we further propose Elastic-Plus to issue 32-byte memory
requests in parallel, which can reduce the processing
latency of memory instructions and improve the
throughput of GPUs. Our experiment result shows that
Elastic-Cache improves the geometric-mean performance
of applications with irregular memory access patterns
by 104\% without degrading the performance of
applications with regular memory access patterns.
Elastic-Plus outperforms Elastic-Cache and improves the
performance of applications with irregular memory
access patterns by 131\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Roberts:2019:POS,
author = "Stephen I. Roberts and Steven A. Wright and Suhaib A.
Fahmy and Stephen A. Jarvis",
title = "The Power-optimised Software Envelope",
journal = j-TACO,
volume = "16",
number = "3",
pages = "21:1--21:??",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3321551",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Advances in processor design have delivered
performance improvements for decades. As physical
limits are reached, refinements to the same basic
technologies are beginning to yield diminishing
returns. Unsustainable increases in energy consumption
are forcing hardware manufacturers to prioritise energy
efficiency in their designs. Research suggests that
software modifications may be needed to exploit the
resulting improvements in current and future hardware.
New tools are required to capitalise on this new class
of optimisation. In this article, we present the Power
Optimised Software Envelope (POSE) model, which allows
developers to assess the potential benefits of power
optimisation for their applications. The POSE model is
metric agnostic and in this article, we provide
derivations using the established Energy-Delay Product
metric and the novel Energy-Delay Sum and Energy-Delay
Distance metrics that we believe are more appropriate
for energy-aware optimisation efforts. We demonstrate
POSE on three platforms by studying the optimisation
characteristics of applications from the Mantevo
benchmark suite. Our results show that the Pathfinder
application has very little scope for power
optimisation while TeaLeaf has the most, with all other
applications in the benchmark suite falling between the
two. Finally, we extend our POSE model with a
formulation known as System Summary POSE-a
meta-heuristic that allows developers to assess the
scope a system has for energy-aware software
optimisation independent of the code being run.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kannan:2019:CIE,
author = "Ram Srivatsa Kannan and Michael Laurenzano and
Jeongseob Ahn and Jason Mars and Lingjia Tang",
title = "{Caliper}: Interference Estimator for Multi-tenant
Environments Sharing Architectural Resources",
journal = j-TACO,
volume = "16",
number = "3",
pages = "22:1--22:??",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3323090",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We introduce Caliper, a technique for accurately
estimating performance interference occurring in shared
servers. Caliper overcomes the limitations of prior
approaches by leveraging a micro-experiment-based
technique. In contrast to state-of-the-art approaches
that focus on periodically pausing co-running
applications to estimate slowdown, Caliper utilizes a
strategic phase-triggered technique to capture
interference due to co-location. This enables Caliper
to orchestrate an accurate and low-overhead
interference estimation technique that can be readily
deployed in existing production systems. We evaluate
Caliper for a broad spectrum of workload scenarios,
demonstrating its ability to seamlessly support up to
16 applications running simultaneously and outperform
the state-of-the-art approaches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "22",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lin:2019:CCC,
author = "Zhen Lin and Hongwen Dai and Michael Mantor and
Huiyang Zhou",
title = "Coordinated {CTA} Combination and Bandwidth
Partitioning for {GPU} Concurrent Kernel Execution",
journal = j-TACO,
volume = "16",
number = "3",
pages = "23:1--23:??",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3326124",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Contemporary GPUs support multiple kernels to run
concurrently on the same streaming multiprocessors
(SMs). Recent studies have demonstrated that such
concurrent kernel execution (CKE) improves both
resource utilization and computational throughput. Most
of the prior works focus on partitioning the GPU
resources at the cooperative thread array (CTA) level
or the warp scheduler level to improve CKE. However,
significant performance slowdown and unfairness are
observed when latency-sensitive kernels co-run with
bandwidth-intensive ones. The reason is that bandwidth
over-subscription from bandwidth-intensive kernels
leads to much aggravated memory access latency, which
is highly detrimental to latency-sensitive kernels.
Even among bandwidth-intensive kernels, more intensive
kernels may unfairly consume much higher bandwidth than
less-intensive ones. In this article, we first make a
case that such problems cannot be sufficiently solved
by managing CTA combinations alone and reveal the
fundamental reasons. Then, we propose a coordinated
approach for CTA combination and bandwidth
partitioning. Our approach dynamically detects
co-running kernels as latency sensitive or bandwidth
intensive. As both the DRAM bandwidth and L2-to-L1
Network-on-Chip (NoC) bandwidth can be the critical
resource, our approach partitions both bandwidth
resources coordinately along with selecting proper CTA
combinations. The key objective is to allocate more CTA
resources for latency-sensitive kernels and more
NoC/DRAM bandwidth resources to NoC-/DRAM-intensive
kernels. We achieve it using a variation of dominant
resource fairness (DRF). Compared with two
state-of-the-art CKE optimization schemes, SMK [52] and
WS [55], our approach improves the average harmonic
speedup by 78\% and 39\%, respectively. Even compared
to the best possible CTA combinations, which are
obtained from an exhaustive search among all possible
CTA combinations, our approach improves the harmonic
speedup by up to 51\% and 11\% on average.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Didier:2019:CCP,
author = "Keryan Didier and Dumitru Potop-Butucaru and Guillaume
Iooss and Albert Cohen and Jean Souyris and Philippe
Baufreton and Amaury Graillat",
title = "Correct-by-Construction Parallelization of Hard
Real-Time Avionics Applications on Off-the-Shelf
Predictable Hardware",
journal = j-TACO,
volume = "16",
number = "3",
pages = "24:1--24:??",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3328799",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We present the first end-to-end modeling and
compilation flow to parallelize hard real-time control
applications while fully guaranteeing the respect of
real-time requirements on off-the-shelf hardware. It
scales to thousands of dataflow nodes and has been
validated on two production avionics applications.
Unlike classical optimizing compilation, it takes as
input non-functional requirements (real time, resource
limits). To enforce these requirements, the compiler
follows a static resource allocation strategy, from
coarse-grain tasks communicating over an
interconnection network all the way to individual
variables and memory accesses. It controls timing
interferences resulting from mapping decisions in a
precise, safe, and scalable way.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zardoshti:2019:STM,
author = "Pantea Zardoshti and Tingzhe Zhou and Pavithra Balaji
and Michael L. Scott and Michael Spear",
title = "Simplifying Transactional Memory Support in {C++}",
journal = j-TACO,
volume = "16",
number = "3",
pages = "25:1--25:??",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3328796",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "C++ has supported a provisional version of
Transactional Memory (TM) since 2015, via a technical
specification. However, TM has not seen widespread
adoption, and compiler vendors have been slow to
implement the technical specification. We conjecture
that the proposed TM support is too difficult for
programmers to use, too complex for compiler designers
to implement and verify, and not industry-proven enough
to justify final standardization in its current form.
To address these problems, we present a different
design for supporting TM in C++. By forbidding explicit
self-abort, and by introducing an executor-based
mechanism for running transactions, our approach makes
it easier for developers to get code up and running
with TM. Our proposal should also be appealing to
compiler developers, as it allows a spectrum of levels
of support for TM, with varying performance, and
varying reliance on hardware TM support in order to
provide scalability. \<?tight?\>While our design
does not enable some of the optimizations admitted by
the current technical specification, we show that it
enables the implementation of robust support for TM in
a small, orthogonal compiler extension. Our
implementation is able to handle a wide range of
transactional programs, delivering low instrumentation
overhead and scalability and performance on par with
the current state of the art. Based on this experience,
we believe our approach to be a viable means of
reinvigorating the standardization of TM in C++.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "25",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Park:2019:MCM,
author = "Jungwoo Park and Myoungjun Lee and Soontae Kim and
Minho Ju and Jeongkyu Hong",
title = "{MH} Cache: a Multi-retention {STT-RAM}-based
Low-power Last-level Cache for Mobile Hardware
Rendering Systems",
journal = j-TACO,
volume = "16",
number = "3",
pages = "26:1--26:??",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3328520",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Mobile devices have become the most important devices
in our life. However, they are limited in battery
capacity. Therefore, low-power computing is crucial for
their long lifetime. A spin-transfer torque RAM
(STT-RAM) has become emerging memory technology because
of its low leakage power consumption. We herein propose
MH cache, a multi-retention STT-RAM-based cache
management scheme for last-level caches (LLC) to reduce
their power consumption for mobile hardware rendering
systems. We analyzed the memory access patterns of
processes and observed how rendering methods affect
process behaviors. We propose a cache management scheme
that measures write-intensity of each process
dynamically and exploits it to manage a power-efficient
multi-retention STT-RAM-based cache. Our proposed
scheme uses variable threshold for a process'
write-intensity to determine cache line placement. We
explain how to deal with the following issue to
implement our proposed scheme. Our experimental results
show that our techniques significantly reduce the LLC
power consumption by 32\% and 32.2\% in single- and
quad-core systems, respectively, compared to a full
STT-RAM LLC.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "26",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Leben:2019:PCM,
author = "Jakob Leben and George Tzanetakis",
title = "Polyhedral Compilation for Multi-dimensional Stream
Processing",
journal = j-TACO,
volume = "16",
number = "3",
pages = "27:1--27:??",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3330999",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We present a method for compilation of
multi-dimensional stream processing programs from
affine recurrence equations with unbounded domains into
imperative code with statically allocated memory. The
method involves a novel polyhedral schedule
transformation called periodic tiling. It accommodates
existing polyhedral optimizations to improve memory
access patterns and expose parallelism. This enables
efficient execution of programming languages with
unbounded recurrence equations, as well as optimization
of existing languages from which this form can be
derived. The method is experimentally evaluated on 5
DSP algorithms with large problem sizes. Results show
potential for improved throughput compared to
hand-optimized C++ (speedups on a 6-core Intel Xeon CPU
up to $ 10 \times $ with a geometric mean $ 3.3 \times
$).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sadeghi:2019:TCN,
author = "Mohammad Sadegh Sadeghi and Siavash Bayat Sarmadi and
Shaahin Hessabi",
title = "Toward On-chip Network Security Using Runtime
Isolation Mapping",
journal = j-TACO,
volume = "16",
number = "3",
pages = "28:1--28:??",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3337770",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Many-cores execute a large number of diverse
applications concurrently. Inter-application
interference can lead to a security threat as timing
channel attack in the on-chip network. A
non-interference communication in the shared on-chip
network is a dominant necessity for secure many-core
platforms to leverage the concepts of the cloud and
embedded system-on-chip. The current non-interference
techniques are limited to static scheduling and need
router modification at micro-architecture level.
Mapping of applications can effectively determine the
interference among applications in on-chip network. In
this work, we explore non-interference approaches
through run-time mapping at software and application
level. We map the same group of applications in
isolated domain(s) to meet non-interference flows.
Through run-time mapping, we can maximize utilization
of the system without leaking information. The proposed
run-time mapping policy requires no router modification
in contrast to the best known competing schemes, and
the performance degradation is, on average, 16\%
compared to the state-of-the-art baselines.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "28",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Louise:2019:FST,
author = "Stephane Louise",
title = "A First Step Toward Using Quantum Computing for
Low-level {WCETs} Estimations",
journal = j-TACO,
volume = "16",
number = "3",
pages = "29:1--29:??",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3335549",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Low-Level analysis of Worst Case Execution Time (WCET)
is an important field for real-time system validation.
It stands between computer architecture and
mathematics, as it relies strongly on variants of
abstract interpretation. One of the features that
causes the largest uncertainty regarding WCET
evaluation for low-level analysis of sequential
execution on a single processor is taking Cache
Memory-related Delays (CMRD) and Cache-related
Preemption Delays (CRPD) correctly into account.
Research work from the 1990s provides a good basic
framework for this problem as long as a task runs
without preemption. But when preemption of tasks is
allowed, although several formalisms exist, their
predictive power is lower and the usual approach relies
on analyses of NP-hard problems. In this article, we
want to show some potential advantages of using a
formalism inspired by Quantum Computing (QC) to
evaluate CMRDs with preemptions while avoiding the
NP-hard problem underneath. The experimental results,
with a classic (non-quantum) numerical approach, on a
selection of Malardalen benchmark programs display very
good accuracy, while the complexity of the evaluation
is a low-order polynomial of the number of memory
accesses. While it is not yet a fully parallel quantum
algorithm, we provide a first roadmap on how to reach
such an objective.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "29",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chikin:2019:MAA,
author = "Artem Chikin and Taylor Lloyd and Jos{\'e} Nelson
Amaral and Ettore Tiotto and Muhammad Usman",
title = "Memory-access-aware Safety and Profitability Analysis
for Transformation of Accelerator-bound {OpenMP}
Loops",
journal = j-TACO,
volume = "16",
number = "3",
pages = "30:1--30:??",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3333060",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Iteration Point Difference Analysis is a new static
analysis framework that can be used to determine the
memory coalescing characteristics of parallel loops
that target GPU offloading and to ascertain safety and
profitability of loop transformations with the goal of
improving their memory access characteristics. This
analysis can propagate definitions through control
flow, works for non-affine expressions, and is capable
of analyzing expressions that reference conditionally
defined values. This analysis framework enables safe
and profitable loop transformations. Experimental
results demonstrate potential for dramatic performance
improvements. GPU kernel execution time across the
Polybench suite is improved by up to $ 25.5 \times $ on
an Nvidia P100 with benchmark overall improvement of up
to $ 3.2 \times $. An opportunity detected in a SPEC
ACCEL benchmark yields kernel speedup of $ 86.5 \times
$ with a benchmark improvement of $ 3.3 \times $. This
work also demonstrates how architecture-aware compilers
improve code portability and reduce programmer
effort.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "30",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cha:2019:MDC,
author = "Sanghoon Cha and Bokyeong Kim and Chang Hyun Park and
Jaehyuk Huh",
title = "Morphable {DRAM} Cache Design for Hybrid Memory
Systems",
journal = j-TACO,
volume = "16",
number = "3",
pages = "31:1--31:??",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3338505",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "DRAM caches have emerged as an efficient new layer in
the memory hierarchy to address the increasing
diversity of memory components. When a small amount of
fast memory is combined with slow but large memory, the
cache-based organization of the fast memory can provide
a SW-transparent solution for the hybrid memory
systems. In such DRAM cache designs, their
effectiveness is affected by the bandwidth and latency
of both fast and slow memory. To quantitatively assess
the effect of memory configurations and application
patterns on the DRAM cache designs, this article first
investigates how three prior approaches perform with
six hybrid memory scenarios. From the investigation, we
observe no single DRAM cache organization always
outperforms the other organizations across the diverse
hybrid memory configurations and memory access
patterns. Based on this observation, this article
proposes a reconfigurable DRAM cache design that can
adapt to different hybrid memory combinations and
workload patterns. Unlike the fixed tag and data arrays
of conventional on-chip SRAM caches, this study
advocates to exploit the flexibility of DRAM caches,
which can store tags and data to DRAM in any arbitrary
way. Using a sample-based mechanism, the proposed DRAM
cache controller dynamically finds the best
organization from three candidates and applies the best
one by reconfiguring the tags and data layout in the
DRAM cache. Our evaluation shows that the proposed
morphable DRAM cache can outperform the fixed DRAM
configurations across six hybrid memory
configurations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "31",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Luo:2019:SCT,
author = "Chao Luo and Yunsi Fei and David Kaeli",
title = "Side-channel Timing Attack of {RSA} on a {GPU}",
journal = j-TACO,
volume = "16",
number = "3",
pages = "32:1--32:??",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3341729",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 12 15:27:40 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3341729",
abstract = "To increase computation throughput, general purpose
Graphics Processing Units (GPUs) have been leveraged to
accelerate computationally intensive workloads. GPUs
have been used as cryptographic engines, improving
encryption/decryption throughput and leveraging the
GPU's Single Instruction Multiple Thread (SIMT) model.
RSA is a widely used public-key cipher and has been
ported onto GPUs for signing and decrypting large
files. Although performance has been significantly
improved, the security of RSA on GPUs is vulnerable to
side-channel timing attacks and is an exposure
overlooked in previous studies. GPUs tend to be
naturally resilient to side-channel attacks, given that
they execute a large number of concurrent threads,
performing many RSA operations on different data in
parallel. Given the degree of parallel execution on a
GPU, there will be a significant amount of noise
introduced into the timing channel given the thousands
of concurrent threads executing concurrently. In this
work, we build a timing model to capture the parallel
characteristics of an RSA public-key cipher implemented
on a GPU. We consider optimizations that include using
Montgomery multiplication and sliding-window
exponentiation to implement cryptographic operations.
Our timing model considers the challenges of parallel
execution, complications that do not occur in
single-threaded computing platforms. Based on our
timing model, we launch successful timing attacks on
RSA running on a GPU, extracting the private key of
RSA. We also present an effective error detection and
correction mechanism. Our results demonstrate that GPU
acceleration of RSA is vulnerable to side-channel
timing attacks. We propose several countermeasures to
defend against this class of attacks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "32",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yuan:2019:RTL,
author = "Liang Yuan and Chen Ding and Wesley Smith and Peter
Denning and Yunquan Zhang",
title = "A Relational Theory of Locality",
journal = j-TACO,
volume = "16",
number = "3",
pages = "33:1--33:??",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3341109",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 12 15:27:40 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3341109",
abstract = "In many areas of program and system analysis and
optimization, locality is a common concept and has been
defined and measured in many ways. This article aims to
formally establish relations between these previously
disparate types of locality. It categorizes locality
definitions in three groups and shows whether and how
they can be interconverted. For the footprint, a recent
metric, it gives a new measurement algorithm that is
asymptotically more time/space efficient than previous
approaches. Using the conversion relations, the new
algorithm derives with the same efficiency different
locality metrics developed and used in program
analysis, memory management, and cache design.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "33",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Thangamani:2020:ORC,
author = "Arun Thangamani and V. Krishna Nandivada",
title = "Optimizing Remote Communication in {X10}",
journal = j-TACO,
volume = "16",
number = "4",
pages = "34:1--34:26",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3345558",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 12 15:31:26 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "X10 is a partitioned global address space programming
language that supports the notion of places; a place
consists of some data and some lightweight tasks called
activities. Each activity runs at a place and may
invoke a place-change operation (using the
at-construct) to synchronously perform some computation
at another place. These place-change operations can be
very expensive, as they need to copy all the required
data from the current place to the remote place.
However, identifying the necessary number of
place-change operations and the required data during
each place-change operation are non-trivial tasks,
especially in the context of irregular applications
(like graph applications) that contain complex code
with large amounts of cross-referencing objects-not all
of those objects may be actually required, at the
remote place. In this article, we present AT-Com, a
scheme to optimize X10 code with place-change
operations. AT-Com consists of two inter-related new
optimizations: (i) AT-Opt, which minimizes the amount
of data serialized and communicated during place-change
operations, and (ii) AT-Pruning, which
identifies/elides redundant place-change operations and
does parallel execution of place-change operations.
AT-Opt uses a novel abstraction, called
abstract-place-tree, to capture place-change operations
in the program. For each place-change operation, AT-Opt
uses a novel inter-procedural analysis to precisely
identify the data required at the remote place in terms
of the variables in the current scope. AT-Opt then
emits the appropriate code to copy the identified
data-items to the remote place. AT-Pruning introduces a
set of program transformation techniques to emit
optimized code such that it avoids the redundant
place-change operations. We have implemented AT-Com in
the x10v2.6.0 compiler and tested it over the IMSuite
benchmark kernels. Compared to the current X10
compiler, the AT-Com optimized code achieved a
geometric mean speedup of 18.72$ \times $ and 17.83$
\times $ on a four-node (32 cores per node) Intel and
two-node (16 cores per node) AMD system,
respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "34",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Srikanth:2020:MAS,
author = "Sriseshan Srikanth and Anirudh Jain and Joseph M.
Lennon and Thomas M. Conte and Erik Debenedictis and
Jeanine Cook",
title = "{MetaStrider}: Architectures for Scalable
Memory-centric Reduction of Sparse Data Streams",
journal = j-TACO,
volume = "16",
number = "4",
pages = "35:1--35:26",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3355396",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 12 15:31:26 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Reduction is an operation performed on the values of
two or more key-value pairs that share the same key.
Reduction of sparse data streams finds application in a
wide variety of domains such as data and graph
analytics, cybersecurity, machine learning, and HPC
applications. However, these applications exhibit low
locality of reference, rendering traditional
architectures and data representations inefficient.
This article presents MetaStrider, a significant
algorithmic and architectural enhancement to the
state-of-the-art, SuperStrider. Furthermore, these
enhancements enable a variety of parallel,
memory-centric architectures that we propose, resulting
in demonstrated performance that scales near-linearly
with available memory-level parallelism.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "35",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Koraei:2020:DSS,
author = "Mostafa Koraei and Omid Fatemi and Magnus Jahre",
title = "{DCMI}: a Scalable Strategy for Accelerating Iterative
Stencil Loops on {FPGAs}",
journal = j-TACO,
volume = "16",
number = "4",
pages = "36:1--36:24",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3352813",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 12 15:31:26 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Iterative Stencil Loops (ISLs) are the key kernel
within a range of compute-intensive applications. To
accelerate ISLs with Field Programmable Gate Arrays, it
is critical to exploit parallelism (1) among elements
within the same iteration and (2) across loop
iterations. We propose a novel ISL acceleration scheme
called Direct Computation of Multiple Iterations (DCMI)
that improves upon prior work by pre-computing the
effective stencil coefficients after a number of
iterations at design time-resulting in accelerators
that use minimal on-chip memory and avoid redundant
computation. This enables DCMI to improve throughput by
up to 7.7$ \times $ compared to the state-of-the-art
cone-based architecture.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "36",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Peled:2020:NNP,
author = "Leeor Peled and Uri Weiser and Yoav Etsion",
title = "A Neural Network Prefetcher for Arbitrary Memory
Access Patterns",
journal = j-TACO,
volume = "16",
number = "4",
pages = "37:1--37:27",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3345000",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 11 07:11:45 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3345000",
abstract = "Memory prefetchers are designed to identify and
prefetch specific access patterns, including
spatiotemporal locality (e.g., strides, streams),
recurring patterns (e.g., varying strides, temporal
correlation), and specific irregular patterns (e.g.,
\ldots{})",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "37",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Vasilache:2020:NAL,
author = "Nicolas Vasilache and Oleksandr Zinenko and Theodoros
Theodoridis and Priya Goyal and Zachary Devito and
William S. Moses and Sven Verdoolaege and Andrew Adams
and Albert Cohen",
title = "The Next 700 Accelerated Layers: From Mathematical
Expressions of Network Computation Graphs to
Accelerated {GPU} Kernels, Automatically",
journal = j-TACO,
volume = "16",
number = "4",
pages = "38:1--38:26",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3355606",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 12 15:31:26 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Deep learning frameworks automate the deployment,
distribution, synchronization, memory allocation, and
hardware acceleration of models represented as graphs
of computational operators. These operators wrap
high-performance libraries such as cuDNN or NNPACK.
When the computation does not match any predefined
library call, custom operators must be implemented,
often at high engineering cost and performance penalty,
limiting the pace of innovation. To address this
productivity gap, we propose and evaluate: (1) a
domain-specific language with a tensor notation close
to the mathematics of deep learning; (2) a Just-In-Time
optimizing compiler based on the polyhedral framework;
(3) carefully coordinated linear optimization and
evolutionary algorithms to synthesize high-performance
CUDA kernels; (4) the transparent integration of our
flow into PyTorch and Caffe2, providing the fully
automatic synthesis of high-performance GPU kernels
from simple tensor algebra. The performance is
comparable to, and often exceeds the performance of,
highly tuned libraries.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "38",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jiang:2020:LLA,
author = "Wenbin Jiang and Yang Ma and Bo Liu and Haikun Liu and
Bing Bing Zhou and Jian Zhu and Song Wu and Hai Jin",
title = "{Layup}: Layer-adaptive and Multi-type
Intermediate-oriented Memory Optimization for
{GPU}-based {CNNs}",
journal = j-TACO,
volume = "16",
number = "4",
pages = "39:1--39:23",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3357238",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 12 15:31:26 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Although GPUs have emerged as the mainstream for the
acceleration of convolutional neural network (CNN)
training processes, they usually have limited physical
memory, meaning that it is hard to train large-scale
CNN models. Many methods for memory optimization have
been proposed to decrease the memory consumption of
CNNs and to mitigate the increasing scale of these
networks; however, this optimization comes at the cost
of an obvious drop in time performance. We propose a
new memory optimization strategy named Layup that
realizes both better memory efficiency and better time
performance. First, a fast layer-type-specific method
for memory optimization is presented, based on the new
finding that a single memory optimization often shows
dramatic differences in time performance for different
types of layers. Second, a new memory reuse method is
presented in which greater attention is paid to
multi-type intermediate data such as convolutional
workspaces and cuDNN handle data. Experiments show that
Layup can significantly increase the scale of
extra-deep network models on a single GPU with lower
performance loss. It even can train ResNet with 2,504
layers using 12GB memory, outperforming the
state-of-the-art work of SuperNeurons with 1,920 layers
(batch size = 16).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Siso:2020:EAV,
author = "Sergi Siso and Wes Armour and Jeyarajan
Thiyagalingam",
title = "Evaluating Auto-Vectorizing Compilers through
Objective Withdrawal of Useful Information",
journal = j-TACO,
volume = "16",
number = "4",
pages = "40:1--40:23",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3356842",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 12 15:31:26 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The need for compilers to generate highly vectorized
code is at an all-time high with the increasing
vectorization capabilities of modern processors. To
this end, the information that compilers have at their
disposal, either through code analysis or via user
annotations, is instrumental for auto-vectorization,
and hence for the overall performance. However, the
information that is available to compilers at compile
time and its accuracy varies greatly, as does the
resulting performance of vectorizing compilers.
Benchmarks like the Test Suite for Vectorizing
Compilers (TSVC) have been developed to evaluate the
vectorization capability of such compilers. The
overarching approach of TSVC and similar benchmarks is
to evaluate the compilers under the best possible
scenario (i.e., assuming that compilers have access to
all useful contextual information at compile time).
Although this idealistic view is useful to observe the
capability of compilers for auto-vectorization, it is
not a true reflection of the conditions found in
real-world applications. In this article, we propose a
novel method for evaluating the auto-vectorization
capability of compilers. Instead of assuming that
compilers have access to a wealth of information at
compile time, we formulate a method to objectively
supply or withdraw information that would otherwise aid
the compiler in the auto-vectorization process. This
method is orthogonal to the approach adopted by TSVC,
and as such, it provides the means of assessing the
capabilities of modern vectorizing compilers in a more
detailed way. Using this new method, we exhaustively
evaluated five industry-grade compilers (GNU, Intel,
Clang, PGI, and IBM) on four representative vector
platforms (AVX-2, AVX-512 (Skylake), AVX-512 (KNL), and
AltiVec) using the modified version of TSVC and
application-level proxy kernels. The results show the
impact that withdrawing information has on the
vectorization capabilities of each compiler and also
prove the validity of the presented technique.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "40",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Resch:2020:PBN,
author = "Salonik Resch and S. Karen Khatamifard and Zamshed
Iqbal Chowdhury and Masoud Zabihi and Zhengyang Zhao
and Jian-Ping Wang and Sachin S. Sapatnekar and Ulya R.
Karpuzcu",
title = "{PIMBALL}: Binary Neural Networks in Spintronic
Memory",
journal = j-TACO,
volume = "16",
number = "4",
pages = "41:1--41:26",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3357250",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 12 15:31:26 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Neural networks span a wide range of applications of
industrial and commercial significance. Binary neural
networks (BNN) are particularly effective in trading
accuracy for performance, energy efficiency, or
hardware/software complexity. Here, we introduce a
spintronic, re-configurable in-memory BNN accelerator,
PIMBALL: Processing In Memory BNN AccL(L)erator, which
allows for massively parallel and energy efficient
computation. PIMBALL is capable of being used as a
standard spintronic memory (STT-MRAM) array and a
computational substrate simultaneously. We evaluate
PIMBALL using multiple image classifiers and a genomics
kernel. Our simulation results show that PIMBALL is
more energy efficient than alternative CPU-, GPU-, and
FPGA-based implementations while delivering higher
throughput.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "41",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jiang:2020:EBC,
author = "Zhen Hang Jiang and Yunsi Fei and David Kaeli",
title = "Exploiting Bank Conflict-based Side-channel Timing
Leakage of {GPUs}",
journal = j-TACO,
volume = "16",
number = "4",
pages = "42:1--42:24",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3361870",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 11 07:11:45 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3361870",
abstract = "To prevent information leakage during program
execution, modern software cryptographic
implementations target constant-time function, where
the number of instructions executed remains the same
when program inputs change. However, the underlying
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "42",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Daruwalla:2020:BVC,
author = "Kyle Daruwalla and Heng Zhuo and Rohit Shukla and
Mikko Lipasti",
title = "{BitSAD v2}: Compiler Optimization and Analysis for
Bitstream Computing",
journal = j-TACO,
volume = "16",
number = "4",
pages = "43:1--43:25",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3364999",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 11 07:11:45 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3364999",
abstract = "Computer vision and machine learning algorithms
operating under a strict power budget require an
alternate computing paradigm. While bitstream computing
(BC) satisfies these constraints, creating BC systems
is difficult. To address the design challenges,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "43",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mastoras:2020:CDL,
author = "Aristeidis Mastoras and Thomas R. Gross",
title = "Chunking for Dynamic Linear Pipelines",
journal = j-TACO,
volume = "16",
number = "4",
pages = "44:1--44:25",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3363815",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 11 07:11:45 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3363815",
abstract = "Dynamic scheduling and dynamic creation of the
pipeline structure are crucial for efficient execution
of pipelined programs. Nevertheless, dynamic systems
imply higher overhead than static systems. Therefore,
chunking is the key to decrease the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "44",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Selva:2020:BPR,
author = "Manuel Selva and Fabian Gruber and Diogo Sampaio and
Christophe Guillon and Louis-No{\"e}l Pouchet and
Fabrice Rastello",
title = "Building a Polyhedral Representation from an
Instrumented Execution: Making Dynamic Analyses of
Nonaffine Programs Scalable",
journal = j-TACO,
volume = "16",
number = "4",
pages = "45:1--45:26",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3363785",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 11 07:11:45 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3363785",
abstract = "The polyhedral model has been successfully used in
production compilers. Nevertheless, only a very
restricted class of applications can benefit from it.
Recent proposals investigated how runtime information
could be used to apply polyhedral optimization
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "45",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yasin:2020:MGM,
author = "Ahmad Yasin and Jawad Haj-Yahya and Yosi Ben-Asher and
Avi Mendelson",
title = "A Metric-Guided Method for Discovering Impactful
Features and Architectural Insights for {Skylake}-Based
Processors",
journal = j-TACO,
volume = "16",
number = "4",
pages = "46:1--46:25",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3369383",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 11 07:11:45 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3369383",
abstract = "The slowdown in technology scaling puts architectural
features at the forefront of the innovation in modern
processors. This article presents a Metric-Guided
Method (MGM) that extends Top-Down analysis with
carefully selected, dynamically adapted \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "46",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhao:2020:FTF,
author = "Jie Zhao and Albert Cohen",
title = "Flextended Tiles: a Flexible Extension of Overlapped
Tiles for Polyhedral Compilation",
journal = j-TACO,
volume = "16",
number = "4",
pages = "47:1--47:25",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3369382",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 11 07:11:45 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3369382",
abstract = "Loop tiling to exploit data locality and parallelism
plays an essential role in a variety of general-purpose
and domain-specific compilers. Affine transformations
in polyhedral frameworks implement classical forms of
rectangular and parallelogram tiling, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "47",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gerzhoy:2020:NMS,
author = "Daniel Gerzhoy and Xiaowu Sun and Michael Zuzak and
Donald Yeung",
title = "Nested {MIMD--SIMD} Parallelization for Heterogeneous
Microprocessors",
journal = j-TACO,
volume = "16",
number = "4",
pages = "48:1--48:27",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3368304",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 11 07:11:45 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3368304",
abstract = "Heterogeneous microprocessors integrate a CPU and GPU
on the same chip, providing fast CPU-GPU communication
and enabling cores to compute on data {``in place.''}
This permits exploiting a finer granularity of
parallelism on the integrated GPUs, and enables
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "48",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xia:2020:DAB,
author = "Chunwei Xia and Jiacheng Zhao and Huimin Cui and
Xiaobing Feng and Jingling Xue",
title = "{DNNTune}: Automatic Benchmarking {DNN} Models for
Mobile-cloud Computing",
journal = j-TACO,
volume = "16",
number = "4",
pages = "49:1--49:26",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3368305",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 11 07:11:45 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/super.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3368305",
abstract = "Deep Neural Networks (DNNs) are now increasingly
adopted in a variety of Artificial Intelligence (AI)
applications. Meantime, more and more DNNs are moving
from cloud to the mobile devices, as emerging AI chips
are integrated into mobiles. Therefore, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "49",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Briggs:2020:FRT,
author = "Ian Briggs and Arnab Das and Mark Baranowski and
Vishal Sharma and Sriram Krishnamoorthy and Zvonimir
Rakamari{\'c} and Ganesh Gopalakrishnan",
title = "{FailAmp}: Relativization Transformation for Soft
Error Detection in Structured Address Generation",
journal = j-TACO,
volume = "16",
number = "4",
pages = "50:1--50:21",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3369381",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 11 07:11:45 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3369381",
abstract = "We present FailAmp, a novel LLVM program
transformation algorithm that makes programs employing
structured index calculations more robust against soft
errors. Without FailAmp, an offset error can go
undetected; with FailAmp, all subsequent offsets are
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "50",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ahmad:2020:DDM,
author = "Khalid Ahmad and Hari Sundar and Mary Hall",
title = "Data-driven Mixed Precision Sparse Matrix Vector
Multiplication for {GPUs}",
journal = j-TACO,
volume = "16",
number = "4",
pages = "51:1--51:24",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3371275",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 11 07:11:45 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3371275",
abstract = "We optimize Sparse Matrix Vector multiplication (SpMV)
using a mixed precision strategy (MpSpMV) for Nvidia
V100 GPUs. The approach has three benefits: (1) It
reduces computation time, (2) it reduces the size of
the input matrix and therefore reduces data movement,
and (3) it provides an opportunity for increased
parallelism. MpSpMV's decision to lower to single
precision is data driven, based on individual nonzero
values of the sparse matrix. On all real-valued
matrices from the Sparse Matrix Collection, we obtain a
maximum speedup of $ 2.61 \times $ and average speedup
of $ 1.06 \times $ over double precision, while
maintaining higher accuracy compared to single
precision.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "51",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Stoltzfus:2020:TOS,
author = "Larisa Stoltzfus and Bastian Hagedorn and Michel
Steuwer and Sergei Gorlatch and Christophe Dubach",
title = "Tiling Optimizations for Stencil Computations Using
Rewrite Rules in Lift",
journal = j-TACO,
volume = "16",
number = "4",
pages = "52:1--52:25",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3368858",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 11 07:11:45 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3368858",
abstract = "Stencil computations are a widely used type of
algorithm, found in applications from physical
simulations to machine learning. Stencils are
embarrassingly parallel, therefore fit on modern
hardware such as Graphic Processing Units perfectly.
Although \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{vanderVlag:2020:ECB,
author = "Michiel A. van der Vlag and Georgios Smaragdos and
Zaid Al-Ars and Christos Strydis",
title = "Exploring Complex Brain-Simulation Workloads on
Multi-{GPU} Deployments",
journal = j-TACO,
volume = "16",
number = "4",
pages = "53:1--53:25",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3371235",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 11 07:11:45 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3371235",
abstract = "In-silico brain simulations are the de-facto tools
computational neuroscientists use to understand
large-scale and complex brain-function dynamics.
Current brain simulators do not scale efficiently
enough to large-scale problem sizes (e.g., $ > 100, 000
$ \ldots{})",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "53",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Elkhouly:2020:CSC,
author = "Reem Elkhouly and Mohammad Alshboul and Akihiro
Hayashi and Yan Solihin and Keiji Kimura",
title = "Compiler-support for Critical Data Persistence in
{NVM}",
journal = j-TACO,
volume = "16",
number = "4",
pages = "54:1--54:25",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3371236",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 11 07:11:45 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3371236",
abstract = "Non-volatile Main Memories (NVMs) offer a promising
way to preserve data persistence and enable computation
recovery in case of failure. While the use of NVMs can
significantly reduce the overhead of failure recovery,
which is the case with High-\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "54",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chelini:2020:DLT,
author = "Lorenzo Chelini and Oleksandr Zinenko and Tobias
Grosser and Henk Corporaal",
title = "Declarative Loop Tactics for Domain-specific
Optimization",
journal = j-TACO,
volume = "16",
number = "4",
pages = "55:1--55:25",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3372266",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 11 07:11:45 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3372266",
abstract = "Increasingly complex hardware makes the design of
effective compilers difficult. To reduce this problem,
we introduce Declarative Loop Tactics, which is a novel
framework of composable program transformations based
on an internal tree-like program \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Khan:2020:SMS,
author = "Asif Ali Khan and Fazal Hameed and Robin Bl{\"a}sing
and Stuart S. P. Parkin and Jeronimo Castrillon",
title = "{ShiftsReduce}: Minimizing Shifts in {Racetrack Memory
4.0}",
journal = j-TACO,
volume = "16",
number = "4",
pages = "56:1--56:23",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3372489",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 11 07:11:45 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3372489",
abstract = "Racetrack memories (RMs) have significantly evolved
since their conception in 2008, making them a serious
contender in the field of emerging memory technologies.
Despite key technological advancements, the access
latency and energy consumption of an RM-\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "56",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2020:DCP,
author = "Yuhao Li and Dan Sun and Benjamin C. Lee",
title = "Dynamic Colocation Policies with Reinforcement
Learning",
journal = j-TACO,
volume = "17",
number = "1",
pages = "1:1--1:25",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3375714",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:30:23 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3375714",
abstract = "We draw on reinforcement learning frameworks to design
and implement an adaptive controller for managing
resource contention. During runtime, the controller
observes the dynamic system conditions and optimizes
control policies that satisfy latency \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tampouratzis:2020:NHI,
author = "Nikolaos Tampouratzis and Ioannis Papaefstathiou and
Antonios Nikitakis and Andreas Brokalakis and Stamatis
Andrianakis and Apostolos Dollas and Marco Marcon and
Emanuele Plebani",
title = "A Novel, Highly Integrated Simulator for Parallel and
Distributed Systems",
journal = j-TACO,
volume = "17",
number = "1",
pages = "2:1--2:28",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3378934",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:30:23 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3378934",
abstract = "In an era of complex networked parallel heterogeneous
systems, simulating independently only parts,
components, or attributes of a system-under-design is a
cumbersome, inaccurate, and inefficient approach.
Moreover, by considering each part of a system
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jiang:2020:EHE,
author = "Lijuan Jiang and Chao Yang and Wenjing Ma",
title = "Enabling Highly Efficient Batched Matrix
Multiplications on {SW26010} Many-core Processor",
journal = j-TACO,
volume = "17",
number = "1",
pages = "3:1--3:23",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3378176",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:30:23 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3378176",
abstract = "We present a systematic methodology for optimizing
batched matrix multiplications on SW26010 many-core
processor of the Sunway TaihuLight supercomputer. Five
surrogate algorithms and a machine learning-based
algorithm selector are proposed to fully \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cavus:2020:IPI,
author = "Mustafa Cavus and Resit Sendag and Joshua J. Yi",
title = "Informed Prefetching for Indirect Memory Accesses",
journal = j-TACO,
volume = "17",
number = "1",
pages = "4:1--4:29",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3374216",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:30:23 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3374216",
abstract = "Indirect memory accesses have irregular access
patterns that limit the performance of conventional
software and hardware-based prefetchers. To address
this problem, we propose the Array Tracking Prefetcher
(ATP), which tracks array-based indirect memory
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Uguen:2020:ASA,
author = "Yohann Uguen and Florent {De Dinechin} and Victor
Lezaud and Steven Derrien",
title = "Application-Specific Arithmetic in High-Level
Synthesis Tools",
journal = j-TACO,
volume = "17",
number = "1",
pages = "5:1--5:23",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3377403",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:30:23 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3377403",
abstract = "This work studies hardware-specific optimization
opportunities currently unexploited by high-level
synthesis compilers. Some of these optimizations are
specializations of floating-point operations that
respect the usual semantics of the input program
without changing the numerical result. Some other
optimizations, locally triggered by the programmer
thanks to a pragma, assume a different semantics, where
floating-point code is interpreted as the specification
of computation with real numbers. The compiler is then
in charge to ensure an application-level accuracy
constraint expressed in the pragma and has the freedom
to use non-standard arithmetic hardware when more
efficient. These two classes of optimizations are
prototyped in the GeCoS source-to-source compiler and
evaluated on the Polybench and EEMBC benchmark suites.
Latency is reduced by up to 93\%, and resource usage is
reduced by up to 58\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Song:2020:IME,
author = "Yang Song and Bill Lin",
title = "Improving Memory Efficiency in Heterogeneous {MPSoCs}
through Row-Buffer Locality-aware Forwarding",
journal = j-TACO,
volume = "17",
number = "1",
pages = "6:1--6:26",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3377149",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:30:23 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3377149",
abstract = "In heterogeneous multicore systems, the memory
subsystem plays a critical role, since most
core-to-core communications are conducted through the
main memory. Memory efficiency has a substantial impact
on system performance. Although memory traffic from
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wu:2020:MBS,
author = "Hao Wu and Weizhi Liu and Huanxin Lin and Cho-Li
Wang",
title = "A Model-Based Software Solution for Simultaneous
Multiple Kernels on {GPUs}",
journal = j-TACO,
volume = "17",
number = "1",
pages = "7:1--7:26",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3377138",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:30:23 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3377138",
abstract = "As a critical computing resource in multiuser systems
such as supercomputers, data centers, and cloud
services, a GPU contains multiple compute units (CUs).
GPU Multitasking is an intuitive solution to
underutilization in GPGPU computing. Recently
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Shi:2020:OSB,
author = "Xuanhua Shi and Wei Liu and Ligang He and Hai Jin and
Ming Li and Yong Chen",
title = "Optimizing the {SSD} Burst Buffer by Traffic
Detection",
journal = j-TACO,
volume = "17",
number = "1",
pages = "8:1--8:26",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3377705",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:30:23 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3377705",
abstract = "Currently, HPC storage systems still use hard disk
drive (HDD) as their dominant storage device. Solid
state drive (SSD) is widely deployed as the buffer to
HDDs. Burst buffer has also been proposed to manage the
SSD buffering of bursty write requests.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kalra:2020:ACB,
author = "Charu Kalra and Fritz Previlon and Norm Rubin and
David Kaeli",
title = "{ArmorAll}: Compiler-based Resilience Targeting {GPU}
Applications",
journal = j-TACO,
volume = "17",
number = "2",
pages = "9:1--9:24",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3382132",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 27 12:06:50 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3382132",
abstract = "The vulnerability of GPUs to soft errors has become a
first-class design concern as they are increasingly
being used in accuracy-sensitive and safety-critical
domains. Existing solutions used to enhance the
reliability of GPUs come with significant overhead in
terms of area, power, and/or performance. In this
article, we propose ArmorAll, a light-weight, adaptive,
selective, and portable software solution to protect
GPUs against soft errors. ArmorAll consists of a set of
purely compiler-based redundancy schemes designed to
optimize instruction duplication on GPUs, thereby
enabling much more reliable execution. The choice of
the scheme determines the subset of instructions that
must be duplicated in an application, allowing
adaptable fault coverage for different applications.
ArmorAll can intelligently select a redundancy scheme
that provides the best coverage to an application with
an accuracy of 91.7\%. The high coverage provided by
ArmorAll comes at an average improvement of 64.5\% in
runtime",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cherubin:2020:DPA,
author = "Stefano Cherubin and Daniele Cattaneo and Michele
Chiari and Giovanni Agosta",
title = "Dynamic Precision Autotuning with {TAFFO}",
journal = j-TACO,
volume = "17",
number = "2",
pages = "10:1--10:26",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3388785",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 27 12:06:50 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3388785",
abstract = "Many classes of applications, both in the embedded and
high performance domains, can trade off the accuracy of
the computed results for computation performance. One
way to achieve such a trade-off is precision
tuning-that is, to modify the data types used for the
computation by reducing the bit width, or by changing
the representation from floating point to fixed point.
We present a methodology for high-accuracy dynamic
precision tuning based on the identification of input
classes (i.e., classes of input datasets that benefit
from similar optimizations). When a new input region is
detected, the application kernels are re-compiled on
the fly with the appropriate selection of parameters.
In this way, we obtain a continuous optimization
approach that enables the exploitation of the reduced
precision computation while progressively exploring the
solution space, thus reducing the time required by
compilation overheads. We provide tools to support the
automation of the runtime part of the solution, leaving
to the user only the task of identifying the input
classes. Our approach provides a significant
performance boost (up to 320\%) on the typical
approximate computing benchmarks, without meaningfully
affecting the accuracy of the result, since the error
remains always below 3\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Erdem:2020:RDS,
author = "Ahmet Erdem and Cristina Silvano and Thomas Boesch and
Andrea Carlo Ornstein and Surinder-Pal Singh and
Giuseppe Desoli",
title = "Runtime Design Space Exploration and Mapping of
{DCNNs} for the Ultra-Low-Power {Orlando SoC}",
journal = j-TACO,
volume = "17",
number = "2",
pages = "11:1--11:25",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3379933",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 27 12:06:50 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3379933",
abstract = "Recent trends in deep convolutional neural networks
(DCNNs) impose hardware accelerators as a viable
solution for computer vision and speech recognition.
The Orlando SoC architecture from STMicroelectronics
targets exactly this class of problems by \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sabet:2020:RAU,
author = "Amir Hossein Nodehi Sabet and Junqiao Qiu and Zhijia
Zhao and Sriram Krishnamoorthy",
title = "Reliability Analysis for Unreliable {FSM}
Computations",
journal = j-TACO,
volume = "17",
number = "2",
pages = "12:1--12:23",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3377456",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 27 12:06:50 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3377456",
abstract = "Finite State Machines (FSMs) are fundamental in both
hardware design and software development. However, the
reliability of FSM computations remains poorly
understood. Existing reliability analyses are mainly
designed for generic computations and are \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xue:2020:NIA,
author = "Jiachen Xue and T. N. Vijaykumar and Mithuna
Thottethodi",
title = "Network Interface Architecture for Remote Indirect
Memory Access {(RIMA)} in Datacenters",
journal = j-TACO,
volume = "17",
number = "2",
pages = "13:1--13:22",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3374215",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 27 12:06:50 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3374215",
abstract = "Remote Direct Memory Access (RDMA) fabrics such as
InfiniBand and Converged Ethernet report latency
shorter by a factor of 50 than TCP. As such, RDMA is a
potential replacement for TCP in datacenters (DCs)
running low-latency applications, such as Web
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2020:CFS,
author = "Qinggang Wang and Long Zheng and Jieshan Zhao and
Xiaofei Liao and Hai Jin and Jingling Xue",
title = "A Conflict-free Scheduler for High-performance Graph
Processing on Multi-pipeline {FPGAs}",
journal = j-TACO,
volume = "17",
number = "2",
pages = "14:1--14:26",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3390523",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 27 12:06:50 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3390523",
abstract = "FPGA-based graph processing accelerators are nowadays
equipped with multiple pipelines for hardware
acceleration of graph computations. However, their
multi-pipeline efficiency can suffer greatly from the
considerable overheads caused by the read/write
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tino:2020:SXE,
author = "Anita Tino and Caroline Collange and Andr{\'e}
Seznec",
title = "{SIMT-X}: Extending Single-Instruction Multi-Threading
to Out-of-Order Cores",
journal = j-TACO,
volume = "17",
number = "2",
pages = "15:1--15:23",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3392032",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 27 12:06:50 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3392032",
abstract = "This work introduces Single Instruction Multi-Thread
Express (SIMT-X), a general-purpose Central Processing
Unit (CPU) microarchitecture that enables Graphics
Processing Units (GPUs)-style SIMT execution across
multiple threads of the same program for high
throughput, while retaining the latency benefits of
out-of-order execution, and the programming convenience
of homogeneous multi-thread processors. SIMT-X
leverages the existing Single Instruction Multiple Data
(SIMD) back-end to provide CPU/GPU-like processing on a
single core with minimal overhead. We demonstrate that
although SIMT-X invokes a restricted form of
Out-of-Order (OoO), the microarchitecture successfully
captures a majority of the benefits of aggressive OoO
execution using at most two concurrent register
mappings per architectural register, while addressing
issues of partial dependencies and supporting a
general-purpose Instruction Set Architecture (ISA).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kaeli:2020:EME,
author = "Dave Kaeli",
title = "Editorial: a Message from the {Editor-in-Chief}",
journal = j-TACO,
volume = "17",
number = "3",
pages = "16:1--16:2",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3409369",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 28 12:02:00 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3409369",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Rangan:2020:ZEZ,
author = "Ram Rangan and Mark W. Stephenson and Aditya Ukarande
and Shyam Murthy and Virat Agarwal and Marc
Blackstein",
title = "{Zeroploit}: Exploiting Zero Valued Operands in
Interactive Gaming Applications",
journal = j-TACO,
volume = "17",
number = "3",
pages = "17:1--17:26",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3394284",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 28 12:02:00 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3394284",
abstract = "In this article, we first characterize register
operand value locality in shader programs of modern
gaming applications and observe that there is a high
likelihood of one of the register operands of several
multiply, logical-and, and similar operations
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Adamek:2020:GFC,
author = "Karel Ad{\'a}mek and Sofia Dimoudi and Mike Giles and
Wesley Armour",
title = "{GPU} Fast Convolution via the Overlap-and-Save Method
in Shared Memory",
journal = j-TACO,
volume = "17",
number = "3",
pages = "18:1--18:20",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3394116",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 28 12:02:00 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3394116",
abstract = "We present an implementation of the overlap-and-save
method, a method for the convolution of very long
signals with short response functions, which is
tailored to GPUs. We have implemented several FFT
algorithms (using the CUDA programming language),
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Das:2020:FER,
author = "Arnab Das and Sriram Krishnamoorthy and Ian Briggs and
Ganesh Gopalakrishnan and Ramakrishna Tipireddy",
title = "{FPDetect}: Efficient Reasoning About Stencil Programs
Using Selective Direct Evaluation",
journal = j-TACO,
volume = "17",
number = "3",
pages = "19:1--19:27",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3402451",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 28 12:02:00 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3402451",
abstract = "We present FPDetect, a low-overhead approach for
detecting logical errors and soft errors affecting
stencil computations without generating false
positives. We develop an offline analysis that tightly
estimates the number of floating-point bits \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Abdelrahman:2020:CSH,
author = "Tarek S. Abdelrahman",
title = "Cooperative Software-hardware Acceleration of
{$K$}-means on a Tightly Coupled {CPU--FPGA} System",
journal = j-TACO,
volume = "17",
number = "3",
pages = "20:1--20:24",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3406114",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 28 12:02:00 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3406114",
abstract = "We consider software-hardware acceleration of K-means
clustering on the Intel Xeon+FPGA platform. We design a
pipelined accelerator for K-means and combine it with
CPU threads to assess performance benefits of (1)
acceleration when data are only \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lee:2020:SBP,
author = "Jaekyu Lee and Yasuo Ishii and Dam Sunwoo",
title = "Securing Branch Predictors with Two-Level Encryption",
journal = j-TACO,
volume = "17",
number = "3",
pages = "21:1--21:25",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3404189",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 28 12:02:00 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3404189",
abstract = "Modern processors rely on various speculative
mechanisms to meet performance demand. Branch
predictors are one of the most important
micro-architecture components to deliver performance.
However, they have been under heavy scrutiny because of
recent \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cerina:2020:EDO,
author = "L. Cerina and M. D. Santambrogio and G. Franco and C.
Gallicchio and A. Micheli",
title = "{EchoBay}: Design and Optimization of Echo State
Networks under Memory and Time Constraints",
journal = j-TACO,
volume = "17",
number = "3",
pages = "22:1--22:24",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3404993",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 28 12:02:00 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3404993",
abstract = "The increase in computational power of embedded
devices and the latency demands of novel applications
brought a paradigm shift on how and where the
computation is performed. Although AI inference is
slowly moving from the cloud to end-devices with
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "22",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sioutas:2020:SSH,
author = "Savvas Sioutas and Sander Stuijk and Twan Basten and
Henk Corporaal and Lou Somers",
title = "Schedule Synthesis for {Halide} Pipelines on {GPUs}",
journal = j-TACO,
volume = "17",
number = "3",
pages = "23:1--23:25",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3406117",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 28 12:02:00 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3406117",
abstract = "The Halide DSL and compiler have enabled
high-performance code generation for image processing
pipelines targeting heterogeneous architectures through
the separation of algorithmic description and
optimization schedule. However, automatic schedule
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Huzaifa:2020:IKR,
author = "Muhammad Huzaifa and Johnathan Alsop and Abdulrahman
Mahmoud and Giordano Salvador and Matthew D. Sinclair
and Sarita V. Adve",
title = "Inter-kernel Reuse-aware Thread Block Scheduling",
journal = j-TACO,
volume = "17",
number = "3",
pages = "24:1--24:27",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3406538",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 28 12:02:00 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3406538",
abstract = "As GPUs have become more programmable, their
performance and energy benefits have made them
increasingly popular. However, while GPU compute units
continue to improve in performance, on-chip memories
lag behind and data accesses are becoming \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jafri:2021:RTC,
author = "Syed M. A. H. Jafri and Hasan Hassan and Ahmed Hemani
and Onur Mutlu",
title = "Refresh Triggered Computation: Improving the Energy
Efficiency of Convolutional Neural Network
Accelerators",
journal = j-TACO,
volume = "18",
number = "1",
pages = "2:1--2:29",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3417708",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 16 06:46:44 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3417708",
abstract = "To employ a Convolutional Neural Network (CNN) in an
energy-constrained embedded system, it is critical for
the CNN implementation to be highly energy efficient.
Many recent studies propose CNN accelerator
architectures with custom computation units \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Abera:2021:PET,
author = "Solomon Abera and M. Balakrishnan and Anshul Kumar",
title = "Performance-Energy Trade-off in Modern {CMPs}",
journal = j-TACO,
volume = "18",
number = "1",
pages = "3:1--3:26",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3427092",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 16 06:46:44 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3427092",
abstract = "Chip multiprocessors (CMPs) are ubiquitous in all
computing systems ranging from high-end servers to
mobile devices. In these systems, energy consumption is
a critical design constraint as it constitutes the most
significant operating cost for computing \ldots{}.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mehrabi:2021:BOE,
author = "Atefeh Mehrabi and Aninda Manocha and Benjamin C. Lee
and Daniel J. Sorin",
title = "{Bayesian} Optimization for Efficient Accelerator
Synthesis",
journal = j-TACO,
volume = "18",
number = "1",
pages = "4:1--4:25",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3427377",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 16 06:46:44 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3427377",
abstract = "Accelerator design is expensive due to the effort
required to understand an algorithm and optimize the
design. Architects have embraced two technologies to
reduce costs. High-level synthesis automatically
generates hardware from code. Reconfigurable \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kim:2021:IRA,
author = "Minsu Kim and Jeong-Keun Park and Soo-Mook Moon",
title = "Irregular Register Allocation for Translation of
Test-pattern Programs",
journal = j-TACO,
volume = "18",
number = "1",
pages = "5:1--5:23",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3427378",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 16 06:46:44 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3427378",
abstract = "Test-pattern programs are for testing DRAM memory
chips. They run on a special embedded system called
automated test equipment (ATE). Each ATE manufacturer
provides its own programming language, which is mostly
low level, thus accessing the registers in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Nematollahi:2021:ENN,
author = "Negin Nematollahi and Mohammad Sadrosadati and Hajar
Falahati and Marzieh Barkhordar and Mario Paulo Drumond
and Hamid Sarbazi-Azad and Babak Falsafi",
title = "Efficient Nearest-Neighbor Data Sharing in {GPUs}",
journal = j-TACO,
volume = "18",
number = "1",
pages = "6:1--6:26",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3429981",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 16 06:46:44 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3429981",
abstract = "Stencil codes (a.k.a. nearest-neighbor computations)
are widely used in image processing, machine learning,
and scientific applications. Stencil codes incur
nearest-neighbor data exchange because the value of
each point in the structured grid is \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Braun:2021:SMP,
author = "Lorenz Braun and Sotirios Nikas and Chen Song and
Vincent Heuveline and Holger Fr{\"o}ning",
title = "A Simple Model for Portable and Fast Prediction of
Execution Time and Power Consumption of {GPU} Kernels",
journal = j-TACO,
volume = "18",
number = "1",
pages = "7:1--7:25",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3431731",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 16 06:46:44 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3431731",
abstract = "Characterizing compute kernel execution behavior on
GPUs for efficient task scheduling is a non-trivial
task. We address this with a simple model enabling
portable and fast predictions among different GPUs
using only hardware-independent features. This
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mettler:2021:DHM,
author = "Marcel Mettler and Daniel Mueller-Gritschneder and Ulf
Schlichtmann",
title = "A Distributed Hardware Monitoring System for Runtime
Verification on Multi-Tile {MPSoCs}",
journal = j-TACO,
volume = "18",
number = "1",
pages = "8:1--8:25",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3430699",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 16 06:46:44 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3430699",
abstract = "Exhaustive verification techniques do not scale with
the complexity of today's multi-tile Multi-processor
Systems-on-chip (MPSoCs). Hence, runtime verification
(RV) has emerged as a complementary method, which
verifies the correct behavior of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2021:EPO,
author = "Yu Emma Wang and Carole-Jean Wu and Xiaodong Wang and
Kim Hazelwood and David Brooks",
title = "Exploiting Parallelism Opportunities with Deep
Learning Frameworks",
journal = j-TACO,
volume = "18",
number = "1",
pages = "9:1--9:23",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3431388",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 16 06:46:44 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3431388",
abstract = "State-of-the-art machine learning frameworks support a
wide variety of design features to enable a flexible
machine learning programming interface and to ease the
programmability burden on machine learning developers.
Identifying and using a performance-. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tavarageri:2021:PPO,
author = "Sanket Tavarageri and Alexander Heinecke and Sasikanth
Avancha and Bharat Kaul and Gagandeep Goyal and
Ramakrishna Upadrasta",
title = "{PolyDL}: Polyhedral Optimizations for Creation of
High-performance {DL} Primitives",
journal = j-TACO,
volume = "18",
number = "1",
pages = "11:1--11:27",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3433103",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 16 06:46:44 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3433103",
abstract = "Deep Neural Networks (DNNs) have revolutionized many
aspects of our lives. The use of DNNs is becoming
ubiquitous, including in software for image
recognition, speech recognition, speech synthesis,
language translation, to name a few. The training of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yadalam:2021:SXS,
author = "Sujay Yadalam and Vinod Ganapathy and Arkaprava Basu",
title = "{SG XL}: Security and Performance for Enclaves Using
Large Pages",
journal = j-TACO,
volume = "18",
number = "1",
pages = "12:1--12:25",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3433983",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 16 06:46:44 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3433983",
abstract = "Intel's SGX architecture offers clients of public
cloud computing platforms the ability to create
hardware-protected enclaves whose contents are
protected from privileged system software. However, SGX
relies on system software for enclave memory
\ldots{}.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kalaitzidis:2021:LVE,
author = "Kleovoulos Kalaitzidis and Andr{\'e} Seznec",
title = "Leveraging Value Equality Prediction for Value
Speculation",
journal = j-TACO,
volume = "18",
number = "1",
pages = "13:1--13:20",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3436821",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 16 06:46:44 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3436821",
abstract = "Value Prediction (VP) has recently been gaining
interest in the research community, since prior work
has established practical solutions for its
implementation that provide meaningful performance
gains. A constant challenge of contemporary
context-based \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Singh:2021:SSM,
author = "Abhishek Singh and Shail Dave and Pantea Zardoshti and
Robert Brotzman and Chao Zhang and Xiaochen Guo and
Aviral Shrivastava and Gang Tan and Michael Spear",
title = "{SPX64}: a Scratchpad Memory for General-purpose
Microprocessors",
journal = j-TACO,
volume = "18",
number = "1",
pages = "14:1--14:26",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3436730",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 16 06:46:44 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3436730",
abstract = "General-purpose computing systems employ memory
hierarchies to provide the appearance of a single
large, fast, coherent memory. In special-purpose CPUs,
programmers manually manage distinct, non-coherent
scratchpad memories. In this article, we combine
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Labini:2021:APM,
author = "Paolo Sylos Labini and Marco Cianfriglia and Damiano
Perri and Osvaldo Gervasi and Grigori Fursin and Anton
Lokhmotov and Cedric Nugteren and Bruno Carpentieri and
Fabiana Zollo and Flavio Vella",
title = "On the Anatomy of Predictive Models for Accelerating
{GPU} Convolution Kernels and Beyond",
journal = j-TACO,
volume = "18",
number = "1",
pages = "16:1--16:24",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3434402",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 16 06:46:44 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3434402",
abstract = "Efficient HPC libraries often expose multiple tunable
parameters, algorithmic implementations, or a
combination of them, to provide optimized routines. The
optimal parameters and algorithmic choices may depend
on input properties such as the shapes of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Voss:2021:PRS,
author = "Nils Voss and Bastiaan Kwaadgras and Oskar Mencer and
Wayne Luk and Georgi Gaydadjiev",
title = "On Predictable Reconfigurable System Design",
journal = j-TACO,
volume = "18",
number = "2",
pages = "17:1--17:28",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3436995",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Mar 20 17:25:10 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3436995",
abstract = "We propose a design methodology to facilitate rigorous
development of complex applications targeting
reconfigurable hardware. Our methodology relies on
analytical estimation of system performance and area
utilisation for a given specific application and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kaushik:2021:GHP,
author = "Anirudh Mohan Kaushik and Gennady Pekhimenko and Hiren
Patel",
title = "{Gretch}: a Hardware Prefetcher for Graph Analytics",
journal = j-TACO,
volume = "18",
number = "2",
pages = "18:1--18:25",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3439803",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Mar 20 17:25:10 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3439803",
abstract = "Data-dependent memory accesses (DDAs) pose an
important challenge for high-performance graph
analytics (GA). This is because such memory accesses do
not exhibit enough temporal and spatial locality
resulting in low cache performance. Prior efforts that
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ho:2021:GFD,
author = "Nhut-Minh Ho and Himeshi {De Silva} and Weng-Fai
Wong",
title = "{GRAM}: a Framework for Dynamically Mixing Precisions
in {GPU} Applications",
journal = j-TACO,
volume = "18",
number = "2",
pages = "19:1--19:24",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3441830",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Mar 20 17:25:10 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3441830",
abstract = "This article presents GRAM (GPU-based Runtime Adaption
for Mixed-precision) a framework for the effective use
of mixed precision arithmetic for CUDA programs. Our
method provides a fine-grain tradeoff between output
error and performance. It can create many variants that
satisfy different accuracy requirements by assigning
different groups of threads to different precision
levels adaptively at runtime. To widen the range of
applications that can benefit from its approximation,
GRAM comes with an optional half-precision approximate
math library. Using GRAM, we can trade off precision
for any performance improvement of up to 540\%,
depending on the application and accuracy
requirement.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Biswas:2021:CSI,
author = "Arnab Kumar Biswas",
title = "Cryptographic Software {IP} Protection without
Compromising Performance or Timing Side-channel
Leakage",
journal = j-TACO,
volume = "18",
number = "2",
pages = "20:1--20:20",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3443707",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Mar 20 17:25:10 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3443707",
abstract = "Program obfuscation is a widely used cryptographic
software intellectual property (IP) protection
technique against reverse engineering attacks in
embedded systems. However, very few works have studied
the impact of combining various obfuscation techniques
on the obscurity (difficulty of reverse engineering)
and performance (execution time) of obfuscated
programs. In this article, we propose a Genetic
Algorithm (GA)-based framework that not only optimizes
obscurity and performance of obfuscated cryptographic
programs, but it also ensures very low timing
side-channel leakage. Our proposed Timing Side Channel
Sensitive Program Obfuscation Optimization Framework
(TSC-SPOOF) determines the combination of obfuscation
transformation functions that produce optimized
obfuscated programs with preferred optimization
parameters. In particular, TSC-SPOOF employs normalized
compression distance (NCD) and channel capacity to
measure obscurity and timing side-channel leakage,
respectively. We also use RISC-V rocket core running on
a Xilinx Zynq FPGA device as part of our framework to
obtain realistic results. The experimental results
clearly show that our proposed solution leads to
cryptographic programs with lower execution time,
higher obscurity, and lower timing side-channel leakage
than unguided obfuscation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{France-Pillois:2021:NIT,
author = "Maxime France-Pillois and J{\'e}r{\^o}me Martin and
Fr{\'e}d{\'e}ric Rousseau",
title = "A Non-Intrusive Tool Chain to Optimize {MPSoC}
End-to-End Systems",
journal = j-TACO,
volume = "18",
number = "2",
pages = "21:1--21:22",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3445030",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Mar 20 17:25:10 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3445030",
abstract = "Multi-core systems are now found in many electronic
devices. But does current software design fully
leverage their capabilities? The complexity of the
hardware and software stacks in these platforms
requires software optimization with end-to-end
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2021:GTU,
author = "Pengyu Wang and Jing Wang and Chao Li and Jianzong
Wang and Haojin Zhu and Minyi Guo",
title = "{Grus}: Toward Unified-memory-efficient
High-performance Graph Processing on {GPU}",
journal = j-TACO,
volume = "18",
number = "2",
pages = "22:1--22:25",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3444844",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Mar 20 17:25:10 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3444844",
abstract = "Today's GPU graph processing frameworks face
scalability and efficiency issues as the graph size
exceeds GPU-dedicated memory limit. Although recent
GPUs can over-subscribe memory with Unified Memory
(UM), they incur significant overhead when handling
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "22",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Izadpanah:2021:PPT,
author = "Ramin Izadpanah and Christina Peterson and Yan Solihin
and Damian Dechev",
title = "{PETRA}: Persistent Transactional Non-blocking Linked
Data Structures",
journal = j-TACO,
volume = "18",
number = "2",
pages = "23:1--23:26",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3446391",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Mar 20 17:25:10 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3446391",
abstract = "Emerging byte-addressable Non-Volatile Memories (NVMs)
enable persistent memory where process state can be
recovered after crashes. To enable applications to rely
on persistent data, durable data structures with
failure-atomic operations have been \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hassan:2021:RCM,
author = "Muhammad Hassan and Chang Hyun Park and David
Black-Schaffer",
title = "A Reusable Characterization of the Memory System
Behavior of {SPEC2017} and {SPEC2006}",
journal = j-TACO,
volume = "18",
number = "2",
pages = "24:1--24:20",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3446200",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Mar 20 17:25:10 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3446200",
abstract = "The SPEC CPU Benchmarks are used extensively for
evaluating and comparing improvements to computer
systems. This ubiquity makes characterization critical
for researchers to understand the bottlenecks the
benchmarks do and do not expose and where new designs
should and should not be expected to show impact.
However, in characterization there is a tradeoff
between accuracy and reusability: The more precisely we
characterize a benchmark's performance on a given
system, the less usable it is across different
micro-architectures and varying memory configurations.
For SPEC, most existing characterizations include
system-specific effects (e.g., via performance
counters) and/or only look at aggregate behavior (e.g.,
averages over the full application execution). While
such approaches simplify characterization, they make it
difficult to separate the applications intrinsic
behavior from the system-specific effects and/or lose
the diverse phase-based behaviors.\par
In this work we focus on characterizing the
applications intrinsic memory behaviour by isolating
them from micro-architectural configuration specifics.
We do this by providing a simplified generic system
model that evaluates the applications memory behavior
across multiple cache sizes, with and without
prefetching, and over time. The resulting
characterization can be reused across a range of
systems to understand application behavior and allow us
to see how frequently different behaviors occur. We use
this approach to compare the SPEC 2006 and 2017 suites,
providing insight into their memory system behaviour
beyond previous system-specific and/or aggregate
results. We demonstrate the ability to use this
characterization in different contexts by showing a
portion of the SPEC 2017 benchmark suite that could
benefit from giga-scale caches, despite aggregate
results indicating otherwise.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tiwari:2021:PCP,
author = "Sugandha Tiwari and Neel Gala and Chester Rebeiro and
V. Kamakoti",
title = "{PERI}: a Configurable Posit Enabled {RISC-V} Core",
journal = j-TACO,
volume = "18",
number = "3",
pages = "25:1--25:26",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3446210",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jun 29 08:21:11 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/risc-v.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3446210",
abstract = "Owing to the failure of Dennard's scaling, the past
decade has seen a steep growth of prominent new
paradigms leveraging opportunities in computer
architecture. Two technologies of interest are Posit
and RISC-V. Posit was introduced in mid-2017 as a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "25",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Charitopoulos:2021:MDC,
author = "George Charitopoulos and Dionisios N. Pnevmatikatos
and Georgi Gaydadjiev",
title = "{MC-DeF}: Creating Customized {CGRAs} for Dataflow
Applications",
journal = j-TACO,
volume = "18",
number = "3",
pages = "26:1--26:25",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3447970",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jun 29 08:21:11 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3447970",
abstract = "Executing complex scientific applications on
Coarse-Grain Reconfigurable Arrays (CGRAs) promises
improvements in execution time and/or energy
consumption compared to optimized software
implementations or even fully customized hardware
solutions. Typical \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "26",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Borbon:2021:APB,
author = "Jose M. Rodriguez Borbon and Junjie Huang and Bryan M.
Wong and Walid Najjar",
title = "Acceleration of Parallel-Blocked {$ Q R $}
Decomposition of Tall-and-Skinny Matrices on {FPGAs}",
journal = j-TACO,
volume = "18",
number = "3",
pages = "27:1--27:25",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3447775",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jun 29 08:21:11 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3447775",
abstract = "$ Q R $ decomposition is one of the most useful
factorization kernels in modern numerical linear
algebra algorithms. In particular, the decomposition of
tall-and-skinny matrices (TSMs) has major applications
in areas including scientific computing, machine
learning, image processing, wireless networks, and
numerical methods. Traditionally, CPUs and GPUs have
achieved better throughput on these applications by
using large cache hierarchies and compute cores running
at a high frequency, leading to high power consumption.
With the advent of heterogeneous platforms, however,
FPGAs are emerging as a promising viable alternative.
In this work, we propose a high-throughput FPGA-based
engine that has a very high computational efficiency
(ratio of achieved to peak throughput) compared to
similar $ Q R $ solvers running on FPGAs. Although
comparable $ Q R $ solvers achieve an efficiency of
36\%, our design exhibits an efficiency of 54\%. For
TSMs, our experimental results show that our design can
outperform highly optimized $ Q R $ solvers running on
CPUs and GPUs. For TSMs with more than 50K rows, our
design outperforms the Intel MKL solver running on an
Intel quad-core processor by a factor of $ 1.5 \times
$. For TSMs containing 256 columns or less, our design
outperforms the NVIDIA CUBLAS solver running on a K40
GPU by a factor of $ 3.0 \times $. In addition to being
fast, our design is energy efficient competing
platforms execute up to 0.6 GFLOPS/Joule, whereas our
design executes more than 1.0 GFLOPS/Joule.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Stokes:2021:DMR,
author = "Michael Stokes and David Whalley and Soner Onder",
title = "Decreasing the Miss Rate and Eliminating the
Performance Penalty of a Data Filter Cache",
journal = j-TACO,
volume = "18",
number = "3",
pages = "28:1--28:22",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3449043",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jun 29 08:21:11 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3449043",
abstract = "While data filter caches (DFCs) have been shown to be
effective at reducing data access energy, they have not
been adopted in processors due to the associated
performance penalty caused by high DFC miss rates. In
this article, we present a design that \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "28",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Akram:2021:PEI,
author = "Shoaib Akram",
title = "Performance Evaluation of {Intel Optane} Memory for
Managed Workloads",
journal = j-TACO,
volume = "18",
number = "3",
pages = "29:1--29:26",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3451342",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jun 29 08:21:11 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2020.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3451342",
abstract = "Intel Optane memory offers non-volatility, byte
addressability, and high capacity. It suits managed
workloads that prefer large main memory heaps. We
investigate Optane as the main memory for managed
(Java) workloads, focusing on performance scalability.
As the workload (core count) increases, we note
Optane's performance relative to DRAM. A few workloads
incur a slight slowdown on Optane memory, which helps
conserve limited DRAM capacity. Unfortunately, other
workloads scale poorly beyond a few core
counts.\par
This article investigates scaling bottlenecks for Java
workloads on Optane memory, analyzing the application,
runtime, and microarchitectural interactions. Poorly
scaling workloads allocate objects rapidly and access
objects in Optane memory frequently. These
characteristics slow down the mutator and substantially
slow down garbage collection (GC). At the
microarchitecture level, load, store, and instruction
miss penalties rise. To regain performance, we
partition heaps across DRAM and Optane memory, a hybrid
that scales considerably better than Optane alone. We
exploit state-of-the-art GC approaches to partition
heaps. Unfortunately, existing GC approaches needlessly
waste DRAM capacity because they ignore runtime
behavior.\par
This article also introduces performance impact-guided
memory allocation (PIMA) for hybrid memories. PIMA
maximizes Optane utilization, allocating in DRAM only
if it improves performance. It estimates the
performance impact of allocating heaps in either memory
type by sampling. We target PIMA at graph analytics
workloads, offering a novel performance estimation
method and detailed evaluation. PIMA identifies
workload phases that benefit from DRAM with high
(94.33\%) accuracy, incurring only a 2\% sampling
overhead. PIMA operates stand-alone or combines with
prior approaches to offer new performance versus DRAM
capacity trade-offs. This work opens up Optane memory
to a real-life role as the main memory for Java
workloads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "29",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lu:2021:GAG,
author = "Yashuai L{\"u} and Hui Guo and Libo Huang and Qi Yu
and Li Shen and Nong Xiao and Zhiying Wang",
title = "{GraphPEG}: Accelerating Graph Processing on {GPUs}",
journal = j-TACO,
volume = "18",
number = "3",
pages = "30:1--30:24",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3450440",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jun 29 08:21:11 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3450440",
abstract = "Due to massive thread-level parallelism, GPUs have
become an attractive platform for accelerating
large-scale data parallel computations, such as graph
processing. However, achieving high performance for
graph processing with GPUs is non-trivial. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "30",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Omar:2021:PSH,
author = "Hamza Omar and Omer Khan",
title = "{PRISM}: Strong Hardware Isolation-based Soft-Error
Resilient Multicore Architecture with High Performance
and Availability at Low Hardware Overheads",
journal = j-TACO,
volume = "18",
number = "3",
pages = "31:1--31:25",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3450523",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jun 29 08:21:11 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3450523",
abstract = "Multicores increasingly deploy safety-critical
parallel applications that demand resiliency against
soft-errors to satisfy the safety standards. However,
protection against these errors is challenging due to
complex communication and data access \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "31",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tripathy:2021:PLG,
author = "Devashree Tripathy and Amirali Abdolrashidi and Laxmi
Narayan Bhuyan and Liang Zhou and Daniel Wong",
title = "{PAVER}: Locality Graph-Based Thread Block Scheduling
for {GPUs}",
journal = j-TACO,
volume = "18",
number = "3",
pages = "32:1--32:26",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3451164",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jun 29 08:21:11 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3451164",
abstract = "The massive parallelism present in GPUs comes at the
cost of reduced L1 and L2 cache sizes per thread,
leading to serious cache contention problems such as
thrashing. Hence, the data access locality of an
application should be considered during thread
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "32",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Heirman:2021:ASE,
author = "Wim Heirman and Stijn Eyerman and Kristof {Du Bois}
and Ibrahim Hur",
title = "Automatic Sublining for Efficient Sparse Memory
Accesses",
journal = j-TACO,
volume = "18",
number = "3",
pages = "33:1--33:23",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3452141",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jun 29 08:21:11 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3452141",
abstract = "Sparse memory accesses, which are scattered accesses
to single elements of a large data structure, are a
challenge for current processor architectures. Their
lack of spatial and temporal locality and their
irregularity makes caches and traditional \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "33",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cavus:2021:FKV,
author = "Mustafa Cavus and Mohammed Shatnawi and Resit Sendag
and Augustus K. Uht",
title = "Fast Key-Value Lookups with Node Tracker",
journal = j-TACO,
volume = "18",
number = "3",
pages = "34:1--34:26",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3452099",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jun 29 08:21:11 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3452099",
abstract = "Lookup operations for in-memory databases are heavily
memory bound, because they often rely on
pointer-chasing linked data structure traversals. They
also have many branches that are hard-to-predict due to
random key lookups. In this study, we show that
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "34",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Song:2021:CRE,
author = "Weijia Song and Christina Delimitrou and Zhiming Shen
and Robbert {Van Renesse} and Hakim Weatherspoon and
Lotfi Benmohamed and Frederic {De Vaulx} and Charif
Mahmoudi",
title = "{CacheInspector}: Reverse Engineering Cache Resources
in Public Clouds",
journal = j-TACO,
volume = "18",
number = "3",
pages = "35:1--35:25",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3457373",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jun 29 08:21:11 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
URL = "https://dl.acm.org/doi/10.1145/3457373",
abstract = "Infrastructure-as-a-Service cloud providers sell
virtual machines that are only specified in terms of
number of CPU cores, amount of memory, and I/O
throughput. Performance-critical aspects such as cache
sizes and memory latency are missing or reported in
ways that make them hard to compare across cloud
providers. It is difficult for users to adapt their
application's behavior to the available resources. In
this work, we aim to increase the visibility that cloud
users have into shared resources on public clouds.
Specifically, we present CacheInspector, a lightweight
runtime that determines the performance and allocated
capacity of shared caches on multi-tenant public
clouds. We validate CacheInspector's accuracy in a
controlled environment, and use it to study the
characteristics and variability of cache resources in
the cloud, across time, instances, availability
regions, and cloud providers. We show that
CacheInspector's output allows cloud users to tailor
their application's behavior, including their output
quality, to avoid suboptimal performance when resources
are scarce.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "35",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{RodriguesCarvalho:2021:UCC,
author = "Daniel {Rodrigues Carvalho} and Andr{\'e} Seznec",
title = "Understanding Cache Compression",
journal = j-TACO,
volume = "18",
number = "3",
pages = "36:1--36:27",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3457207",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jun 29 08:21:11 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3457207",
abstract = "Hardware cache compression derives from
software-compression research; yet, its implementation
is not a straightforward translation, since it must
abide by multiple restrictions to comply with area,
power, and latency constraints. This study sheds light
on the challenges of adopting compression in cache
design ---from the shrinking of the data until its
physical placement. The goal of this article is not to
summarize proposals but to put in evidence the
solutions they employ to handle those challenges. An
in-depth description of the main characteristics of
multiple methods is provided, as well as criteria that
can be used as a basis for the assessment of such
schemes. It is expected that this article will ease the
understanding of decisions to be taken for the design
of compressed systems and provide directions for future
work.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "36",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Thuerck:2021:FRA,
author = "Daniel Thuerck and Nicolas Weber and Roberto Bifulco",
title = "{Flynn}'s Reconciliation: Automating the Register
Cache Idiom for Cross-accelerator Programming",
journal = j-TACO,
volume = "18",
number = "3",
pages = "37:1--37:26",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3458357",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jun 29 08:21:11 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3458357",
abstract = "A large portion of the recent performance increase in
the High Performance Computing (HPC) and Machine
Learning (ML) domains is fueled by accelerator cards.
Many popular ML frameworks support accelerators by
organizing computations as a computational \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "37",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Carvalho:2021:KRN,
author = "Jo{\~a}o P. L. {De Carvalho} and Braedy Kuzma and Ivan
Korostelev and Jos{\'e} Nelson Amaral and Christopher
Barton and Jos{\'e} Moreira and Guido Araujo",
title = "{KernelFaRer}: Replacing Native-Code Idioms with
High-Performance Library Calls",
journal = j-TACO,
volume = "18",
number = "3",
pages = "38:1--38:22",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3459010",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jun 29 08:21:11 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3459010",
abstract = "Well-crafted libraries deliver much higher performance
than code generated by sophisticated application
programmers using advanced optimizing compilers. When a
code pattern for which a well-tuned library
implementation exists is found in the source code
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "38",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Alves:2021:EAP,
author = "Ricardo Alves and Stefanos Kaxiras and David
Black-Schaffer",
title = "Early Address Prediction: Efficient Pipeline Prefetch
and Reuse",
journal = j-TACO,
volume = "18",
number = "3",
pages = "39:1--39:22",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3458883",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jun 29 08:21:11 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3458883",
abstract = "Achieving low load-to-use latency with low energy and
storage overheads is critical for performance. Existing
techniques either prefetch into the pipeline (via
address prediction and validation) or provide data
reuse in the pipeline (via register sharing or L0
caches). These techniques provide a range of tradeoffs
between latency, reuse, and overhead.\par
In this work, we present a pipeline prefetching
technique that achieves state-of-the-art performance
and data reuse without additional data storage, data
movement, or validation overheads by adding address
tags to the register file. Our addition of register
file tags allows us to forward (reuse) load data from
the register file with no additional data movement,
keep the data alive in the register file beyond the
instruction s lifetime to increase temporal reuse, and
coalesce prefetch requests to achieve spatial reuse.
Further, we show that we can use the existing memory
order violation detection hardware to validate
prefetches and data forwards without additional
overhead.\par
Our design achieves the performance of existing
pipeline prefetching while also forwarding 32\% of the
loads from the register file (compared to 15\% in
state-of-the-art register sharing), delivering a 16\%
reduction in L1 dynamic energy (1.6\% total processor
energy), with an area overhead of less than 0.5\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Goswami:2021:TES,
author = "Kaustav Goswami and Dip Sankar Banerjee and Shirshendu
Das",
title = "Towards Enhanced System Efficiency while Mitigating
Row Hammer",
journal = j-TACO,
volume = "18",
number = "4",
pages = "40:1--40:26",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3458749",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3458749",
abstract = "In recent years, DRAM-based main memories have become
susceptible to the Row Hammer (RH) problem, which
causes bits to flip in a row without accessing them
directly. Frequent activation of a row, called an
aggressor row, causes its adjacent rows' (victim) bits
to flip. The state-of-the-art solution is to refresh
the victim rows explicitly to prevent bit flipping.
There have been several proposals made to detect RH
attacks. These include both probabilistic as well as
deterministic counter-based methods. The technique of
handling RH attacks, however, remains the same. In this
work, we propose an efficient technique for handling
the RH problem. We show that the mechanism is agnostic
of the detection mechanism. Our RH handling technique
omits the necessity of refreshing the victim rows.
Instead, we use a small non-volatile Spin-Transfer
Torque Magnetic Random Access Memory (STTRAM) that
ensures no unnecessary refreshes of the victim rows on
the DRAM device and thus allowing more time for normal
applications in the same DRAM device. Our model relies
on the migration of the aggressor rows. This accounts
for removing blocking of the DRAM operations due to the
refreshing of victim rows incurred in the previous
solution. After extensive evaluation, we found that,
compared to the conventional RH mitigation techniques,
our model minimizes the blocking time of the memory
that is imposed due to explicit refreshing by an
average of 80.72\% in the worst-case scenario and
provides energy savings of about 15.82\% on average,
across different types of RH-based workloads. A lookup
table is necessary to pinpoint the location of a
particular row, which, when combined with the STTMRAM,
limits the storage overhead to 0.39\% of a 2 GB DRAM.
Our proposed model prevents repeated refreshing of the
same victim rows in different refreshing windows on the
DRAM device and leads to an efficient RH handling
technique.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "40",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Proficz:2021:AGA,
author = "Jerzy Proficz",
title = "All-gather Algorithms Resilient to Imbalanced Process
Arrival Patterns",
journal = j-TACO,
volume = "18",
number = "4",
pages = "41:1--41:22",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3460122",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3460122",
abstract = "Two novel algorithms for the all-gather operation
resilient to imbalanced process arrival patterns (PATs)
are presented. The first one, Background Disseminated
Ring (BDR), is based on the regular parallel ring
algorithm often supplied in MPI implementations and
exploits an auxiliary background thread for early data
exchange from faster processes to accelerate the
performed all-gather operation. The other algorithm,
Background Sorted Linear synchronized tree with
Broadcast (BSLB), is built upon the already existing
PAP-aware gather algorithm, that is, Background Sorted
Linear Synchronized tree (BSLS), followed by a regular
broadcast distributing gathered data to all
participating processes. The background of the
imbalanced PAP subject is described, along with the PAP
monitoring and evaluation topics. An experimental
evaluation of the algorithms based on a proposed
mini-benchmark is presented. The mini-benchmark was
performed over 2,000 times in a typical HPC cluster
architecture with homogeneous compute nodes. The
obtained results are analyzed according to different
PATs, data sizes, and process numbers, showing that the
proposed optimization works well for various
configurations, is scalable, and can significantly
reduce the all-gather elapsed times, in our case, up to
factor 1.9 or 47\% in comparison with the best
state-of-the-art solution.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "41",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xu:2021:CMD,
author = "Rui Xu and Sheng Ma and Yaohua Wang and Xinhai Chen
and Yang Guo",
title = "Configurable Multi-directional Systolic Array
Architecture for Convolutional Neural Networks",
journal = j-TACO,
volume = "18",
number = "4",
pages = "42:1--42:24",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3460776",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3460776",
abstract = "The systolic array architecture is one of the most
popular choices for convolutional neural network
hardware accelerators. The biggest advantage of the
systolic array architecture is its simple and efficient
design principle. Without complicated control
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "42",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Seo:2021:SAI,
author = "Wonik Seo and Sanghoon Cha and Yeonjae Kim and Jaehyuk
Huh and Jongse Park",
title = "{SLO}-Aware Inference Scheduler for Heterogeneous
Processors in Edge Platforms",
journal = j-TACO,
volume = "18",
number = "4",
pages = "43:1--43:26",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3460352",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3460352",
abstract = "With the proliferation of applications with machine
learning (ML), the importance of edge platforms has
been growing to process streaming sensor, data locally
without resorting to remote servers. Such edge
platforms are commonly equipped with \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "43",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Qureshi:2021:GXM,
author = "Yasir Mahmood Qureshi and William Andrew Simon and
Marina Zapater and Katzalin Olcoz and David Atienza",
title = "{Gem5-X}: a Many-core Heterogeneous Simulation
Platform for Architectural Exploration and
Optimization",
journal = j-TACO,
volume = "18",
number = "4",
pages = "44:1--44:27",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3461662",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3461662",
abstract = "The increasing adoption of smart systems in our daily
life has led to the development of new applications
with varying performance and energy constraints, and
suitable computing architectures need to be developed
for these new applications. In this \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "44",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jung:2021:PPB,
author = "Tina Jung and Fabian Ritter and Sebastian Hack",
title = "{PICO}: a {Presburger} In-bounds Check Optimization
for Compiler-based Memory Safety Instrumentations",
journal = j-TACO,
volume = "18",
number = "4",
pages = "45:1--45:27",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3460434",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3460434",
abstract = "Memory safety violations such as buffer overflows are
a threat to security to this day. A common solution to
ensure memory safety for C is code instrumentation.
However, this often causes high execution-time overhead
and is therefore rarely used in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "45",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sha:2021:LIA,
author = "Zhibing Sha and Jun Li and Lihao Song and Jiewen Tang
and Min Huang and Zhigang Cai and Lianju Qian and
Jianwei Liao and Zhiming Liu",
title = "Low {I/O} Intensity-aware Partial {GC} Scheduling to
Reduce Long-tail Latency in {SSDs}",
journal = j-TACO,
volume = "18",
number = "4",
pages = "46:1--46:25",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3460433",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3460433",
abstract = "This article proposes a low I/O intensity-aware
scheduling scheme on garbage collection (GC) in SSDs
for minimizing the I/O long-tail latency to ensure I/O
responsiveness. The basic idea is to assemble partial
GC operations by referring to several \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "46",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Alam:2021:LPL,
author = "Syed Asad Alam and James Garland and David Gregg",
title = "Low-precision Logarithmic Number Systems: Beyond
Base-2",
journal = j-TACO,
volume = "18",
number = "4",
pages = "47:1--47:25",
month = dec,
year = "2021",
CODEN = "????",
DOI = "",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3461699",
abstract = "Logarithmic number systems (LNS) are used to represent
real numbers in many applications using a constant base
raised to a fixed-point exponent making its
distribution exponential. This greatly simplifies
hardware multiply, divide, and square root. LNS with
base-2 is most common, but in this article, we show
that for low-precision LNS the choice of base has a
significant impact.\par
We make four main contributions. First, LNS is not
closed under addition and subtraction, so the result is
approximate. We show that choosing a suitable base can
manipulate the distribution to reduce the average
error. Second, we show that low-precision LNS addition
and subtraction can be implemented efficiently in logic
rather than commonly used ROM lookup tables, the
complexity of which can be reduced by an appropriate
choice of base. A similar effect is shown where the
result of arithmetic has greater precision than the
input.Third, where input data from external sources is
not expected to be in LNS, we can reduce the conversion
error by selecting a LNS base to match the expected
distribution of the input. Thus, there is no one base
that gives the global optimum, and base selection is a
trade-off between different factors. Fourth, we show
that circuits realized in LNS require lower area and
power consumption for short word lengths",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "47",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Walden:2021:MIN,
author = "Candace Walden and Devesh Singh and Meenatchi
Jagasivamani and Shang Li and Luyi Kang and Mehdi
Asnaashari and Sylvain Dubois and Bruce Jacob and
Donald Yeung",
title = "Monolithically Integrating Non-Volatile Main Memory
over the Last-Level Cache",
journal = j-TACO,
volume = "18",
number = "4",
pages = "48:1--48:26",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3462632",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3462632",
abstract = "Many emerging non-volatile memories are compatible
with CMOS logic, potentially enabling their integration
into a CPU's die. This article investigates such
monolithically integrated CPU-main memory chips. We
exploit non-volatile memories employing 3D \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "48",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tomei:2021:BSC,
author = "Matthew Tomei and Shomit Das and Mohammad Seyedzadeh
and Philip Bedoukian and Bradford Beckmann and Rakesh
Kumar and David Wood",
title = "Byte-Select Compression",
journal = j-TACO,
volume = "18",
number = "4",
pages = "49:1--49:27",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3462209",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3462209",
abstract = "Cache-block compression is a highly effective
technique for both reducing accesses to lower levels in
the memory hierarchy (cache compression) and minimizing
data transfers (link compression). While many effective
cache-block compression algorithms have been proposed,
the design of these algorithms is largely ad hoc and
manual and relies on human recognition of patterns. In
this article, we take an entirely different approach.
We introduce a class of ``byte-select'' compression
algorithms, as well as an automated methodology for
generating compression algorithms in this class. We
argue that, based on upper bounds within the class, the
study of this class of byte-select algorithms has
potential to yield algorithms with better performance
than existing cache-block compression algorithms. The
upper bound we establish on the compression ratio is 2X
that of any existing algorithm. We then offer a
generalized representation of a subset of byte-select
compression algorithms and search through the resulting
space guided by a set of training data traces. Using
this automated process, we find efficient and effective
algorithms for various hardware applications. We find
that the resulting algorithms exploit novel patterns
that can inform future algorithm designs. The generated
byte-select algorithms are evaluated against a separate
set of traces and evaluations show that Byte-Select has
a 23\% higher compression ratio on average. While no
previous algorithm performs best for all our data sets
which include CPU and GPU applications, our generated
algorithms do. Using an automated hardware generator
for these algorithms, we show that their decompression
and compression latency is one and two cycles
respectively, much lower than any existing algorithm
with a competitive compression ratio.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "49",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2021:CHC,
author = "Cunlu Li and Dezun Dong and Shazhou Yang and Xiangke
Liao and Guangyu Sun and Yongheng Liu",
title = "{CIB-HIER}: Centralized Input Buffer Design in
Hierarchical High-radix Routers",
journal = j-TACO,
volume = "18",
number = "4",
pages = "50:1--50:21",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3468062",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3468062",
abstract = "Hierarchical organization is widely used in high-radix
routers to enable efficient scaling to higher switch
port count. A general-purpose hierarchical router must
be symmetrically designed with the same input buffer
depth, resulting in a large amount of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "50",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gysi:2021:DSM,
author = "Tobias Gysi and Christoph M{\"u}ller and Oleksandr
Zinenko and Stephan Herhut and Eddie Davis and Tobias
Wicky and Oliver Fuhrer and Torsten Hoefler and Tobias
Grosser",
title = "Domain-Specific Multi-Level {IR} Rewriting for {GPU}:
The {Open Earth} Compiler for {GPU}-accelerated Climate
Simulation",
journal = j-TACO,
volume = "18",
number = "4",
pages = "51:1--51:23",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3469030",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3469030",
abstract = "Most compilers have a single core intermediate
representation (IR) (e.g., LLVM) sometimes complemented
with vaguely defined IR-like data structures. This IR
is commonly low-level and close to machine
instructions. As a result, optimizations relying on
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "51",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zou:2021:SLE,
author = "An Zou and Huifeng Zhu and Jingwen Leng and Xin He and
Vijay Janapa Reddi and Christopher D. Gill and Xuan
Zhang",
title = "System-level Early-stage Modeling and Evaluation of
{IVR}-assisted Processor Power Delivery System",
journal = j-TACO,
volume = "18",
number = "4",
pages = "52:1--52:27",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3468145",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3468145",
abstract = "Despite being employed in numerous efforts to improve
power delivery efficiency, the integrated voltage
regulator (IVR) approach has yet to be evaluated
rigorously and quantitatively in a full power delivery
system (PDS) setting. To fulfill this need, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Manocha:2021:GOD,
author = "Aninda Manocha and Tyler Sorensen and Esin Tureci and
Opeoluwa Matthews and Juan L. Arag{\'o}n and Margaret
Martonosi",
title = "{GraphAttack}: Optimizing Data Supply for Graph
Applications on In-Order Multicore Architectures",
journal = j-TACO,
volume = "18",
number = "4",
pages = "53:1--53:26",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3469846",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3469846",
abstract = "Graph structures are a natural representation of
important and pervasive data. While graph applications
have significant parallelism, their characteristic
pointer indirect loads to neighbor data hinder
scalability to large datasets on multicore systems.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "53",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Benz:2021:SAP,
author = "Joscha Benz and Oliver Bringmann",
title = "Scenario-Aware Program Specialization for Timing
Predictability",
journal = j-TACO,
volume = "18",
number = "4",
pages = "54:1--54:26",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3473333",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3473333",
abstract = "The successful application of static program analysis
strongly depends on flow facts of a program such as
loop bounds, control-flow constraints, and operating
modes. This problem heavily affects the design of
real-time systems, since static program \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "54",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chakraborty:2021:WGC,
author = "Shounak Chakraborty and Magnus Sj{\"a}lander",
title = "{WaFFLe}: Gated Cache-Ways with Per-Core Fine-Grained
{DVFS} for Reduced On-Chip Temperature and Leakage
Consumption",
journal = j-TACO,
volume = "18",
number = "4",
pages = "55:1--55:25",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3471908",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3471908",
abstract = "Managing thermal imbalance in contemporary chip
multi-processors (CMPs) is crucial in assuring
functional correctness of modern mobile as well as
server systems. Localized regions with high activity,
e.g., register files, ALUs, FPUs, and so on, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Srikanth:2021:SIC,
author = "Sriseshan Srikanth and Anirudh Jain and Thomas M.
Conte and Erik P. Debenedictis and Jeanine Cook",
title = "{SortCache}: Intelligent Cache Management for
Accelerating Sparse Data Workloads",
journal = j-TACO,
volume = "18",
number = "4",
pages = "56:1--56:24",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3473332",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3473332",
abstract = "Sparse data applications have irregular access
patterns that stymie modern memory architectures.
Although hyper-sparse workloads have received
considerable attention in the past, moderately-sparse
workloads prevalent in machine learning applications,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "56",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Metzger:2021:DHT,
author = "Paul Metzger and Volker Seeker and Christian Fensch
and Murray Cole",
title = "Device Hopping: Transparent Mid-Kernel Runtime
Switching for Heterogeneous Systems",
journal = j-TACO,
volume = "18",
number = "4",
pages = "57:1--57:25",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3471909",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3471909",
abstract = "Existing OS techniques for homogeneous many-core
systems make it simple for single and multithreaded
applications to migrate between cores. Heterogeneous
systems do not benefit so fully from this flexibility,
and applications that cannot migrate in mid-.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "57",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2021:LED,
author = "Yu Zhang and Da Peng and Xiaofei Liao and Hai Jin and
Haikun Liu and Lin Gu and Bingsheng He",
title = "{LargeGraph}: an Efficient Dependency-Aware
{GPU}-Accelerated Large-Scale Graph Processing",
journal = j-TACO,
volume = "18",
number = "4",
pages = "58:1--58:24",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3477603",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3477603",
abstract = "Many out-of-GPU-memory systems are recently designed
to support iterative processing of large-scale graphs.
However, these systems still suffer from long time to
converge because of inefficient propagation of active
vertices' new states along graph \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "58",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cilasun:2021:SNN,
author = "H{\"u}srev Cilasun and Salonik Resch and Zamshed I.
Chowdhury and Erin Olson and Masoud Zabihi and
Zhengyang Zhao and Thomas Peterson and Keshab K. Parhi
and Jian-Ping Wang and Sachin S. Sapatnekar and Ulya R.
Karpuzcu",
title = "Spiking Neural Networks in Spintronic Computational
{RAM}",
journal = j-TACO,
volume = "18",
number = "4",
pages = "59:1--59:21",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3475963",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3475963",
abstract = "Spiking Neural Networks (SNNs) represent a
biologically inspired computation model capable of
emulating neural computation in human brain and
brain-like structures. The main promise is very low
energy consumption. Classic Von Neumann architecture
based \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "59",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ukarande:2022:LAC,
author = "Aditya Ukarande and Suryakant Patidar and Ram Rangan",
title = "Locality-Aware {CTA} Scheduling for Gaming
Applications",
journal = j-TACO,
volume = "19",
number = "1",
pages = "1:1--1:26",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3477497",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 18 06:51:06 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3477497",
abstract = "The compute work rasterizer or the GigaThread Engine
of a modern NVIDIA GPU focuses on maximizing compute
work occupancy across all streaming multiprocessors in
a GPU while retaining design simplicity. In this
article, we identify the operational aspects \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Liu:2022:ICO,
author = "Hongzhi Liu and Jie Luo and Ying Li and Zhonghai Wu",
title = "Iterative Compilation Optimization Based on Metric
Learning and Collaborative Filtering",
journal = j-TACO,
volume = "19",
number = "1",
pages = "2:1--2:25",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3480250",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 18 06:51:06 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3480250",
abstract = "Pass selection and phase ordering are two critical
compiler auto-tuning problems. Traditional heuristic
methods cannot effectively address these NP-hard
problems especially given the increasing number of
compiler passes and diverse hardware architectures.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sasongko:2022:RFY,
author = "Muhammad Aditya Sasongko and Milind Chabbi and Mandana
Bagheri Marzijarani and Didem Unat",
title = "{ReuseTracker}: Fast Yet Accurate Multicore Reuse
Distance Analyzer",
journal = j-TACO,
volume = "19",
number = "1",
pages = "3:1--3:25",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3484199",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 18 06:51:06 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3484199",
abstract = "One widely used metric that measures data locality is
reuse distance -the number of unique memory locations
that are accessed between two consecutive accesses to a
particular memory location. State-of-the-art techniques
that measure reuse distance in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fu:2022:GDS,
author = "Yaosheng Fu and Evgeny Bolotin and Niladrish
Chatterjee and David Nellans and Stephen W. Keckler",
title = "{GPU} Domain Specialization via Composable On-Package
Architecture",
journal = j-TACO,
volume = "19",
number = "1",
pages = "4:1--4:23",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3484505",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 18 06:51:06 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3484505",
abstract = "As GPUs scale their low-precision matrix math
throughput to boost deep learning (DL) performance,
they upset the balance between math throughput and
memory system capabilities. We demonstrate that a
converged GPU design trying to address diverging
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lee:2022:SBC,
author = "Daeyeal Lee and Bill Lin and Chung-Kuan Cheng",
title = "{SMT}-Based Contention-Free Task Mapping and
Scheduling on {$2$D\slash $3$D SMART NoC} with Mixed
Dimension-Order Routing",
journal = j-TACO,
volume = "19",
number = "1",
pages = "5:1--5:21",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3487018",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 18 06:51:06 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3487018",
abstract = "SMART NoCs achieve ultra-low latency by enabling
single-cycle multiple-hop transmission via bypass
channels. However, contention along bypass channels can
seriously degrade the performance of SMART NoCs by
breaking the bypass paths. Therefore, contention-.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chatarasi:2022:MDC,
author = "Prasanth Chatarasi and Hyoukjun Kwon and Angshuman
Parashar and Michael Pellauer and Tushar Krishna and
Vivek Sarkar",
title = "{Marvel}: a Data-Centric Approach for Mapping Deep
Learning Operators on Spatial Accelerators",
journal = j-TACO,
volume = "19",
number = "1",
pages = "6:1--6:26",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3485137",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 18 06:51:06 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3485137",
abstract = "A spatial accelerator's efficiency depends heavily on
both its mapper and cost models to generate optimized
mappings for various operators of DNN models. However,
existing cost models lack a formal boundary over their
input programs (operators) for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Rieber:2022:JPL,
author = "Dennis Rieber and Axel Acosta and Holger Fr{\"o}ning",
title = "Joint Program and Layout Transformations to Enable
Convolutional Operators on Specialized Hardware Based
on Constraint Programming",
journal = j-TACO,
volume = "19",
number = "1",
pages = "7:1--7:26",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3487922",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 18 06:51:06 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3487922",
abstract = "The success of Deep Artificial Neural Networks (DNNs)
in many domains created a rich body of research
concerned with hardware accelerators for
compute-intensive DNN operators. However, implementing
such operators efficiently with complex hardware
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lei:2022:SEW,
author = "Mengya Lei and Fan Li and Fang Wang and Dan Feng and
Xiaomin Zou and Renzhi Xiao",
title = "{SecNVM}: an Efficient and Write-Friendly Metadata
Crash Consistency Scheme for Secure {NVM}",
journal = j-TACO,
volume = "19",
number = "1",
pages = "8:1--8:26",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3488724",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 18 06:51:06 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3488724",
abstract = "Data security is an indispensable part of non-volatile
memory (NVM) systems. However, implementing data
security efficiently on NVM is challenging, since we
have to guarantee the consistency of user data and the
related security metadata. Existing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Di:2022:TPM,
author = "Bang Di and Daokun Hu and Zhen Xie and Jianhua Sun and
Hao Chen and Jinkui Ren and Dong Li",
title = "{TLB}-pilot: Mitigating {TLB} Contention Attack on
{GPUs} with Microarchitecture-Aware Scheduling",
journal = j-TACO,
volume = "19",
number = "1",
pages = "9:1--9:23",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3491218",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 18 06:51:06 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3491218",
abstract = "Co-running GPU kernels on a single GPU can provide
high system throughput and improve hardware
utilization, but this raises concerns on application
security. We reveal that translation lookaside buffer
(TLB) attack, one of the common attacks on CPU, can
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Saileshwar:2022:HLC,
author = "Gururaj Saileshwar and Rick Boivie and Tong Chen and
Benjamin Segal and Alper Buyuktosunoglu",
title = "{HeapCheck}: Low-cost Hardware Support for Memory
Safety",
journal = j-TACO,
volume = "19",
number = "1",
pages = "10:1--10:24",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3495152",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 18 06:51:06 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3495152",
abstract = "Programs written in C/C++ are vulnerable to
memory-safety errors like buffer-overflows and
use-after-free. While several mechanisms to detect such
errors have been previously proposed, they suffer from
a variety of drawbacks, including poor performance,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Azhar:2022:TRR,
author = "M. Waqar Azhar and Miquel Peric{\`a}s and Per
Stenstr{\"o}m",
title = "{Task-RM}: a Resource Manager for Energy Reduction in
Task-Parallel Applications under Quality of Service
Constraints",
journal = j-TACO,
volume = "19",
number = "1",
pages = "11:1--11:26",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3494537",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 18 06:51:06 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3494537",
abstract = "Improving energy efficiency is an important goal of
computer system design. This article focuses on a
general model of task-parallel applications under
quality-of-service requirements on the completion time.
Our technique, called Task-RM, exploits the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gomes:2022:CCA,
author = "Cesar Gomes and Maziar Amiraski and Mark Hempstead",
title = "{CASHT}: Contention Analysis in Shared Hierarchies
with Thefts",
journal = j-TACO,
volume = "19",
number = "1",
pages = "12:1--12:27",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3494538",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 18 06:51:06 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3494538",
abstract = "Cache management policies should consider workloads'
contention behavior when managing a shared cache. Prior
art makes estimates about shared cache behavior by
adding extra logic or time to isolate per workload
cache statistics. These approaches provide \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2022:OSS,
author = "Yufei Wang and Xiaoshe Dong and Longxiang Wang and
Weiduo Chen and Xingjun Zhang",
title = "Optimizing Small-Sample Disk Fault Detection Based on
{LSTM-GAN} Model",
journal = j-TACO,
volume = "19",
number = "1",
pages = "13:1--13:24",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3500917",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 18 06:51:06 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3500917",
abstract = "In recent years, researches on disk fault detection
based on SMART data combined with different machine
learning algorithms have been proven to be effective.
However, these methods require a large amount of data.
In the early stages of the establishment \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Silfa:2022:BEE,
author = "Franyell Silfa and Jose Maria Arnau and Antonio
Gonz{\'a}lez",
title = "{E-BATCH}: Energy-Efficient and High-Throughput {RNN}
Batching",
journal = j-TACO,
volume = "19",
number = "1",
pages = "14:1--14:23",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3499757",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 18 06:51:06 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3499757",
abstract = "Recurrent Neural Network (RNN) inference exhibits low
hardware utilization due to the strict data
dependencies across time-steps. Batching multiple
requests can increase throughput. However, RNN batching
requires a large amount of padding since the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ding:2022:CCA,
author = "Chen Ding and Dong Chen and Fangzhou Liu and Benjamin
Reber and Wesley Smith",
title = "{CARL}: Compiler Assigned Reference Leasing",
journal = j-TACO,
volume = "19",
number = "1",
pages = "15:1--15:28",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3498730",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 18 06:51:06 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3498730",
abstract = "Data movement is a common performance bottleneck, and
its chief remedy is caching. Traditional cache
management is transparent to the workload: data that
should be kept in cache are determined by the recency
information only, while the program information,.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Schlaak:2022:MAF,
author = "Christof Schlaak and Tzung-Han Juang and Christophe
Dubach",
title = "Memory-Aware Functional {IR} for Higher-Level
Synthesis of Accelerators",
journal = j-TACO,
volume = "19",
number = "2",
pages = "16:1--16:26",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3501768",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 25 07:03:00 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3501768",
abstract = "Specialized accelerators deliver orders of a magnitude
of higher performance than general-purpose processors.
The ever-changing nature of modern workloads is pushing
the adoption of Field Programmable Gate Arrays (FPGAs)
as the substrate of choice. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lakshminarasimhan:2022:FSC,
author = "Kartik Lakshminarasimhan and Ajeya Naithani and
Josu{\'e} Feliu and Lieven Eeckhout",
title = "The Forward Slice Core: a High-Performance, Yet
Low-Complexity Microarchitecture",
journal = j-TACO,
volume = "19",
number = "2",
pages = "17:1--17:25",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3499424",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 25 07:03:00 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3499424",
abstract = "Superscalar out-of-order cores deliver high
performance at the cost of increased complexity and
power budget. In-order cores, in contrast, are less
complex and have a smaller power budget, but offer low
performance. A processor architecture should ideally
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Srikanthan:2022:MMA,
author = "Sharanyan Srikanthan and Sayak Chakraborti and
Princeton Ferro and Sandhya Dwarkadas",
title = "{MAPPER}: Managing Application Performance via
Parallel Efficiency Regulation *",
journal = j-TACO,
volume = "19",
number = "2",
pages = "18:1--18:26",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3501767",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 25 07:03:00 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3501767",
abstract = "State-of-the-art systems, whether in servers or
desktops, provide ample computational and storage
resources to allow multiple simultaneously executing
potentially parallel applications. However, performance
tends to be unpredictable, being a function of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Athanasios:2022:LPN,
author = "Tziouvaras Athanasios and Dimitriou Georgios and
Stamoulis Georgios",
title = "Low-power Near-data Instruction Execution Leveraging
Opcode-based Timing Analysis",
journal = j-TACO,
volume = "19",
number = "2",
pages = "19:1--19:26",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3504005",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 25 07:03:00 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3504005",
abstract = "Traditional processor architectures utilize an
external DRAM for data storage, while they also operate
under worst-case timing constraints. Such designs are
heavily constrained by the delay costs of the data
transfer between the core pipeline and the DRAM,.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jia:2022:GND,
author = "Xingguo Jia and Jin Zhang and Boshi Yu and Xingyue
Qian and Zhengwei Qi and Haibing Guan",
title = "{GiantVM}: a Novel Distributed Hypervisor for Resource
Aggregation with {DSM-aware} Optimizations",
journal = j-TACO,
volume = "19",
number = "2",
pages = "20:1--20:27",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3505251",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 25 07:03:00 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3505251",
abstract = "We present GiantVM, an open-source distributed
hypervisor that provides the many-to-one virtualization
to aggregate resources from multiple physical machines.
We propose techniques to enable distributed CPU and I/O
virtualization and distributed shared \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Nejat:2022:CSM,
author = "Mehrzad Nejat and Madhavan Manivannan and Miquel
Peric{\`a}s and Per Stenstr{\"o}m",
title = "Cooperative Slack Management: Saving Energy of
Multicore Processors by Trading Performance Slack
Between {QoS}-Constrained Applications",
journal = j-TACO,
volume = "19",
number = "2",
pages = "21:1--21:27",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3505559",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 25 07:03:00 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3505559",
abstract = "Processor resources can be adapted at runtime
according to the dynamic behavior of applications to
reduce the energy consumption of multicore processors
without affecting the Quality-of-Service (QoS). To
achieve this, an online resource management scheme
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pompougnac:2022:WSR,
author = "Hugo Pompougnac and Ulysse Beaugnon and Albert Cohen
and Dumitru Potop Butucaru",
title = "Weaving Synchronous Reactions into the Fabric of
{SSA}-form Compilers",
journal = j-TACO,
volume = "19",
number = "2",
pages = "22:1--22:25",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3506706",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 25 07:03:00 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3506706",
abstract = "We investigate the programming of reactive systems
combining closed-loop control with
performance-intensive components such as Machine
Learning (ML). Reactive control systems are often
safety-critical and associated with real-time execution
requirements, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "22",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Shobaki:2022:RPA,
author = "Ghassan Shobaki and Vahl Scott Gordon and Paul McHugh
and Theodore Dubois and Austin Kerbow",
title = "Register-Pressure-Aware Instruction Scheduling Using
Ant Colony Optimization",
journal = j-TACO,
volume = "19",
number = "2",
pages = "23:1--23:23",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3505558",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 25 07:03:00 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3505558",
abstract = "This paper describes a new approach to
register-pressure-aware instruction scheduling, using
Ant Colony Optimization (ACO). ACO is a nature-inspired
optimization technique that researchers have
successfully applied to NP-hard sequencing problems
like the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2022:MOG,
author = "Qihan Wang and Zhen Peng and Bin Ren and Jie Chen and
Robert G. Edwards",
title = "{MemHC}: an Optimized {GPU} Memory Management
Framework for Accelerating Many-body Correlation",
journal = j-TACO,
volume = "19",
number = "2",
pages = "24:1--24:26",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3506705",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 25 07:03:00 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3506705",
abstract = "The many-body correlation function is a fundamental
computation kernel in modern physics computing
applications, e.g., Hadron Contractions in Lattice
quantum chromodynamics (QCD). This kernel is both
computation and memory intensive, involving a series of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kumar:2022:DAS,
author = "Rakesh Kumar and Mehdi Alipour and David
Black-Schaffer",
title = "Dependence-aware Slice Execution to Boost {MLP} in
Slice-out-of-order Cores",
journal = j-TACO,
volume = "19",
number = "2",
pages = "25:1--25:28",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3506704",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 25 07:03:00 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3506704",
abstract = "Exploiting memory-level parallelism (MLP) is crucial
to hide long memory and last-level cache access
latencies. While out-of-order (OoO) cores, and
techniques building on them, are effective at
exploiting MLP, they deliver poor energy efficiency due
to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "25",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Vijaykumar:2022:MPO,
author = "Nandita Vijaykumar and Ataberk Olgun and Konstantinos
Kanellopoulos and F. Nisa Bostanci and Hasan Hassan and
Mehrshad Lotfi and Phillip B. Gibbons and Onur Mutlu",
title = "\pkg{MetaSys}: a Practical Open-source Metadata
Management System to Implement and Evaluate Cross-layer
Optimizations",
journal = j-TACO,
volume = "19",
number = "2",
pages = "26:1--26:29",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3505250",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 25 07:03:00 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/gnu.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3505250",
abstract = "This article introduces the first open-source
FPGA-based infrastructure, MetaSys, with a prototype in
a RISC-V system, to enable the rapid implementation and
evaluation of a wide range of cross-layer techniques in
real hardware. Hardware-software \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "26",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2022:EEE,
author = "Jing Chen and Madhavan Manivannan and Mustafa
Abduljabbar and Miquel Peric{\`a}s",
title = "\pkg{ERASE}: Energy Efficient Task Mapping and
Resource Management for Work Stealing Runtimes",
journal = j-TACO,
volume = "19",
number = "2",
pages = "27:1--27:29",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3510422",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 25 07:03:00 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3510422",
abstract = "Parallel applications often rely on work stealing
schedulers in combination with fine-grained tasking to
achieve high performance and scalability. However,
reducing the total energy consumption in the context of
work stealing runtimes is still challenging,.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ye:2022:PAU,
author = "Chencheng Ye and Yuanchao Xu and Xipeng Shen and Hai
Jin and Xiaofei Liao and Yan Solihin",
title = "Preserving Addressability Upon {GC}-Triggered Data
Movements on Non-Volatile Memory",
journal = j-TACO,
volume = "19",
number = "2",
pages = "28:1--28:26",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3511706",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 25 07:03:00 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3511706",
abstract = "This article points out an important threat that
application-level Garbage Collection (GC) creates to
the use of non-volatile memory (NVM). Data movements
incurred by GC may invalidate the pointers to objects
on NVM and, hence, harm the reusability of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "28",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Michelogiannakis:2022:CIR,
author = "George Michelogiannakis and Benjamin Klenk and Brandon
Cook and Min Yee Teh and Madeleine Glick and Larry
Dennison and Keren Bergman and John Shalf",
title = "A Case For Intra-rack Resource Disaggregation in
{HPC}",
journal = j-TACO,
volume = "19",
number = "2",
pages = "29:1--29:26",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3514245",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 25 07:03:00 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3514245",
abstract = "The expected halt of traditional technology scaling is
motivating increased heterogeneity in high-performance
computing (HPC) systems with the emergence of numerous
specialized accelerators. As heterogeneity increases,
so does the risk of underutilizing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "29",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2022:SMS,
author = "Ping Wang and Fei Wen and Paul V. Gratz and Alex
Sprintson",
title = "{SIMD-Matcher}: a {SIMD}-based Arbitrary Matching
Framework",
journal = j-TACO,
volume = "19",
number = "3",
pages = "30:1--30:20",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3514246",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Sep 2 10:07:01 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3514246",
abstract = "Packet classification methods rely upon matching
packet content/header against pre-defined rules, which
are generated by network applications and their
configurations. With the rapid development of network
technology and the fast-growing network \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "30",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mettler:2022:FBA,
author = "Marcel Mettler and Martin Rapp and Heba Khdr and
Daniel Mueller-Gritschneder and J{\"o}rg Henkel and Ulf
Schlichtmann",
title = "An {FPGA}-based Approach to Evaluate Thermal and
Resource Management Strategies of Many-core
Processors",
journal = j-TACO,
volume = "19",
number = "3",
pages = "31:1--31:24",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3516825",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Sep 2 10:07:01 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3516825",
abstract = "The continuous technology scaling of integrated
circuits results in increasingly higher power densities
and operating temperatures. Hence, modern many-core
processors require sophisticated thermal and resource
management strategies to mitigate these \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "31",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mpeis:2022:OIC,
author = "Paschalis Mpeis and Pavlos Petoumenos and Kim
Hazelwood and Hugh Leather",
title = "Object Intersection Captures on Interactive Apps to
Drive a Crowd-sourced Replay-based Compiler
Optimization",
journal = j-TACO,
volume = "19",
number = "3",
pages = "32:1--32:25",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3517338",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Sep 2 10:07:01 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3517338",
abstract = "Traditional offline optimization frameworks rely on
representative hardware, software, and inputs to
compare different optimizations on. With
application-specific optimization for mobile systems
though, the idea of a representative testbench is
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "32",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2022:MRM,
author = "Cunlu Li and Dezun Dong and Xiangke Liao",
title = "{MUA-Router}: Maximizing the Utility-of-Allocation for
On-chip Pipelining Routers",
journal = j-TACO,
volume = "19",
number = "3",
pages = "33:1--33:23",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3519027",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Sep 2 10:07:01 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3519027",
abstract = "As an important pipeline stage in the router of
Network-on-Chips, switch allocation assigns output
ports to input ports and allows flits to transit
through the switch without conflicts. Previous work
designed efficient switch allocation strategies by
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "33",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Choudhury:2022:FOC,
author = "Ziaul Choudhury and Shashwat Shrivastava and Lavanya
Ramapantulu and Suresh Purini",
title = "An {FPGA} Overlay for {CNN} Inference with
Fine-grained Flexible Parallelism",
journal = j-TACO,
volume = "19",
number = "3",
pages = "34:1--34:26",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3519598",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Sep 2 10:07:01 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3519598",
abstract = "Increasingly, pre-trained convolutional neural
networks (CNNs) are being deployed for inference in
various computer vision applications, both on the
server-side in the data centers and at the edge. CNN
inference is a very compute-intensive task. It is a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "34",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Moolchandani:2022:PPP,
author = "Diksha Moolchandani and Anshul Kumar and Smruti R.
Sarangi",
title = "Performance and Power Prediction for Concurrent
Execution on {GPUs}",
journal = j-TACO,
volume = "19",
number = "3",
pages = "35:1--35:27",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3522712",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Sep 2 10:07:01 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3522712",
abstract = "The unprecedented growth of edge computing and 5G has
led to an increased offloading of mobile applications
to cloud servers or edge cloudlets. The most prominent
workloads comprise computer vision applications.
Conventional wisdom suggests that computer \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "35",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jahanshahi:2022:PQA,
author = "Ali Jahanshahi and Nanpeng Yu and Daniel Wong",
title = "{PowerMorph}: {QoS}-Aware Server Power Reshaping for
Data Center Regulation Service",
journal = j-TACO,
volume = "19",
number = "3",
pages = "36:1--36:27",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3524129",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Sep 2 10:07:01 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3524129",
abstract = "Adoption of renewable energy in power grids introduces
stability challenges in regulating the operation
frequency of the electricity grid. Thus, electrical
grid operators call for provisioning of frequency
regulation services from end-user customers, such
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "36",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xu:2022:BFE,
author = "Peng Xu and Nannan Zhao and Jiguang Wan and Wei Liu
and Shuning Chen and Yuanhui Zhou and Hadeel Albahar
and Hanyang Liu and Liu Tang and Zhihu Tan",
title = "Building a Fast and Efficient {LSM}-tree Store by
Integrating Local Storage with Cloud Storage",
journal = j-TACO,
volume = "19",
number = "3",
pages = "37:1--37:26",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3527452",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Sep 2 10:07:01 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3527452",
abstract = "The explosive growth of modern web-scale applications
has made cost-effectiveness a primary design goal for
their underlying databases. As a backbone of modern
databases, LSM-tree based key-value stores (LSM store)
face limited storage options. They are \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "37",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Huang:2022:AVC,
author = "Horng-Ruey Huang and Ding-Yong Hong and Jan-Jan Wu and
Kung-Fu Chen and Pangfeng Liu and Wei-Chung Hsu",
title = "Accelerating Video Captioning on Heterogeneous System
Architectures",
journal = j-TACO,
volume = "19",
number = "3",
pages = "38:1--38:25",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3527609",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Sep 2 10:07:01 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3527609",
abstract = "Video captioning is a core technology to many
important applications, such as AI-assisted medical
diagnosis, video question answering, storytelling
through videos, and lip-reading. Video captioning
employs a hybrid CNN + RNN model. Accelerating such a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "38",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Corbalan-Navarro:2022:TDO,
author = "David Corbal{\'a}n-Navarro and Juan L. Arag{\'o}n and
Mart{\'\i} Anglada and Joan-Manuel Parcerisa and
Antonio Gonz{\'a}lez",
title = "Triangle Dropping: an Occluded-geometry Predictor for
Energy-efficient Mobile {GPUs}",
journal = j-TACO,
volume = "19",
number = "3",
pages = "39:1--39:20",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3527861",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Sep 2 10:07:01 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3527861",
abstract = "This article proposes a novel micro-architecture
approach for mobile GPUs aimed at early removing the
occluded geometry in a scene by leveraging
frame-to-frame coherence, thus reducing the overall
energy consumption. Mobile GPUs commonly implement a
Tile-. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kundan:2022:PAP,
author = "Shivam Kundan and Theodoros Marinakis and Iraklis
Anagnostopoulos and Dimitri Kagaris",
title = "A Pressure-Aware Policy for Contention Minimization on
Multicore Systems",
journal = j-TACO,
volume = "19",
number = "3",
pages = "40:1--40:26",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3524616",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Sep 2 10:07:01 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3524616",
abstract = "Modern Chip Multiprocessors (CMPs) are integrating an
increasing amount of cores to address the continually
growing demand for high-application performance. The
cores of a CMP share several components of the memory
hierarchy, such as Last-Level Cache (LLC). \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "40",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Alsop:2022:CFG,
author = "Johnathan Alsop and Weon Taek Na and Matthew D.
Sinclair and Samuel Grayson and Sarita Adve",
title = "A Case for Fine-grain Coherence Specialization in
Heterogeneous Systems",
journal = j-TACO,
volume = "19",
number = "3",
pages = "41:1--41:26",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3530819",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Sep 2 10:07:01 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3530819",
abstract = "Hardware specialization is becoming a key enabler of
energy-efficient performance. Future systems will be
increasingly heterogeneous, integrating multiple
specialized and programmable accelerators, each with
different memory demands. Traditionally, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "41",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Soltaniyeh:2022:ASC,
author = "Mohammadreza Soltaniyeh and Richard P. Martin and
Santosh Nagarakatte",
title = "An Accelerator for Sparse Convolutional Neural
Networks Leveraging Systolic General Matrix--matrix
Multiplication",
journal = j-TACO,
volume = "19",
number = "3",
pages = "42:1--42:26",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3532863",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Sep 2 10:07:01 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3532863",
abstract = "This article proposes a novel hardware accelerator for
the inference task with sparse convolutional neural
networks (CNNs) by building a hardware unit to perform
Image to Column (Im2Col) transformation of the input
feature map coupled with a systolic-. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "42",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Dang:2022:LAP,
author = "Dharanidhar Dang and Bill Lin and Debashis Sahoo",
title = "{LiteCON}: an All-photonic Neuromorphic Accelerator
for Energy-efficient Deep Learning",
journal = j-TACO,
volume = "19",
number = "3",
pages = "43:1--43:22",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3531226",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Sep 2 10:07:01 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3531226",
abstract = "Deep learning is highly pervasive in today's
data-intensive era. In particular, convolutional neural
networks (CNNs) are being widely adopted in a variety
of fields for superior accuracy. However, computing
deep CNNs on traditional CPUs and GPUs brings
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "43",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Siddhu:2022:CII,
author = "Lokesh Siddhu and Rajesh Kedia and Shailja Pandey and
Martin Rapp and Anuj Pathania and J{\"o}rg Henkel and
Preeti Ranjan Panda",
title = "{CoMeT}: an Integrated Interval Thermal Simulation
Toolchain for {$2$D}, {2.5D}, and {$3$D}
Processor-Memory Systems",
journal = j-TACO,
volume = "19",
number = "3",
pages = "44:1--44:25",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3532185",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Sep 2 10:07:01 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3532185",
abstract = "Processing cores and the accompanying main memory
working in tandem enable modern processors. Dissipating
heat produced from computation remains a significant
problem for processors. Therefore, the thermal
management of processors continues to be an \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "44",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Olson:2022:OAG,
author = "M. Ben Olson and Brandon Kammerdiener and Michael R.
Jantz and Kshitij A. Doshi and Terry Jones",
title = "Online Application Guidance for Heterogeneous Memory
Systems",
journal = j-TACO,
volume = "19",
number = "3",
pages = "45:1--45:27",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3533855",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Sep 2 10:07:01 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3533855",
abstract = "As scaling of conventional memory devices has stalled,
many high-end computing systems have begun to
incorporate alternative memory technologies to meet
performance goals. Since these technologies present
distinct advantages and tradeoffs compared to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "45",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Honorio:2022:UBE,
author = "Bruno {Chinelato Honorio} and Jo{\~a}o P. L. {De
Carvalho} and Catalina {Munoz Morales} and Alexandro
Baldassin and Guido Araujo",
title = "Using Barrier Elision to Improve Transactional Code
Generation",
journal = j-TACO,
volume = "19",
number = "3",
pages = "46:1--46:23",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3533318",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Sep 2 10:07:01 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3533318",
abstract = "With chip manufacturers such as Intel, IBM, and ARM
offering native support for transactional memory in
their instruction set architectures, memory
transactions are on the verge of being considered a
genuine application tool rather than just an \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "46",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2022:AOM,
author = "Jiansong Li and Xueying Wang and Xiaobing Chen and
Guangli Li and Xiao Dong and Peng Zhao and Xianzhi Yu
and Yongxin Yang and Wei Cao and Lei Liu and Xiaobing
Feng",
title = "An Application-oblivious Memory Scheduling System for
{DNN} Accelerators",
journal = j-TACO,
volume = "19",
number = "4",
pages = "47:1--47:??",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3535355",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 8 06:39:05 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3535355",
abstract = "Deep Neural Networks (DNNs) tend to go deeper and
wider, which poses a significant challenge to the
training of DNNs, due to the limited memory capacity of
DNN \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "47",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Narayan:2022:AOC,
author = "Aditya Narayan and Yvain Thonnart and Pascal Vivet and
Ayse Coskun and Ajay Joshi",
title = "Architecting Optically Controlled Phase Change
Memory",
journal = j-TACO,
volume = "19",
number = "4",
pages = "48:1--48:??",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3533252",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 8 06:39:05 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3533252",
abstract = "Phase Change Memory (PCM) is an attractive candidate
for main memory, as it offers non-volatility and zero
leakage power while providing higher cell densities,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "48",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2022:AAS,
author = "Chao Zhang and Maximilian Bremer and Cy Chan and John
Shalf and Xiaochen Guo",
title = "{ASA}: Accelerating Sparse Accumulation in Column-wise
{SpGEMM}",
journal = j-TACO,
volume = "19",
number = "4",
pages = "49:1--49:??",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3543068",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 8 06:39:05 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3543068",
abstract = "Sparse linear algebra is an important kernel in many
different applications. Among various sparse general
matrix-matrix multiplication (SpGEMM) algorithms,
Gustavson's column-wise SpGEMM has good locality when
reading input matrix and can be easily parallelized by
distributing the computation of different columns of an
output matrix to different processors. However, the
sparse accumulation (SPA) step in column-wise SpGEMM,
which merges partial sums from each of the
multiplications by the row indices, is still a
performance bottleneck. The state-of-the-art software
implementation uses a hash table for partial sum search
in the SPA, which makes SPA the largest contributor to
the execution time of SpGEMM. There are three reasons
that cause the SPA to become the bottleneck: (1) hash
probing requires data-dependent branches that are
difficult for a branch predictor to predict correctly;
(2) the accumulation of partial sum is dependent on the
results of the hash probing, which makes it difficult
to hide the hash probing latency; and (3) hash
collision requires time-consuming linear search and
optimizations to reduce these collisions require an
accurate estimation of the number of non-zeros in each
column of the output matrix.
This work proposes ASA architecture to accelerate the
SPA. ASA overcomes the challenges of SPA by (1)
executing the partial sum search and accumulate with a
single instruction through ISA extension to eliminate
data-dependent branches in hash probing, (2) using a
dedicated on-chip cache to perform the search and
accumulation in a pipelined fashion, (3) relying on the
parallel search capability of a set-associative cache
to reduce search latency, and (4) delaying the merging
of overflowed entries. As a result, ASA achieves an
average of 2.25$ \times $ and 5.05$ \times $ speedup as
compared to the state-of-the-art software
implementation of a Markov clustering application and
its SpGEMM kernel, respectively. As compared to a
state-of-the-art hashing accelerator design, ASA
achieves an average of 1.95$ \times $ speedup in the
SpGEMM kernel.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "49",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bik:2022:CSS,
author = "Aart Bik and Penporn Koanantakool and Tatiana
Shpeisman and Nicolas Vasilache and Bixia Zheng and
Fredrik Kjolstad",
title = "Compiler Support for Sparse Tensor Computations in
{MLIR}",
journal = j-TACO,
volume = "19",
number = "4",
pages = "50:1--50:??",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3544559",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 8 06:39:05 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3544559",
abstract = "Sparse tensors arise in problems in science,
engineering, machine learning, and data analytics.
Programs that operate on such tensors can exploit
sparsity to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "50",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Michaud:2022:HHA,
author = "Pierre Michaud and Anis Peysieux",
title = "{HAIR}: Halving the Area of the Integer Register File
with Odd\slash Even Banking",
journal = j-TACO,
volume = "19",
number = "4",
pages = "51:1--51:??",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3544838",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 8 06:39:05 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3544838",
abstract = "This article proposes a new microarchitectural scheme
for reducing the hardware complexity of the integer
register file of a superscalar processor. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "51",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yousefzadeh:2022:EEM,
author = "Amirreza Yousefzadeh and Jan Stuijt and Martijn Hijdra
and Hsiao-Hsuan Liu and Anteneh Gebregiorgis and
Abhairaj Singh and Said Hamdioui and Francky Catthoor",
title = "Energy-efficient In-Memory Address Calculation",
journal = j-TACO,
volume = "19",
number = "4",
pages = "52:1--52:??",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3546071",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 8 06:39:05 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3546071",
abstract = "Computation-in-Memory (CIM) is an emerging computing
paradigm to address memory bottleneck challenges in
computer architecture. A CIM unit cannot \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{So:2022:EES,
author = "Hwisoo So and Moslem Didehban and Yohan Ko and Aviral
Shrivastava and Kyoungwoo Lee",
title = "{EXPERTISE}: an Effective Software-level Redundant
Multithreading Scheme against Hardware Faults",
journal = j-TACO,
volume = "19",
number = "4",
pages = "53:1--53:??",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3546073",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 8 06:39:05 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3546073",
abstract = "Error resilience is the primary design concern for
safety- and mission-critical applications. Redundant
MultiThreading (RMT) is one of the most promising soft
and hard \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "53",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hartley:2022:JTC,
author = "Tim Hartley and Foivos S. Zakkak and Andy Nisbet and
Christos Kotselidis and Mikel Luj{\'a}n",
title = "Just-In-Time Compilation on {ARM} --- a Closer Look at
Call-Site Code Consistency",
journal = j-TACO,
volume = "19",
number = "4",
pages = "54:1--54:??",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3546568",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 8 06:39:05 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3546568",
abstract = "The increase in computational capability of low-power
Arm architectures has seen them diversify from their
more traditional domain of portable battery powered
devices into data center servers, personal computers,
and even Supercomputers. Thus, managed languages (Java,
Javascript, etc.) that require a managed runtime
environment (MRE) need to be ported to the Arm
architecture, requiring an understanding of different
design tradeoffs.
This article studies how the lack of strong hardware
support for Self Modifying Code (SMC) in low-power
architectures (e.g., absence of cache coherence between
instruction cache and data caches), affects
Just-In-Time (JIT) compilation and runtime behavior in
MREs. Specifically, we focus on the implementation and
treatment of call-sites, that must maintain code
consistency in the face of concurrent execution and
modification to redirect control (patching) by the MRE.
The lack of coherence, is compounded with the maximum
distance (reach of) a call-site can jump to as the
reach is more constrained (smaller distance) in Arm
when compared with Intel/AMD. We present four different
robust implementations for call-sites and discuss their
advantages and disadvantages in the absence of strong
hardware support for SMC. Finally, we evaluate each
approach using a microbenchmark, further evaluating the
best three techniques using three JVM benchmark suites
and the open source MaxineVM showcasing performance
differences up to 12\%. Based on these observations, we
propose extending code-cache partitioning strategies
for JIT compiled code to encourage more efficient local
branching for architectures with limited direct branch
ranges.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "54",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jellum:2022:SSA,
author = "Erling Jellum and Milica Orlandi{\'c} and Edmund
Brekke and Tor Johansen and Torleiv Bryne",
title = "Solving Sparse Assignment Problems on {FPGAs}",
journal = j-TACO,
volume = "19",
number = "4",
pages = "55:1--55:??",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3546072",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 8 06:39:05 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3546072",
abstract = "The assignment problem is a fundamental optimization
problem and a crucial part of many systems. For
example, in multiple object tracking, the assignment
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2022:PEP,
author = "Yuhao Li and Benjamin C. Lee",
title = "{Phronesis}: Efficient Performance Modeling for
High-dimensional Configuration Tuning",
journal = j-TACO,
volume = "19",
number = "4",
pages = "56:1--56:??",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3546868",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 8 06:39:05 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3546868",
abstract = "We present Phronesis, a learning framework for
efficiently modeling the performance of data analytic
workloads as a function of their high-dimensional
software \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "56",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tirumalasetty:2022:RMP,
author = "Chandrahas Tirumalasetty and Chih Chieh Chou and
Narasimha Reddy and Paul Gratz and Ayman Abouelwafa",
title = "Reducing Minor Page Fault Overheads through Enhanced
Page Walker",
journal = j-TACO,
volume = "19",
number = "4",
pages = "57:1--57:??",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3547142",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 8 06:39:05 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3547142",
abstract = "Application virtual memory footprints are growing
rapidly in all systems from servers down to
smartphones. To address this growing demand, system
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "57",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gao:2022:ACM,
author = "Lan Gao and Jing Wang and Weigong Zhang",
title = "Adaptive Contention Management for Fine-Grained
Synchronization on Commodity {GPUs}",
journal = j-TACO,
volume = "19",
number = "4",
pages = "58:1--58:??",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3547301",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 8 06:39:05 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3547301",
abstract = "As more emerging applications are moving to GPUs,
fine-grained synchronization has become imperative.
However, their performance can be severely \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "58",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Han:2022:CEC,
author = "Ruobing Han and Jaewon Lee and Jaewoong Sim and
Hyesoon Kim",
title = "{COX} : Exposing {CUDA} Warp-level Functions to
{CPUs}",
journal = j-TACO,
volume = "19",
number = "4",
pages = "59:1--59:??",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3554736",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 8 06:39:05 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3554736",
abstract = "As CUDA becomes the de facto programming language
among data parallel applications such as
high-performance computing or machine learning
applications, running CUDA on other platforms becomes a
compelling option. Although several efforts have
attempted to support CUDA on devices other than NVIDIA
GPUs, due to extra steps in the translation, the
support is always a few years behind CUDA's latest
features. In particular, the new CUDA programming model
exposes the warp concept in the programming language,
which greatly changes the way the CUDA code should be
mapped to CPU programs. In this article, hierarchical
collapsing that correctly supports CUDA warp-level
functions on CPUs is proposed. To verify hierarchical
collapsing, we build a framework, COX, that supports
executing CUDA source code on the CPU backend. With
hierarchical collapsing, 90\% of kernels in CUDA SDK
samples can be executed on CPUs, much higher than
previous works (68\%). We also evaluate the performance
with benchmarks for real applications and show that
hierarchical collapsing can generate CPU programs with
comparable or even higher performance than previous
projects in general.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "59",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Liu:2022:DAS,
author = "Yiding Liu and Xingyao Zhang and Donglin Zhuang and
Xin Fu and Shuaiwen Song",
title = "{DynamAP}: Architectural Support for Dynamic Graph
Traversal on the Automata Processor",
journal = j-TACO,
volume = "19",
number = "4",
pages = "60:1--60:??",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3556976",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 8 06:39:05 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3556976",
abstract = "Dynamic graph traversals (DGTs) currently are widely
used in many important application domains, especially
in this big-data era that urgently demands \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "60",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zou:2022:PSB,
author = "Changwei Zou and Yaoqing Gao and Jingling Xue",
title = "Practical Software-Based Shadow Stacks on x86-64",
journal = j-TACO,
volume = "19",
number = "4",
pages = "61:1--61:??",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3556977",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 8 06:39:05 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3556977",
abstract = "Control-Flow Integrity (CFI) techniques focus often on
protecting forward edges and assume that backward edges
are protected by shadow stacks. However, software-based
shadow stacks that can provide performance, security,
and compatibility are still hard to obtain, leaving an
important security gap on x86-64. In this article, we
introduce a simple, efficient, and effective parallel
shadow stack design (based on LLVM), FlashStack, for
protecting return addresses in single- and
multi-threaded programs running under 64-bit Linux on
x86-64, with three distinctive features. First, we
introduce a novel dual-prologue approach to enable a
protected function to thwart the TOCTTOU attacks, which
are constructed by Microsoft's red team and lead to the
deprecation of Microsoft's RFG. Second, we design a new
mapping mechanism, Segment+Rsp-S, to allow the parallel
shadow stack to be accessed efficiently while
satisfying the constraints of arch\_prctl() and ASLR in
64-bit Linux. Finally, we introduce a lightweight
inspection mechanism, SideChannel-K, to harden
FlashStack further by detecting entropy-reduction
attacks efficiently and protecting the parallel shadow
stack effectively with a 10-ms shuffling policy. Our
evaluation on SPEC CPU2006, Nginx, and Firefox shows
that FlashStack can provide high performance,
meaningful security, and reasonable compatibility for
server- and client-side programs on x86-64.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "61",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Luinaud:2023:SAD,
author = "Thomas Luinaud and J. M. Pierre Langlois and Yvon
Savaria",
title = "Symbolic Analysis for Data Plane Programs
Specialization",
journal = j-TACO,
volume = "20",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3557727",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3557727",
abstract = "Programmable network data planes have extended the
capabilities of packet processing in network devices by
allowing custom processing pipelines and agnostic
packet processing. While a variety of applications can
be implemented on current programmable data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Shah:2023:BSA,
author = "Nilesh Rajendra Shah and Ashitabh Misra and Antoine
Min{\'e} and Rakesh Venkat and Ramakrishna Upadrasta",
title = "{BullsEye}: Scalable and Accurate Approximation
Framework for Cache Miss Calculation",
journal = j-TACO,
volume = "20",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3558003",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3558003",
abstract = "For Affine Control Programs or Static Control Programs
(SCoP), symbolic counting of reuse distances could
induce polynomials for each reuse pair. These
polynomials along with cache capacity constraints lead
to non-affine (semi-algebraic) sets; and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Soni:2023:AC,
author = "Mitali Soni and Asmita Pal and Joshua {San Miguel}",
title = "As-Is Approximate Computing",
journal = j-TACO,
volume = "20",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3559761",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3559761",
abstract = "Although approximate computing promises better
performance for applications allowing marginal errors,
dearth of hardware support and lack of run-time
accuracy guarantees makes it difficult to adopt. We
present As-Is, an Anytime Speculative Interruptible
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Shah:2023:TDS,
author = "Parth Shah and Ranjal Gautham Shenoy and Vaidyanathan
Srinivasan and Pradip Bose and Alper Buyuktosunoglu",
title = "{TokenSmart}: Distributed, Scalable Power Management
in the Many-core Era",
journal = j-TACO,
volume = "20",
number = "1",
pages = "4:1--4:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3559762",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3559762",
abstract = "Centralized power management control systems are
hitting a scalability limit. In particular, enforcing a
power cap in a many-core system in a
performance-friendly manner is quite challenging.
Today's on-chip controller reduces the clock speed of
compute \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2023:LFH,
author = "Zhangyu Chen and Yu Hua and Luochangqi Ding and Bo
Ding and Pengfei Zuo and Xue Liu",
title = "Lock-Free High-performance Hashing for Persistent
Memory via {PM}-aware Holistic Optimization",
journal = j-TACO,
volume = "20",
number = "1",
pages = "5:1--5:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3561651",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3561651",
abstract = "Persistent memory (PM) provides large-scale
non-volatile memory (NVM) with DRAM-comparable
performance. The non-volatility and other unique
characteristics of PM architecture bring new
opportunities and challenges for the efficient storage
system design. For example, some recent
crash-consistent and write-friendly hashing schemes are
proposed to provide fast queries for PM systems.
However, existing PM hashing indexes suffer from the
concurrency bottleneck due to the blocking resizing and
expensive lock-based concurrency control for queries.
Moreover, the lack of PM awareness and systematical
design further increases the query latency. To address
the concurrency bottleneck of lock contention in PM
hashing, we propose clevel hashing, a lock-free
concurrent level hashing scheme that provides
non-blocking resizing via background threads and
lock-free search/insertion/update/deletion using atomic
primitives to enable high concurrency for PM hashing.
By exploiting the PM characteristics, we present a
holistic approach to building clevel hashing for high
throughput and low tail latency via the PM-aware
index/allocator co-design. The proposed volatile
announcement array with a helping mechanism coordinates
lock-free insertions and guarantees a strong
consistency model. Our experiments using real-world
YCSB workloads on Intel Optane DC PMM show that clevel
hashing, respectively, achieves up to 5.7x and 1.6x
higher throughput than state-of-the-art P-CLHT and Dash
while guaranteeing low tail latency, e.g., 1.9x--7.2x
speedup for the p99 latency with the insert-only
workload.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mastoras:2023:DIN,
author = "Aristeidis Mastoras and Sotiris Anagnostidis and
Albert-Jan N. Yzelman",
title = "Design and Implementation for Nonblocking Execution in
{GraphBLAS}: Tradeoffs and Performance",
journal = j-TACO,
volume = "20",
number = "1",
pages = "6:1--6:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3561652",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3561652",
abstract = "GraphBLAS is a recent standard that allows the
expression of graph algorithms in the language of
linear algebra and enables automatic code
parallelization and optimization. GraphBLAS operations
are memory bound and may benefit from data locality
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xu:2023:SSC,
author = "Yemao Xu and Dezun Dong and Dongsheng Wang and Shi Xu
and Enda Yu and Weixia Xu and Xiangke Liao",
title = "{SSD-SGD}: Communication Sparsification for
Distributed Deep Learning Training",
journal = j-TACO,
volume = "20",
number = "1",
pages = "7:1--7:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3563038",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3563038",
abstract = "Intensive communication and synchronization cost for
gradients and parameters is the well-known bottleneck
of distributed deep learning training. Based on the
observations that Synchronous SGD (SSGD) obtains good
convergence accuracy while asynchronous \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Olgun:2023:PHE,
author = "Ataberk Olgun and Juan G{\'o}mez Luna and Konstantinos
Kanellopoulos and Behzad Salami and Hasan Hassan and
Oguz Ergin and Onur Mutlu",
title = "{PiDRAM}: a Holistic End-to-end {FPGA}-based Framework
for Processing-in-{DRAM}",
journal = j-TACO,
volume = "20",
number = "1",
pages = "8:1--8:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3563697",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3563697",
abstract = "Commodity DRAM-based processing-using-memory (PuM)
techniques that are supported by off-the-shelf DRAM
chips present an opportunity for alleviating the data
movement bottleneck at low cost. However, system
integration of these techniques imposes non-.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sakalis:2023:DSS,
author = "Christos Sakalis and Stefanos Kaxiras and Magnus
Sj{\"a}lander",
title = "Delay-on-Squash: Stopping Microarchitectural Replay
Attacks in Their Tracks",
journal = j-TACO,
volume = "20",
number = "1",
pages = "9:1--9:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3563695",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3563695",
abstract = "MicroScope and other similar microarchitectural replay
attacks take advantage of the characteristics of
speculative execution to trap the execution of the
victim application in a loop, enabling the attacker to
amplify a side-channel attack by executing it
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Liang:2023:QRC,
author = "Yi Liang and Shaokang Zeng and Lei Wang",
title = "Quantifying Resource Contention of Co-located
Workloads with the System-level Entropy",
journal = j-TACO,
volume = "20",
number = "1",
pages = "10:1--10:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3563696",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3563696",
abstract = "The workload co-location, such as deploying offline
analysis workloads with online service workloads on the
same node, has become common for modern data centers.
Workload co-location deployment improves data center
resource utilization significantly. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Suyeon:2023:FFF,
author = "Hur Suyeon and Seongmin Na and Dongup Kwon and Kim
Joonsung and Andrew Boutros and Eriko Nurvitadhi and
Jangwoo Kim",
title = "A Fast and Flexible {FPGA-based} Accelerator for
Natural Language Processing Neural Networks",
journal = j-TACO,
volume = "20",
number = "1",
pages = "11:1--11:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3564606",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3564606",
abstract = "Deep neural networks (DNNs) have become key solutions
in the natural language processing (NLP) domain.
However, the existing accelerators customized for their
narrow target models cannot support diverse NLP models.
Therefore, naively running complex NLP \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gondimalla:2023:OOD,
author = "Ashish Gondimalla and Jianqiao Liu and Mithuna
Thottethodi and T. N. Vijaykumar",
title = "{Occam}: Optimal Data Reuse for Convolutional Neural
Networks",
journal = j-TACO,
volume = "20",
number = "1",
pages = "12:1--12:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3566052",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3566052",
abstract = "Convolutional neural networks (CNNs) are emerging as
powerful tools for image processing in important
commercial applications. We focus on the important
problem of improving the latency of image recognition.
While CNNs are highly amenable to prefetching
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Peng:2023:FPS,
author = "Bo Peng and Yaozu Dong and Jianguo Yao and Fengguang
Wu and Haibing Guan",
title = "{FlexHM}: a Practical System for Heterogeneous Memory
with Flexible and Efficient Performance Optimizations",
journal = j-TACO,
volume = "20",
number = "1",
pages = "13:1--13:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3565885",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3565885",
abstract = "With the rapid development of cloud computing,
numerous cloud services, containers, and virtual
machines have been bringing tremendous demands on
high-performance memory resources to modern data
centers. Heterogeneous memory, especially the newly
released \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2023:RRB,
author = "Qiang Zhang and Lei Xu and Baowen Xu",
title = "{RegCPython}: a Register-based {Python} Interpreter
for Better Performance",
journal = j-TACO,
volume = "20",
number = "1",
pages = "14:1--14:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3568973",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/python.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3568973",
abstract = "Interpreters are widely used in the implementation of
many programming languages, such as Python, Perl, and
Java. Even though various JIT compilers emerge in an
endless stream, interpretation efficiency still plays a
critical role in program performance. Does a
stack-based interpreter or a register-based interpreter
perform better? The pros and cons of the pair of
architectures have long been discussed. The stack
architecture is attractive for its concise model and
compact bytecode, but our study finds that the
register-based interpreter can also be implemented
easily and that its bytecode size only grows by a small
margin. Moreover, the latter turns out to be
appreciably faster. Specifically, we implemented an
open source Python interpreter named RegCPython based
on CPython v3.10.1. The former is register based, while
the latter is stack based. Without changes in syntax,
Application Programming Interface, and Application
Binary Interface, RegCPython is excellently compatible
with CPython, as it does not break existing syntax or
interfaces. It achieves a speedup of 1.287 on the most
favorable benchmark and 0.977 even on the most
unfavorable benchmark. For all Python-intensive
benchmarks, the average speedup reaches 1.120 on x86
and 1.130 on ARM. Our evaluation work, which also
serves as an empirical study, provides a detailed
performance survey of both interpreters on modern
hardware. It points out that the register-based
interpreters are more efficient mainly due to the
elimination of machine instructions needed, while
changes in branch mispredictions and cache misses have
a limited impact on performance. Additionally, it
confirms that the register-based implementation is also
satisfactory in terms of memory footprint, compilation
cost, and implementation complexity.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jin:2023:SBS,
author = "Hai Jin and Zhuo He and Weizhong Qiang",
title = "{SpecTerminator}: Blocking Speculative Side Channels
Based on Instruction Classes on {RISC-V}",
journal = j-TACO,
volume = "20",
number = "1",
pages = "15:1--15:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3566053",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/risc-v.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3566053",
abstract = "In modern processors, speculative execution has
significantly improved the performance of processors,
but it has also introduced speculative execution
vulnerabilities. Recent defenses are based on the
delayed execution to block various speculative side
channels, but we show that several of the current
state-of-the-art defenses fail to block some of the
available speculative side channels, and the current
most secure defense introduces a performance overhead
of up to 24.5\%.\par
We propose SpecTerminator, the first defense framework
based on instruction classes that can comprehensively
and precisely block all existing speculative side
channels. In SpecTerminator, a novel speculative side
channel classification scheme based on the features of
secret transmission is proposed, and the sensitive
instructions in the speculative window are classified
and identified using optimized hardware taint tracking
and instruction masking techniques to accurately
determine the scope of leakage. Then, according to the
execution characteristics of these instructions,
dedicated delayed execution strategies, such as TLB
request ignoring, selective issue, and extended
delay-on-miss, are designed for each type of sensitive
instruction to precisely control that these
instructions are delayed only in pipeline stages that
are at risk of leakage. In contrast to previous
defenses based on the Gem5 simulator, we have
innovatively implemented defenses against Spectre
attacks based on the open-source instruction set RISC-V
on an FPGA-accelerated simulation platform that is more
similar to real hardware. To evaluate the security of
SpecTerminator, we have replicated various existing
x86-based Spectre variants on RISC-V. On SPEC 2006,
SpecTerminator defends against Spectre attacks based on
memory hierarchy side channels with a performance
overhead of 2.6\% and against all existing Spectre
attacks with a performance overhead of 6.0\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhao:2023:PSC,
author = "Tuowen Zhao and Tobi Popoola and Mary Hall and
Catherine Olschanowsky and Michelle Strout",
title = "Polyhedral Specification and Code Generation of Sparse
Tensor Contraction with Co-iteration",
journal = j-TACO,
volume = "20",
number = "1",
pages = "16:1--16:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3566054",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3566054",
abstract = "This article presents a code generator for sparse
tensor contraction computations. It leverages a
mathematical representation of loop nest computations
in the sparse polyhedral framework (SPF), which extends
the polyhedral model to support non-affine \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Schuler:2023:XOT,
author = "Manuela Schuler and Richard Membarth and Philipp
Slusallek",
title = "{XEngine}: Optimal Tensor Rematerialization for Neural
Networks in Heterogeneous Environments",
journal = j-TACO,
volume = "20",
number = "1",
pages = "17:1--17:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3568956",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3568956",
abstract = "Memory efficiency is crucial in training deep learning
networks on resource-restricted devices. During
backpropagation, forward tensors are used to calculate
gradients. Despite the option of keeping those
dependencies in memory until they are reused in
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Korostelev:2023:YCL,
author = "Ivan Korostelev and Jo{\~a}o P. L. {De Carvalho} and
Jos{\'e} Moreira and Jos{\'e} Nelson Amaral",
title = "{YaConv}: Convolution with Low Cache Footprint",
journal = j-TACO,
volume = "20",
number = "1",
pages = "18:1--18:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3570305",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3570305",
abstract = "This article introduces YaConv, a new algorithm to
compute convolution using GEMM microkernels from a
Basic Linear Algebra Subprograms library that is
efficient for multiple CPU architectures. Previous
approaches either create a copy of each image element
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Eris:2023:PRF,
author = "Furkan Eris and Marcia Louis and Kubra Eris and
Jos{\'e} Abell{\'a}n and Ajay Joshi",
title = "{Puppeteer}: a Random Forest Based Manager for
Hardware Prefetchers Across the Memory Hierarchy",
journal = j-TACO,
volume = "20",
number = "1",
pages = "19:1--19:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3570304",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 17 06:54:21 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3570304",
abstract = "Over the years, processor throughput has steadily
increased. However, the memory throughput has not
increased at the same rate, which has led to the memory
wall problem in turn increasing the gap between
effective and theoretical peak processor \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tollenaere:2023:ACE,
author = "Nicolas Tollenaere and Guillaume Iooss and
St{\'e}phane Pouget and Hugo Brunie and Christophe
Guillon and Albert Cohen and P. Sadayappan and Fabrice
Rastello",
title = "Autotuning Convolutions Is Easier Than You Think",
journal = j-TACO,
volume = "20",
number = "2",
pages = "20:1--20:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3570641",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 10 08:08:06 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3570641",
abstract = "A wide range of scientific and machine learning
applications depend on highly optimized implementations
of tensor computations. Exploiting the full capacity of
a given processor architecture remains a challenging
task, due to the complexity of the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Perez:2023:UDO,
author = "V{\'\i}ctor P{\'e}rez and Lukas Sommer and Victor
Lom{\"u}ller and Kumudha Narasimhan and Mehdi Goli",
title = "User-driven Online Kernel Fusion for {SYCL}",
journal = j-TACO,
volume = "20",
number = "2",
pages = "21:1--21:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3571284",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 10 08:08:06 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3571284",
abstract = "Heterogeneous programming models are becoming
increasingly popular to support the ever-evolving
hardware architectures, especially for new and emerging
specialized accelerators optimizing specific tasks.
While such programs provide performance portability
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Espindola:2023:SMR,
author = "Vinicius Espindola and Luciano Zago and Herv{\'e}
Yviquel and Guido Araujo",
title = "Source Matching and Rewriting for {MLIR} Using
String-Based Automata",
journal = j-TACO,
volume = "20",
number = "2",
pages = "22:1--22:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3571283",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 10 08:08:06 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/string-matching.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3571283",
abstract = "A typical compiler flow relies on a uni-directional
sequence of translation/optimization steps that lower
the program abstract representation, making it hard to
preserve higher-level program information across each
transformation step. On the other hand, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "22",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ma:2023:OFM,
author = "Wenjing Ma and Fangfang Liu and Daokun Chen and
Qinglin Lu and Yi Hu and Hongsen Wang and Xinhui Yuan",
title = "An Optimized Framework for Matrix Factorization on the
New {Sunway} Many-core Platform",
journal = j-TACO,
volume = "20",
number = "2",
pages = "23:1--23:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3571856",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 10 08:08:06 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3571856",
abstract = "Matrix factorization functions are used in many areas
and often play an important role in the overall
performance of the applications. In the LAPACK library,
matrix factorization functions are implemented with
blocked factorization algorithm, shifting \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Singh:2023:HHP,
author = "Sarabjeet Singh and Neelam Surana and Kailash Prasad
and Pranjali Jain and Joycee Mekie and Manu Awasthi",
title = "{HyGain}: High-performance, Energy-efficient Hybrid
Gain Cell-based Cache Hierarchy",
journal = j-TACO,
volume = "20",
number = "2",
pages = "24:1--24:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3572839",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 10 08:08:06 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3572839",
abstract = "In this article, we propose a ``full-stack'' solution
to designing high-apacity and low-latency on-chip cache
hierarchies by starting at the circuit level of the
hardware design stack. We propose a novel half V
$_{DD}$ precharge 2T Gain Cell (GC) design for the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mummidi:2023:AAC,
author = "Chandra Sekhar Mummidi and Sandip Kundu",
title = "{ACTION}: Adaptive Cache Block Migration in
Distributed Cache Architectures",
journal = j-TACO,
volume = "20",
number = "2",
pages = "25:1--25:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3572911",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 10 08:08:06 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3572911",
abstract = "Chip multiprocessors (CMP) with more cores have more
traffic to the last-level cache (LLC). Without a
corresponding increase in LLC bandwidth, such traffic
cannot be sustained, resulting in performance
degradation. Previous research focused on data
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "25",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Liu:2023:UBC,
author = "Qiaoyi Liu and Jeff Setter and Dillon Huff and Maxwell
Strange and Kathleen Feng and Mark Horowitz and
Priyanka Raina and Fredrik Kjolstad",
title = "Unified Buffer: Compiling Image Processing and Machine
Learning Applications to Push-Memory Accelerators",
journal = j-TACO,
volume = "20",
number = "2",
pages = "26:1--26:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3572908",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 10 08:08:06 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3572908",
abstract = "Image processing and machine learning applications
benefit tremendously from hardware acceleration.
Existing compilers target either FPGAs, which sacrifice
power and performance for programmability, or ASICs,
which become obsolete as applications change.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "26",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yuzuguler:2023:SSA,
author = "Ahmet Caner Y{\"u}z{\"u}g{\"u}ler and Canberk
S{\"o}nmez and Mario Drumond and Yunho Oh and Babak
Falsafi and Pascal Frossard",
title = "Scale-out Systolic Arrays",
journal = j-TACO,
volume = "20",
number = "2",
pages = "27:1--27:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3572917",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 10 08:08:06 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3572917",
abstract = "Multi-pod systolic arrays are emerging as the
architecture of choice in DNN inference accelerators.
Despite their potential, designing multi-pod systolic
arrays to maximize effective throughput/Watt-i.e.,
throughput/Watt adjusted when accounting for array
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Minervini:2023:VAE,
author = "Francesco Minervini and Oscar Palomar and Osman Unsal
and Enrico Reggiani and Josue Quiroga and Joan Marimon
and Carlos Rojas and Roger Figueras and Abraham Ruiz
and Alberto Gonzalez and Jonnatan Mendoza and Ivan
Vargas and C{\'e}sar Hernandez and Joan Cabre and Lina
Khoirunisya and Mustapha Bouhali and Julian Pavon and
Francesc Moll and Mauro Olivieri and Mario Kovac and
Mate Kovac and Leon Dragic and Mateo Valero and Adrian
Cristal",
title = "{Vitruvius+}: an Area-Efficient {RISC-V} Decoupled
Vector Coprocessor for High Performance Computing
Applications",
journal = j-TACO,
volume = "20",
number = "2",
pages = "28:1--28:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3575861",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 10 08:08:06 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/risc-v.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3575861",
abstract = "The maturity level of RISC-V and the availability of
domain-specific instruction set extensions, like vector
processing, make RISC-V a good candidate for supporting
the integration of specialized hardware in processor
cores for the High Performance \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "28",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Benmeziane:2023:MOH,
author = "Hadjer Benmeziane and Hamza Ouarnoughi and Kaoutar {El
Maghraoui} and Smail Niar",
title = "Multi-objective Hardware-aware Neural Architecture
Search with {Pareto} Rank-preserving Surrogate Models",
journal = j-TACO,
volume = "20",
number = "2",
pages = "29:1--29:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3579853",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 10 08:08:06 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3579853",
abstract = "Deep learning (DL) models such as convolutional neural
networks (ConvNets) are being deployed to solve various
computer vision and natural language processing tasks
at the edge. It is a challenge to find the right DL
architecture that simultaneously meets \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "29",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2023:FFA,
author = "Dongwei Chen and Dong Tong and Chun Yang and Jiangfang
Yi and Xu Cheng",
title = "{FlexPointer}: Fast Address Translation Based on Range
{TLB} and Tagged Pointers",
journal = j-TACO,
volume = "20",
number = "2",
pages = "30:1--30:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3579854",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 10 08:08:06 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3579854",
abstract = "Page-based virtual memory relies on TLBs to accelerate
the address translation. Nowadays, the gap between
application workloads and the capacity of TLB continues
to grow, bringing many costly TLB misses and making the
TLB a performance bottleneck. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "30",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Du:2023:FOS,
author = "Jingwen Du and Fang Wang and Dan Feng and Changchen
Gan and Yuchao Cao and Xiaomin Zou and Fan Li",
title = "Fast One-Sided {RDMA}-Based State Machine Replication
for Disaggregated Memory",
journal = j-TACO,
volume = "20",
number = "2",
pages = "31:1--31:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3587096",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 10 08:08:06 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3587096",
abstract = "Disaggregated memory architecture has risen in
popularity for large datacenters with the advantage of
improved resource utilization, failure isolation, and
elasticity. Replicated state machines (RSMs) have been
extensively used for reliability and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "31",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sahni:2023:AAS,
author = "Abdul Rasheed Sahni and Hamza Omar and Usman Ali and
Omer Khan",
title = "{ASM}: an Adaptive Secure Multicore for Co-located
Mutually Distrusting Processes",
journal = j-TACO,
volume = "20",
number = "3",
pages = "32:1--32:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3587480",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 10 07:14:56 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3587480",
abstract = "With the ever-increasing virtualization of software
and hardware, the privacy of user-sensitive data is a
fundamental concern in computation outsourcing. Secure
processors enable a trusted execution environment to
guarantee security properties based on \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "32",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Puthoor:2023:TBS,
author = "Sooraj Puthoor and Mikko H. Lipasti",
title = "Turn-based Spatiotemporal Coherence for {GPUs}",
journal = j-TACO,
volume = "20",
number = "3",
pages = "33:1--33:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3593054",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 10 07:14:56 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3593054",
abstract = "This article introduces turn-based spatiotemporal
coherence. Spatiotemporal coherence is a novel
coherence implementation that assigns write permission
to epochs (or turns) as opposed to a processor core.
This paradigm shift in the assignment of write
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "33",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2023:JOJ,
author = "Ruobing Chen and Haosen Shi and Jinping Wu and Yusen
Li and Xiaoguang Liu and Gang Wang",
title = "Jointly Optimizing Job Assignment and Resource
Partitioning for Improving System Throughput in Cloud
Datacenters",
journal = j-TACO,
volume = "20",
number = "3",
pages = "34:1--34:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3593055",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 10 07:14:56 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3593055",
abstract = "Colocating multiple jobs on the same server has been
widely applied for improving resource utilization in
cloud datacenters. However, the colocated jobs would
contend for the shared resources, which could lead to
significant performance degradation. An \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "34",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ravi:2023:TMA,
author = "Gokul Subramanian Ravi and Tushar Krishna and Mikko
Lipasti",
title = "{TNT}: a Modular Approach to Traversing Physically
Heterogeneous {NOCs} at Bare-wire Latency",
journal = j-TACO,
volume = "20",
number = "3",
pages = "35:1--35:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3597611",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 10 07:14:56 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3597611",
abstract = "The ideal latency for on-chip network traversal would
be the delay incurred from wire traversal alone.
Unfortunately, in a realistic modular network, the
latency for a packet to traverse the network is
significantly higher than this wire delay. The main
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "35",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xu:2023:ACN,
author = "Weizhi Xu and Yintai Sun and Shengyu Fan and Hui Yu
and Xin Fu",
title = "Accelerating Convolutional Neural Network by
Exploiting Sparsity on {GPUs}",
journal = j-TACO,
volume = "20",
number = "3",
pages = "36:1--36:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3600092",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 10 07:14:56 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3600092",
abstract = "The convolutional neural network (CNN) is an important
deep learning method, which is widely used in many
fields. However, it is very time consuming to implement
the CNN where convolution usually takes most of the
time. There are many zero values in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "36",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhao:2023:GED,
author = "Jin Zhao and Yu Zhang and Ligang He and Qikun Li and
Xiang Zhang and Xinyu Jiang and Hui Yu and Xiaofei Liao
and Hai Jin and Lin Gu and Haikun Liu and Bingsheng He
and Ji Zhang and Xianzheng Song and Lin Wang and Jun
Zhou",
title = "{GraphTune}: an Efficient Dependency-Aware Substrate
to Alleviate Irregularity in Concurrent Graph
Processing",
journal = j-TACO,
volume = "20",
number = "3",
pages = "37:1--37:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3600091",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 10 07:14:56 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3600091",
abstract = "With the increasing need for graph analysis, massive
Concurrent iterative Graph Processing (CGP) jobs are
usually performed on the common large-scale real-world
graph. Although several solutions have been proposed,
these CGP jobs are not coordinated with \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "37",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhou:2023:IPS,
author = "Yufeng Zhou and Alan L. Cox and Sandhya Dwarkadas and
Xiaowan Dong",
title = "The Impact of Page Size and Microarchitecture on
Instruction Address Translation Overhead",
journal = j-TACO,
volume = "20",
number = "3",
pages = "38:1--38:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3600089",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 10 07:14:56 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3600089",
abstract = "As the volume of data processed by applications has
increased, considerable attention has been paid to data
address translation overheads, leading to the
widespread use of larger page sizes (``superpages'')
and multi-level translation lookaside buffers (.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "38",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Reber:2023:CPS,
author = "Benjamin Reber and Matthew Gould and Alexander H.
Kneipp and Fangzhou Liu and Ian Prechtl and Chen Ding
and Linlin Chen and Dorin Patru",
title = "Cache Programming for Scientific Loops Using Leases",
journal = j-TACO,
volume = "20",
number = "3",
pages = "39:1--39:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3600090",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 10 07:14:56 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3600090",
abstract = "Cache management is important in exploiting locality
and reducing data movement. This article studies a new
type of programmable cache called the lease cache. By
assigning leases, software exerts the primary control
on when and how long data stays in the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xie:2023:MMC,
author = "Xinfeng Xie and Peng Gu and Yufei Ding and Dimin Niu
and Hongzhong Zheng and Yuan Xie",
title = "{MPU}: Memory-centric {SIMT} Processor via In-{DRAM}
Near-bank Computing",
journal = j-TACO,
volume = "20",
number = "3",
pages = "40:1--40:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3603113",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 10 07:14:56 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3603113",
abstract = "With the growing number of data-intensive workloads,
GPU, which is the state-of-the-art
single-instruction-multiple-thread (SIMT) processor, is
hindered by the memory bandwidth wall. To alleviate
this bottleneck, previously proposed 3D-stacking
near-bank \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "40",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Krolik:2023:RFQ,
author = "Alexander Krolik and Clark Verbrugge and Laurie
Hendren",
title = "{rNdN}: Fast Query Compilation for {NVIDIA GPUs}",
journal = j-TACO,
volume = "20",
number = "3",
pages = "41:1--41:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3603503",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 10 07:14:56 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3603503",
abstract = "GPU database systems are an effective solution to
query optimization, particularly with compilation and
data caching. They fall short, however, in end-to-end
workloads, as existing compiler toolchains are too
expensive for use with short-running queries.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "41",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jiang:2023:HMP,
author = "Jiazhi Jiang and Zijian Huang and Dan Huang and
Jiangsu Du and Lin Chen and Ziguan Chen and Yutong Lu",
title = "Hierarchical Model Parallelism for Optimizing
Inference on Many-core Processor via Decoupled
{$3$D-CNN} Structure",
journal = j-TACO,
volume = "20",
number = "3",
pages = "42:1--42:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3605149",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 10 07:14:56 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3605149",
abstract = "The tremendous success of convolutional neural network
(CNN) has made it ubiquitous in many fields of human
endeavor. Many applications such as biomedical analysis
and scientific data analysis involve analyzing
volumetric data. This spawns huge demand for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "42",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhao:2023:MGA,
author = "Yuwen Zhao and Fangfang Liu and Wenjing Ma and Huiyuan
Li and Yuanchi Peng and Cui Wang",
title = "{MFFT}: a {GPU} Accelerated Highly Efficient
Mixed-Precision Large-Scale {FFT} Framework",
journal = j-TACO,
volume = "20",
number = "3",
pages = "43:1--43:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3605148",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 10 07:14:56 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3605148",
abstract = "Fast Fourier transform (FFT) is widely used in
computing applications in large-scale parallel
programs, and data communication is the main
performance bottleneck of FFT and seriously affects its
parallel efficiency. To tackle this problem, we propose
a new large-scale FFT framework, MFFT, which optimizes
parallel FFT with a new mixed-precision optimization
technique, adopting the ``high precision computation,
low precision communication'' strategy. To enable ``low
precision communication'', we propose a shared-exponent
floating-point number compression technique, which
reduces the volume of data communication, while
maintaining higher accuracy. In addition, we apply a
two-phase normalization technique to further reduce the
round-off error. Based on the mixed-precision MFFT
framework, we apply several optimization techniques to
improve the performance, such as streaming of GPU
kernels, MPI message combination, kernel optimization,
and memory optimization. We evaluate MFFT on a system
with 4,096 GPUs. The results show that shared-exponent
MFFT is $ 1.23 \times $ faster than that of
double-precision MFFT on average, and double-precision
MFFT achieves performance $ 3.53 \times $ and $ 9.48
\times $ on average higher than open source library
2Decomp\&FFT (CPU-based version) and heFFTe (AMD
GPU-based version), respectively. The parallel
efficiency of double-precision MFFT increased from
53.2\% to 78.1\% compared with 2Decomp\&FFT, and
shared-exponent MFFT further increases the parallel
efficiency to 83.8\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "43",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Azhar:2023:ARR,
author = "Muhammad Waqar Azhar and Madhavan Manivannan and Per
Stenstr{\"o}m",
title = "{Approx-RM}: Reducing Energy on Heterogeneous
Multicore Processors under Accuracy and Timing
Constraints",
journal = j-TACO,
volume = "20",
number = "3",
pages = "44:1--44:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3605214",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 10 07:14:56 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3605214",
abstract = "Reducing energy consumption while providing
performance and quality guarantees is crucial for
computing systems ranging from battery-powered embedded
systems to data centers. This article considers
approximate iterative applications executing on
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "44",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Huang:2023:STE,
author = "Dong Huang and Dan Feng and Qiankun Liu and Bo Ding
and Wei Zhao and Xueliang Wei and Wei Tong",
title = "{SplitZNS}: Towards an Efficient {LSM}-Tree on Zoned
Namespace {SSDs}",
journal = j-TACO,
volume = "20",
number = "3",
pages = "45:1--45:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3608476",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 10 07:14:56 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3608476",
abstract = "The Zoned Namespace (ZNS) Solid State Drive (SSD) is a
nascent form of storage device that offers novel
prospects for the Log Structured Merge Tree (LSM-tree).
ZNS exposes erase blocks in SSD as append-only zones,
enabling the LSM-tree to gain awareness \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "45",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Du:2023:ICM,
author = "Jiangsu Du and Jiazhi Jiang and Jiang Zheng and
Hongbin Zhang and Dan Huang and Yutong Lu",
title = "Improving Computation and Memory Efficiency for
Real-world {Transformer} Inference on {GPUs}",
journal = j-TACO,
volume = "20",
number = "4",
pages = "46:1--46:??",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3617689",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 21 10:29:36 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3617689",
abstract = "Transformer models have emerged as a leading approach
in the field of natural language processing (NLP) and
are increasingly being deployed in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "46",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jin:2023:CTC,
author = "Hai Jin and Bo Lei and Haikun Liu and Xiaofei Liao and
Zhuohui Duan and Chencheng Ye and Yu Zhang",
title = "A Compilation Tool for Computation Offloading in
{ReRAM}-based {CIM} Architectures",
journal = j-TACO,
volume = "20",
number = "4",
pages = "47:1--47:??",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3617686",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 21 10:29:36 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3617686",
abstract = "Computing-in-Memory (CIM) architectures using
Non-volatile Memories (NVMs) have emerged as a
promising way to address the ``memory wall'' problem in
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "47",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Menard:2023:HPD,
author = "Christian Menard and Marten Lohstroh and Soroush
Bateni and Matthew Chorlian and Arthur Deng and Peter
Donovan and Cl{\'e}ment Fournier and Shaokai Lin and
Felix Suchert and Tassilo Tanneberger and Hokeun Kim
and Jeronimo Castrillon and Edward A. Lee",
title = "High-performance Deterministic Concurrency Using
{Lingua Franca}",
journal = j-TACO,
volume = "20",
number = "4",
pages = "48:1--48:??",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3617687",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 21 10:29:36 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3617687",
abstract = "Actor frameworks and similar reactive programming
techniques are widely used for building concurrent
systems. They promise to be efficient and scale well to
a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "48",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wu:2023:SDM,
author = "Donglei Wu and Weihao Yang and Xiangyu Zou and Wen Xia
and Shiyi Li and Zhenbo Hu and Weizhe Zhang and Binxing
Fang",
title = "{Smart-DNN+}: a Memory-efficient Neural Networks
Compression Framework for the Model Inference",
journal = j-TACO,
volume = "20",
number = "4",
pages = "49:1--49:??",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3617688",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 21 10:29:36 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3617688",
abstract = "Deep Neural Networks (DNNs) have achieved remarkable
success in various real-world applications. However,
running a Deep Neural Network (DNN) typically
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "49",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{TACO-294350005,
author = "Syed Salauddin Mohammad Tariq and Lance Menard and
Pengfei Su and Probir Roy",
title = "{MicroProf}: Code-level Attribution of Unnecessary
Data Transfer in Microservice Applications",
journal = j-TACO,
volume = "20",
number = "4",
pages = "50:1--50:??",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3622787",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 21 10:29:36 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3622787",
abstract = "The microservice architecture style has gained
popularity due to its ability to fault isolation, ease
of scaling applications, and developer's agility.
However, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "50",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2023:GGM,
author = "Shiyi Li and Qiang Cao and Shenggang Wan and Wen Xia
and Changsheng Xie",
title = "{gPPM}: a Generalized Matrix Operation and Parallel
Algorithm to Accelerate the Encoding\slash Decoding
Process of Erasure Codes",
journal = j-TACO,
volume = "20",
number = "4",
pages = "51:1--51:??",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3625005",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 21 10:29:36 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3625005",
abstract = "Erasure codes are widely deployed in modern storage
systems, leading to frequent usage of their
encoding/decoding operations. The encoding/decoding
process for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "51",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Anastasiadis:2023:PPA,
author = "Petros Anastasiadis and Nikela Papadopoulou and
Georgios Goumas and Nectarios Koziris and Dennis Hoppe
and Li Zhong",
title = "{PARALiA}: a Performance Aware Runtime for Auto-tuning
Linear Algebra on Heterogeneous Systems",
journal = j-TACO,
volume = "20",
number = "4",
pages = "52:1--52:??",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3624569",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 21 10:29:36 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3624569",
abstract = "Dense linear algebra operations appear very frequently
in high-performance computing (HPC) applications,
rendering their performance crucial to achieve
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yu:2023:RER,
author = "Hui Yu and Yu Zhang and Jin Zhao and Yujian Liao and
Zhiying Huang and Donghao He and Lin Gu and Hai Jin and
Xiaofei Liao and Haikun Liu and Bingsheng He and
Jianhui Yue",
title = "{RACE}: an Efficient Redundancy-aware Accelerator for
Dynamic Graph Neural Network",
journal = j-TACO,
volume = "20",
number = "4",
pages = "53:1--53:??",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3617685",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 21 10:29:36 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3617685",
abstract = "Dynamic Graph Neural Network (DGNN) has recently
attracted a significant amount of research attention
from various domains, because most real-world graphs
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "53",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ferrari:2023:ADC,
author = "Victor Ferrari and Rafael Sousa and Marcio Pereira and
Jo{\~a}o P. L. {De Carvalho} and Jos{\'e} Nelson Amaral
and Jos{\'e} Moreira and Guido Araujo",
title = "Advancing Direct Convolution Using Convolution Slicing
Optimization and {ISA} Extensions",
journal = j-TACO,
volume = "20",
number = "4",
pages = "54:1--54:??",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3625004",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 21 10:29:36 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3625004",
abstract = "Convolution is one of the most computationally
intensive operations that must be performed for machine
learning model inference. A traditional \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "54",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{He:2023:DLS,
author = "Bowen He and Xiao Zheng and Yuan Chen and Weinan Li
and Yajin Zhou and Xin Long and Pengcheng Zhang and
Xiaowei Lu and Linquan Jiang and Qiang Liu and Dennis
Cai and Xiantao Zhang",
title = "{DxPU}: Large-scale Disaggregated {GPU} Pools in the
Datacenter",
journal = j-TACO,
volume = "20",
number = "4",
pages = "55:1--55:??",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3617995",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 21 10:29:36 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3617995",
abstract = "The rapid adoption of AI and convenience offered by
cloud services have resulted in the growing demands for
GPUs in the cloud. Generally, GPUs are physically
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2023:CMC,
author = "Shiqing Zhang and Mahmood Naderan-Tahan and Magnus
Jahre and Lieven Eeckhout",
title = "Characterizing Multi-Chip {GPU} Data Sharing",
journal = j-TACO,
volume = "20",
number = "4",
pages = "56:1--56:??",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3629521",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 21 10:29:36 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3629521",
abstract = "Multi-chip Graphics Processing Unit (GPU) systems are
critical to scale performance beyond a single GPU chip
for a wide variety of important emerging \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "56",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Domke:2023:LPQ,
author = "Jens Domke and Emil Vatai and Balazs Gerofi and Yuetsu
Kodama and Mohamed Wahib and Artur Podobas and Sparsh
Mittal and Miquel Peric{\`a}s and Lingqi Zhang and Peng
Chen and Aleksandr Drozd and Satoshi Matsuoka",
title = "At the Locus of Performance: Quantifying the Effects
of Copious {$3$D}-Stacked Cache on {HPC} Workloads",
journal = j-TACO,
volume = "20",
number = "4",
pages = "57:1--57:??",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3629520",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 21 10:29:36 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3629520",
abstract = "Over the last three decades, innovations in the memory
subsystem were primarily targeted at overcoming the
data movement bottleneck. In this paper, we focus
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "57",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Badri:2023:MPE,
author = "Satya Jaswanth Badri and Mukesh Saini and Neeraj
Goel",
title = "{Mapi-Pro}: an Energy Efficient Memory Mapping
Technique for Intermittent Computing",
journal = j-TACO,
volume = "20",
number = "4",
pages = "58:1--58:??",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3629524",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 21 10:29:36 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3629524",
abstract = "Battery-less technology evolved to replace battery
usage in space, deep mines, and other environments to
reduce cost and pollution. Non-volatile memory
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "58",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yu:2023:MFE,
author = "Miao Yu and Tingting Xiang and Venkata Pavan Kumar
Miriyala and Trevor E. Carlson",
title = "{Multiply-and-Fire}: an Event-Driven Sparse Neural
Network Accelerator",
journal = j-TACO,
volume = "20",
number = "4",
pages = "59:1--59:??",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3630255",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 21 10:29:36 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3630255",
abstract = "Deep neural network inference has become a vital
workload for many systems from edge-based computing to
data centers. To reduce the performance and power
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "59",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Choudhury:2023:FAI,
author = "Ziaul Choudhury and Anish Gulati and Suresh Purini",
title = "{FlowPix}: Accelerating Image Processing Pipelines on
an {FPGA} Overlay using a Domain Specific Compiler",
journal = j-TACO,
volume = "20",
number = "4",
pages = "60:1--60:??",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3629523",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 21 10:29:36 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3629523",
abstract = "The exponential performance growth guaranteed by
Moore's law has started to taper in recent years. At
the same time, emerging applications like image
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "60",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Susskind:2023:UNA,
author = "Zachary Susskind and Aman Arora and Igor D. S. Miranda
and Alan T. L. Bacellar and Luis A. Q. Villon and
Rafael F. Katopodis and Leandro S. de Ara{\'u}jo and
Diego L. C. Dutra and Priscila M. V. Lima and Felipe M.
G. Fran{\c{c}}a and Mauricio {Breternitz Jr.} and Lizy
K. John",
title = "{ULEEN}: a Novel Architecture for Ultra-low-energy
Edge Neural Networks",
journal = j-TACO,
volume = "20",
number = "4",
pages = "61:1--61:??",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3629522",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 21 10:29:36 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3629522",
abstract = "``Extreme edge'' devices, such as smart sensors, are a
uniquely challenging environment for the deployment of
machine learning. The tiny energy budgets \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "61",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wei:2023:FOT,
author = "Jia Wei and Xingjun Zhang and Longxiang Wang and Zheng
Wei",
title = "{Fastensor}: Optimise the {Tensor} {I/O} Path from
{SSD} to {GPU} for Deep Learning Training",
journal = j-TACO,
volume = "20",
number = "4",
pages = "62:1--62:??",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3630108",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Dec 21 10:29:36 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3630108",
abstract = "In recent years, benefiting from the increase in model
size and complexity, deep learning has achieved
tremendous success in computer vision (CV) and (NLP).
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "62",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Luo:2024:CDB,
author = "Longfei Luo and Dingcui Yu and Yina Lv and Liang Shi",
title = "Critical Data Backup with Hybrid Flash-Based Consumer
Devices",
journal = j-TACO,
volume = "21",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3631529",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3631529",
abstract = "Hybrid flash-based storage constructed with
high-density and low-cost flash memory has become
increasingly popular in consumer devices in the last
decade due to its low cost. However, its poor
reliability is one of the major concerns. To protect
critical \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2024:DOO,
author = "Peng Chen and Hui Chen and Weichen Liu and Linbo Long
and Wanli Chang and Nan Guan",
title = "{DAG-Order}: an Order-Based Dynamic {DAG} Scheduling
for Real-Time Networks-on-Chip",
journal = j-TACO,
volume = "21",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3631527",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3631527",
abstract = "With the high-performance requirement of
safety-critical real-time tasks, the platforms of
many-core processors with high parallelism are widely
utilized, where network-on-chip (NoC) is generally
employed for inter-core communication due to its
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jiang:2024:JRG,
author = "Zhang Jiang and Ying Chen and Xiaoli Gong and Jin
Zhang and Wenwen Wang and Pen-Chung Yew",
title = "{JiuJITsu}: Removing Gadgets with Safe Register
Allocation for {JIT} Code Generation",
journal = j-TACO,
volume = "21",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3631526",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3631526",
abstract = "Code-reuse attacks have the capability to craft
malicious instructions from small code fragments,
commonly referred to as ``gadgets.'' These gadgets are
generated by JIT (Just-In-Time) engines as integral
components of native instructions, with the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tayeb:2024:AAV,
author = "Hayfa Tayeb and Ludovic Paillat and B{\'e}renger
Bramas",
title = "{Autovesk}: Automatic Vectorized Code Generation from
Unstructured Static Kernels Using Graph
Transformations",
journal = j-TACO,
volume = "21",
number = "1",
pages = "4:1--4:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3631709",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3631709",
abstract = "Leveraging the SIMD capability of modern CPU
architectures is mandatory to take full advantage of
their increased performance. To exploit this
capability, binary executables must be vectorized,
either manually by developers or automatically by a
tool. For \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2024:FCM,
author = "Xueying Wang and Guangli Li and Zhen Jia and Xiaobing
Feng and Yida Wang",
title = "Fast Convolution Meets Low Precision: Exploring
Efficient Quantized {Winograd} Convolution on Modern
{CPUs}",
journal = j-TACO,
volume = "21",
number = "1",
pages = "5:1--5:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3632956",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3632956",
abstract = "Low-precision computation has emerged as one of the
most effective techniques for accelerating
convolutional neural networks and has garnered
widespread support on modern hardware. Despite its
effectiveness in accelerating convolutional neural
networks, low-precision computation has not been
commonly applied to fast convolutions, such as the
Winograd algorithm, due to numerical issues. In this
article, we propose an effective quantized Winograd
convolution, named LoWino, which employs an in-side
quantization method in the Winograd domain to reduce
the precision loss caused by transformations.
Meanwhile, we present an efficient implementation that
integrates well-designed optimization techniques,
allowing us to fully exploit the capabilities of
low-precision computation on modern CPUs. We evaluate
LoWino on two Intel Xeon Scalable Processor platforms
with representative convolutional layers and neural
network models. The experimental results demonstrate
that our approach can achieve an average of $ 1.84
\times $ and $ 1.91 \times $ operator speedups over
state-of-the-art implementations in the vendor library
while preserving accuracy loss at a reasonable level.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fan:2024:QPQ,
author = "Hao Fan and Yiliang Ye and Shadi Ibrahim and Zhuo
Huang and Xingru Li and Weibin Xue and Song Wu and Chen
Yu and Xuanhua Shi and Hai Jin",
title = "{QoS-pro}: a {QoS}-enhanced Transaction Processing
Framework for Shared {SSDs}",
journal = j-TACO,
volume = "21",
number = "1",
pages = "6:1--6:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3632955",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3632955",
abstract = "Solid State Drives (SSDs) are widely used in
data-intensive scenarios due to their high performance
and decreasing cost. However, in shared environments,
concurrent workloads can interfere with each other,
leading to a violation of Quality of Service (QoS).
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhao:2024:SUE,
author = "Yunping Zhao and Sheng Ma and Heng Liu and Libo Huang
and Yi Dai",
title = "{SAC}: an Ultra-Efficient Spin-based Architecture for
Compressed {DNNs}",
journal = j-TACO,
volume = "21",
number = "1",
pages = "7:1--7:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3632957",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3632957",
abstract = "Deep Neural Networks (DNNs) have achieved great
progress in academia and industry. But they have become
computational and memory intensive with the increase of
network depth. Previous designs seek breakthroughs in
software and hardware levels to mitigate \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Liu:2024:ECP,
author = "Tong-Yu Liu and Jianmei Guo and Bo Huang",
title = "Efficient Cross-platform Multiplexing of Hardware
Performance Counters via Adaptive Grouping",
journal = j-TACO,
volume = "21",
number = "1",
pages = "8:1--8:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3629525",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3629525",
abstract = "Collecting sufficient microarchitecture performance
data is essential for performance evaluation and
workload characterization. There are many events to be
monitored in a modern processor while only a few
hardware performance monitoring counters (PMCs)
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Liu:2024:QHQ,
author = "Lei Liu and Xinglei Dou",
title = "{QuCloud+}: a Holistic Qubit Mapping Scheme for
Single\slash Multi-programming on {$2$D\slash $3$D
NISQ} Quantum Computers",
journal = j-TACO,
volume = "21",
number = "1",
pages = "9:1--9:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3631525",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3631525",
abstract = "Qubit mapping for NISQ superconducting quantum
computers is essential to fidelity and resource
utilization. The existing qubit mapping schemes meet
challenges, e.g., crosstalk, SWAP overheads, diverse
device topologies, etc., leading to qubit resource
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wu:2024:AAM,
author = "Lingxi Wu and Minxuan Zhou and Weihong Xu and Ashish
Venkat and Tajana Rosing and Kevin Skadron",
title = "{Abakus}: Accelerating $k$-mer Counting with Storage
Technology",
journal = j-TACO,
volume = "21",
number = "1",
pages = "10:1--10:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3632952",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3632952",
abstract = "This work seeks to leverage
Processing-with-storage-technology (PWST) to accelerate
a key bioinformatics kernel called $k$-mer counting,
which involves processing large files of sequence data
on the disk to build a histogram of fixed-size genome
sequence \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kang:2024:IAG,
author = "Seokwon Kang and Jongbin Kim and Gyeongyong Lee and
Jeongmyung Lee and Jiwon Seo and Hyungsoo Jung and Yong
Ho Song and Yongjun Park",
title = "{ISP Agent}: a Generalized In-storage-processing
Workload Offloading Framework by Providing Multiple
Optimization Opportunities",
journal = j-TACO,
volume = "21",
number = "1",
pages = "11:1--11:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3632951",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3632951",
abstract = "As solid-state drives (SSDs) with sufficient computing
power have recently become the dominant devices in
modern computer systems, in-storage processing (ISP),
which processes data within the storage without
transferring it to the host memory, is being \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mishra:2024:CHP,
author = "Prasoon Mishra and V. Krishna Nandivada",
title = "{COWS} for High Performance: Cost Aware Work Stealing
for Irregular Parallel Loop",
journal = j-TACO,
volume = "21",
number = "1",
pages = "12:1--12:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3633331",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3633331",
abstract = "Parallel libraries such as OpenMP distribute the
iterations of parallel-for-loops among the threads,
using a programmer-specified scheduling policy. While
the existing scheduling policies perform reasonably
well in the context of balanced workloads, in
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Park:2024:HHS,
author = "Joongun Park and Seunghyo Kang and Sanghyeon Lee and
Taehoon Kim and Jongse Park and Youngjin Kwon and
Jaehyuk Huh",
title = "Hardware-hardened Sandbox Enclaves for Trusted
Serverless Computing",
journal = j-TACO,
volume = "21",
number = "1",
pages = "13:1--13:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3632954",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3632954",
abstract = "In cloud-based serverless computing, an application
consists of multiple functions provided by mutually
distrusting parties. For secure serverless computing,
the hardware-based trusted execution environment (TEE)
can provide strong isolation among \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Allen:2024:FGQ,
author = "Tyler Allen and Bennett Cooper and Rong Ge",
title = "Fine-grain Quantitative Analysis of Demand Paging in
Unified Virtual Memory",
journal = j-TACO,
volume = "21",
number = "1",
pages = "14:1--14:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3632953",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3632953",
abstract = "The abstraction of a shared memory space over separate
CPU and GPU memory domains has eased the burden of
portability for many HPC codebases. However, users pay
for ease of use provided by system-managed memory with
a moderate-to-high performance \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2024:RRR,
author = "Zhonghua Wang and Yixing Guo and Kai Lu and Jiguang
Wan and Daohui Wang and Ting Yao and Huatao Wu",
title = "{Rcmp}: Reconstructing {RDMA-Based} Memory
Disaggregation via {CXL}",
journal = j-TACO,
volume = "21",
number = "1",
pages = "15:1--15:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3634916",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3634916",
abstract = "Memory disaggregation is a promising architecture for
modern datacenters that separates compute and memory
resources into independent pools connected by
ultra-fast networks, which can improve memory
utilization, reduce cost, and enable elastic scaling of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Long:2024:WZW,
author = "Linbo Long and Shuiyong He and Jingcheng Shen and
Renping Liu and Zhenhua Tan and Congming Gao and Duo
Liu and Kan Zhong and Yi Jiang",
title = "{WA-Zone}: Wear-Aware Zone Management Optimization for
{LSM}-Tree on {ZNS SSDs}",
journal = j-TACO,
volume = "21",
number = "1",
pages = "16:1--16:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3637488",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3637488",
abstract = "ZNS SSDs divide the storage space into
sequential-write zones, reducing costs of DRAM
utilization, garbage collection, and over-provisioning.
The sequential-write feature of zones is well-suited
for LSM-based databases, where random writes are
organized \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fan:2024:IUD,
author = "Zhihua Fan and Wenming Li and Zhen Wang and Yu Yang
and Xiaochun Ye and Dongrui Fan and Ninghui Sun and
Xuejun An",
title = "Improving Utilization of Dataflow Unit for Multi-Batch
Processing",
journal = j-TACO,
volume = "21",
number = "1",
pages = "17:1--17:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3637906",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3637906",
abstract = "Dataflow architectures can achieve much better
performance and higher efficiency than general-purpose
core, approaching the performance of a specialized
design while retaining programmability. However,
advanced application scenarios place higher demands
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2024:EVI,
author = "Dunbo Zhang and Qingjie Lang and Ruoxi Wang and Li
Shen",
title = "Extension {VM}: Interleaved Data Layout in Vector
Memory",
journal = j-TACO,
volume = "21",
number = "1",
pages = "18:1--18:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3631528",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3631528",
abstract = "While vector architecture is widely employed in
processors for neural networks, signal processing, and
high-performance computing; however, its performance is
limited by inefficient column-major memory access. The
column-major access limitation originates \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Firtina:2024:AAP,
author = "Can Firtina and Kamlesh Pillai and Gurpreet S. Kalsi
and Bharathwaj Suresh and Damla Senol Cali and Jeremie
S. Kim and Taha Shahroodi and Meryem Banu Cavlak and
Jo{\"e}l Lindegger and Mohammed Alser and Juan
G{\'o}mez Luna and Sreenivas Subramoney and Onur
Mutlu",
title = "{ApHMM}: Accelerating Profile Hidden {Markov} Models
for Fast and Energy-efficient Genome Analysis",
journal = j-TACO,
volume = "21",
number = "1",
pages = "19:1--19:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3632950",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3632950",
abstract = "Profile hidden Markov models (pHMMs) are widely
employed in various bioinformatics applications to
identify similarities between biological sequences,
such as DNA or protein sequences. In pHMMs, sequences
are represented as graph structures, where states
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ahmad:2024:EDL,
author = "Khalid Ahmad and Cris Cecka and Michael Garland and
Mary Hall",
title = "Exploring Data Layout for Sparse Tensor Times Dense
Matrix on {GPUs}",
journal = j-TACO,
volume = "21",
number = "1",
pages = "20:1--20:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3633462",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Feb 23 16:28:09 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3633462",
abstract = "An important sparse tensor computation is
sparse-tensor-dense-matrix multiplication (SpTM), which
is used in tensor decomposition and applications. SpTM
is a multi-dimensional analog to
sparse-matrix-dense-matrix multiplication (SpMM). In
this article, we \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mummidi:2024:HES,
author = "Chandra Sekhar Mummidi and Victor C. Ferreira and
Sudarshan Srinivasan and Sandip Kundu",
title = "Highly Efficient Self-checking Matrix Multiplication
on Tiled {AMX} Accelerators",
journal = j-TACO,
volume = "21",
number = "2",
pages = "21:1--21:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3633332",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3633332",
abstract = "General Matrix Multiplication (GEMM) is a
computationally expensive operation that is used in
many applications such as machine learning. Hardware
accelerators are increasingly popular for speeding up
GEMM computation, with Tiled Matrix Multiplication
(TMUL) in recent Intel processors being an example.
Unfortunately, the TMUL hardware is susceptible to
errors, necessitating online error detection. The
Algorithm-based Error Detection (ABED) technique is a
powerful technique to detect errors in matrix
multiplications. In this article, we consider
implementation of an ABED technique that integrates
seamlessly with the TMUL hardware to minimize
performance overhead. Unfortunately, rounding errors
introduced by floating-point operations do not allow a
straightforward implementation of ABED in TMUL.
Previously an error bound was considered for addressing
rounding errors in ABED. If the error detection
threshold is set too low, it will a trigger false
alarm, while a loose bound will allow errors to escape
detection. In this article, we propose an adaptive
error threshold that takes into account the TMUL input
values to address the problem of false triggers and
error escapes and provide a taxonomy of various error
classes. This threshold is obtained from theoretical
error analysis but is not easy to implement in
hardware. Consequently, we relax the threshold such
that it can be easily computed in hardware. While ABED
ensures error-free computation, it does not guarantee
full coverage of all hardware faults. To address this
problem, we propose an algorithmic pattern generation
technique to ensure full coverage for all hardware
faults. To evaluate the benefits of our proposed
solution, we conducted fault injection experiments and
show that our approach does not produce any false
alarms or detection escapes for observable errors. We
conducted additional fault injection experiments on a
Deep Neural Network (DNN) model and find that if a
fault is not detected, it does not cause any
misclassification.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2024:WWO,
author = "Zhonghua Wang and Chen Ding and Fengguang Song and Kai
Lu and Jiguang Wan and Zhihu Tan and Changsheng Xie and
Guokuan Li",
title = "{WIPE}: a Write-Optimized Learned Index for Persistent
Memory",
journal = j-TACO,
volume = "21",
number = "2",
pages = "22:1--22:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3634915",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3634915",
abstract = "Learned Index, which utilizes effective machine
learning models to accelerate locating sorted data
positions, has gained increasing attention in many big
data scenarios. Using efficient learned models, the
learned indexes build large nodes and flat \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "22",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chacon:2024:CAC,
author = "Gino A. Chacon and Charles Williams and Johann
Knechtel and Ozgur Sinanoglu and Paul V. Gratz and
Vassos Soteriou",
title = "Coherence Attacks and Countermeasures in
Interposer-based Chiplet Systems",
journal = j-TACO,
volume = "21",
number = "2",
pages = "23:1--23:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3633461",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3633461",
abstract = "Industry is moving towards large-scale hardware
systems that bundle processor cores, memories,
accelerators, and so on. via 2.5D integration. These
components are fabricated separately as chiplets and
then integrated using an interposer as an interconnect
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wei:2024:CCB,
author = "Yan Wei and Zhang Xingjun",
title = "A Concise Concurrent {B+}-Tree for Persistent Memory",
journal = j-TACO,
volume = "21",
number = "2",
pages = "24:1--24:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3638717",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3638717",
abstract = "Persistent memory (PM) presents a unique opportunity
for designing data management systems that offer
improved performance, scalability, and instant restart
capability. As a widely used data structure for
managing data in such systems, B$^+$ -Tree must
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Qararyah:2024:EHD,
author = "Fareed Qararyah and Muhammad Waqar Azhar and Pedro
Trancoso",
title = "An Efficient Hybrid Deep Learning Accelerator for
Compact and Heterogeneous {CNNs}",
journal = j-TACO,
volume = "21",
number = "2",
pages = "25:1--25:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3639823",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3639823",
abstract = "Resource-efficient Convolutional Neural Networks
(CNNs) are gaining more attention. These CNNs have
relatively low computational and memory requirements. A
common denominator among such CNNs is having more
heterogeneity than traditional CNNs. This \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "25",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Santos:2024:AIC,
author = "Fernando {Fernandes Dos Santos} and Luigi Carro and
Flavio Vella and Paolo Rech",
title = "Assessing the Impact of Compiler Optimizations on
{GPUs} Reliability",
journal = j-TACO,
volume = "21",
number = "2",
pages = "26:1--26:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3638249",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3638249",
abstract = "Graphics Processing Units (GPUs) compilers have
evolved in order to support general-purpose programming
languages for multiple architectures. NVIDIA CUDA
Compiler (NVCC) has many compilation levels before
generating the machine code and applies complex
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "26",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Isaac-Chassande:2024:DHA,
author = "Valentin Isaac-Chassande and Adrian Evans and Yves
Durand and Fr{\'e}d{\'e}ric Rousseau",
title = "Dedicated Hardware Accelerators for Processing of
Sparse Matrices and Vectors: a Survey",
journal = j-TACO,
volume = "21",
number = "2",
pages = "27:1--27:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3640542",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3640542",
abstract = "Performance in scientific and engineering applications
such as computational physics, algebraic graph problems
or Convolutional Neural Networks (CNN), is dominated by
the manipulation of large sparse matrices-matrices with
a large number of zero elements. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xie:2024:IIA,
author = "Benyi Xie and Yue Yan and Chenghao Yan and Sicheng Tao
and Zhuangzhuang Zhang and Xinyu Li and Yanzhi Lan and
Xiang Wu and Tianyi Liu and Tingting Zhang and Fuxin
Zhang",
title = "An Instruction Inflation Analyzing Framework for
Dynamic Binary Translators",
journal = j-TACO,
volume = "21",
number = "2",
pages = "28:1--28:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3640813",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3640813",
abstract = "Dynamic binary translators (DBTs) are widely used to
migrate applications between different instruction set
architectures (ISAs). Despite extensive research to
improve DBT performance, noticeable overhead remains,
preventing near-native performance, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "28",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Rac:2024:CAS,
author = "Samuel Rac and Mats Brorsson",
title = "Cost-aware Service Placement and Scheduling in the
Edge-Cloud Continuum",
journal = j-TACO,
volume = "21",
number = "2",
pages = "29:1--29:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3640823",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3640823",
abstract = "The edge to data center computing continuum is the
aggregation of computing resources located anywhere
between the network edge (e.g., close to 5G antennas),
and servers in traditional data centers. Kubernetes is
the de facto standard for the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "29",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xue:2024:TEG,
author = "Feng Xue and Chenji Han and Xinyu Li and Junliang Wu
and Tingting Zhang and Tianyi Liu and Yifan Hao and
Zidong Du and Qi Guo and Fuxin Zhang",
title = "{Tyche}: an Efficient and General Prefetcher for
Indirect Memory Accesses",
journal = j-TACO,
volume = "21",
number = "2",
pages = "30:1--30:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3641853",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3641853",
abstract = "Indirect memory accesses (IMAs, i.e., A [ f ( B [ i
])]) are typical memory access patterns in applications
such as graph analysis, machine learning, and database.
IMAs are composed of producer-consumer pairs, where the
consumers' memory addresses are derived \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "30",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xie:2024:WLT,
author = "Kunpeng Xie and Ye Lu and Xinyu He and Dezhi Yi and
Huijuan Dong and Yao Chen",
title = "{Winols}: a Large-Tiling Sparse {Winograd} {CNN}
Accelerator on {FPGAs}",
journal = j-TACO,
volume = "21",
number = "2",
pages = "31:1--31:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3643682",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3643682",
abstract = "Convolutional Neural Networks (CNNs) can benefit from
the computational reductions provided by the Winograd
minimal filtering algorithm and weight pruning.
However, harnessing the potential of both methods
simultaneously introduces complexity in designing
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "31",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Liu:2024:SSR,
author = "Ke Liu and Kan Wu and Hua Wang and Ke Zhou and Peng
Wang and Ji Zhang and Cong Li",
title = "{SLAP}: Segmented Reuse-Time-Label Based Admission
Policy for Content Delivery Network Caching",
journal = j-TACO,
volume = "21",
number = "2",
pages = "32:1--32:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3646550",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3646550",
abstract = "``Learned'' admission policies have shown promise in
improving Content Delivery Network (CDN) cache
performance and lowering operational costs.
Unfortunately, existing learned policies are optimized
with a few fixed cache sizes while in reality, cache
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "32",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Miliadis:2024:ASS,
author = "Panagiotis Miliadis and Dimitris Theodoropoulos and
Dionisios Pnevmatikatos and Nectarios Koziris",
title = "Architectural Support for Sharing, Isolating and
Virtualizing {FPGA} Resources",
journal = j-TACO,
volume = "21",
number = "2",
pages = "33:1--33:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3648475",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3648475",
abstract = "FPGAs are increasingly popular in cloud environments
for their ability to offer on-demand acceleration and
improved compute efficiency. Providers would like to
increase utilization, by multiplexing customers on a
single device, similar to how processing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "33",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Du:2024:FDR,
author = "Haitao Du and Yuhan Qin and Song Chen and Yi Kang",
title = "{FASA-DRAM}: Reducing {DRAM} Latency with Destructive
Activation and Delayed Restoration",
journal = j-TACO,
volume = "21",
number = "2",
pages = "34:1--34:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3649455",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3649455",
abstract = "DRAM memory is a performance bottleneck for many
applications, due to its high access latency. Previous
work has mainly focused on data locality, introducing
small but fast regions to cache frequently accessed
data, thereby reducing the average latency. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "34",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Canesche:2024:DSA,
author = "Michael Canesche and Vanderson Ros{\'a}rio and Edson
Borin and Fernando Quint{\~a}o Pereira",
title = "The Droplet Search Algorithm for Kernel Scheduling",
journal = j-TACO,
volume = "21",
number = "2",
pages = "35:1--35:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3650109",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3650109",
abstract = "Kernel scheduling is the problem of finding the most
efficient implementation for a computational kernel.
Identifying this implementation involves experimenting
with the parameters of compiler optimizations, such as
the size of tiling windows and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "35",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pal:2024:CUA,
author = "Asmita Pal and Keerthana Desai and Rahul Chatterjee
and Joshua {San Miguel}",
title = "{Camouflage}: Utility-Aware Obfuscation for Accurate
Simulation of Sensitive Program Traces",
journal = j-TACO,
volume = "21",
number = "2",
pages = "36:1--36:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3650110",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3650110",
abstract = "Trace-based simulation is a widely used methodology
for system design exploration. It relies on realistic
traces that represent a range of behaviors necessary to
be evaluated, containing a lot of information about the
application, its inputs and the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "36",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Huan:2024:TNT,
author = "Chengying Huan and Yongchao Liu and Heng Zhang and
Shuaiwen Song and Santosh Pandey and Shiyang Chen and
Xiangfei Fang and Yue Jin and Baptiste Lepers and
Yanjun Wu and Hang Liu",
title = "{TEA+}: a Novel Temporal Graph Random Walk Engine with
Hybrid Storage Architecture",
journal = j-TACO,
volume = "21",
number = "2",
pages = "37:1--37:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3652604",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3652604",
abstract = "Many real-world networks are characterized by being
temporal and dynamic, wherein the temporal information
signifies the changes in connections, such as the
addition or removal of links between nodes. Employing
random walks on these temporal networks is a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "37",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hwang:2024:CTM,
author = "Soojin Hwang and Daehyeon Baek and Jongse Park and
Jaehyuk Huh",
title = "{Cerberus}: Triple Mode Acceleration of Sparse Matrix
and Vector Multiplication",
journal = j-TACO,
volume = "21",
number = "2",
pages = "38:1--38:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3653020",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3653020",
abstract = "The multiplication of sparse matrix and vector (SpMV)
is one of the most widely used kernels in
high-performance computing as well as machine learning
acceleration for sparse neural networks. The design
space of SpMV accelerators has two axes: algorithm
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "38",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Raman:2024:NGD,
author = "Siddhartha Raman Sundara Raman and Lizy John and
Jaydeep P. Kulkarni",
title = "{NEM-GNN}: {DAC\slash ADC}-less, Scalable,
Reconfigurable, Graph and Sparsity-Aware Near-Memory
Accelerator for Graph Neural Networks",
journal = j-TACO,
volume = "21",
number = "2",
pages = "39:1--39:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3652607",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3652607",
abstract = "Graph neural networks (GNNs) are of great interest in
real-life applications such as citation networks and
drug discovery owing to GNN's ability to apply machine
learning techniques on graphs. GNNs utilize a two-step
approach to classify the nodes in a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2024:XSH,
author = "Yan Chen and Qiwen Ke and Huiba Li and Yongwei Wu and
Yiming Zhang",
title = "{xMeta}: {SSD-HDD}-hybrid Optimization for Metadata
Maintenance of Cloud-scale Object Storage",
journal = j-TACO,
volume = "21",
number = "2",
pages = "40:1--40:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3652606",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3652606",
abstract = "Object storage has been widely used in the cloud.
Traditionally, the size of object metadata is much
smaller than that of object data, and thus existing
object storage systems (such as Ceph and Oasis) can
place object data and metadata, respectively, on
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "40",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Singhal:2024:OHP,
author = "Vidush Singhal and Laith Sakka and Kirshanthan
Sundararajah and Ryan Newton and Milind Kulkarni",
title = "{Orchard}: Heterogeneous Parallelism and Fine-grained
Fusion for Complex Tree Traversals",
journal = j-TACO,
volume = "21",
number = "2",
pages = "41:1--41:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3652605",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 27 06:59:33 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3652605",
abstract = "Many applications are designed to perform traversals
on tree-like data structures. Fusing and parallelizing
these traversals enhance the performance of
applications. Fusing multiple traversals improves the
locality of the application. The runtime of an
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "41",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Falahati:2024:CCD,
author = "Hajar Falahati and Mohammad Sadrosadati and Qiumin Xu
and Juan G{\'o}mez-Luna and Banafsheh Saber Latibari
and Hyeran Jeon and Shaahin Hesaabi and Hamid
Sarbazi-Azad and Onur Mutlu and Murali Annavaram and
Masoud Pedram",
title = "Cross-core Data Sharing for Energy-efficient {GPUs}",
journal = j-TACO,
volume = "21",
number = "3",
pages = "42:1--42:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3653019",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3653019",
abstract = "Graphics Processing Units (GPUs) are the accelerator
of choice in a variety of application domains, because
they can accelerate massively parallel workloads and
can be easily programmed using general-purpose
programming frameworks such as CUDA and OpenCL. Each
Streaming Multiprocessor (SM) contains an L1 data cache
(L1D) to exploit the locality in data accesses. L1D
misses are costly for GPUs for two reasons. First, L1D
misses consume a lot of energy as they need to access
the L2 cache (L2) via an on-chip network and the
off-chip DRAM in case of L2 misses. Second, L1D misses
impose performance overhead if the GPU does not have
enough active warps to hide the long memory access
latency. We observe that threads running on different
SMs share 55\% of the data they read from the memory.
Unfortunately, as the L1Ds are in the non-coherent
memory domain, each SM independently fetches data from
the L2 or the off-chip memory into its L1D, even though
the data may be currently available in the L1D of
another SM. Our goal is to service L1D read misses via
other SMs, as much as possible, to cut down costly
accesses to the L2 or the off-chip DRAM. To this end,
we propose a new data-sharing mechanism, called
Cross-Core Data Sharing (CCDS). CCDS employs a
predictor to estimate whether the required cache block
exists in another SM. If the block is predicted to
exist in another SM's L1D, then CCDS fetches the data
from the L1D that contain the block. Our experiments on
a suite of 26 workloads show that CCDS improves average
energy and performance by 1.30$ \times $ and 1.20$
\times $, respectively, compared to the baseline GPU.
Compared to the state-of-the-art data-sharing
mechanism, CCDS improves average energy and performance
by 1.37$ \times $ and 1.11$ \times $, respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "42",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lee:2024:RRS,
author = "Ching-Jui Lee and Tsung Tai Yeh",
title = "{ReSA}: Reconfigurable Systolic Array for Multiple
Tiny {DNN} Tensors",
journal = j-TACO,
volume = "21",
number = "3",
pages = "43:1--43:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3653363",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3653363",
abstract = "Systolic array architecture has significantly
accelerated deep neural networks (DNNs). A systolic
array comprises multiple processing elements (PEs) that
can perform multiply-accumulate (MAC). Traditionally,
the systolic array can execute a certain amount
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "43",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2024:EPM,
author = "Ziheng Wang and Xiaoshe Dong and Yan Kang and Heng
Chen and Qiang Wang",
title = "An Example of Parallel {Merkle} Tree Traversal:
Post-Quantum {Leighton--Micali} Signature on the
{GPU}",
journal = j-TACO,
volume = "21",
number = "3",
pages = "44:1--44:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3659209",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3659209",
abstract = "The hash-based signature (HBS) is the most
conservative and time-consuming among many post-quantum
cryptography (PQC) algorithms. Two HBSs, LMS and XMSS,
are the only PQC algorithms standardised by the
National Institute of Standards and Technology (NIST)
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "44",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wu:2024:KAM,
author = "Jiang Wu and Zhuo Zhang and Deheng Yang and Jianjun Xu
and Jiayu He and Xiaoguang Mao",
title = "Knowledge-Augmented Mutation-Based Bug Localization
for Hardware Design Code",
journal = j-TACO,
volume = "21",
number = "3",
pages = "45:1--45:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3660526",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3660526",
abstract = "Verification of hardware design code is crucial for
the quality assurance of hardware products. Being an
indispensable part of verification, localizing bugs in
the hardware design code is significant for hardware
development but is often regarded as a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "45",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ding:2024:DCE,
author = "Chen Ding and Jian Zhou and Kai Lu and Sicen Li and
Yiqin Xiong and Jiguang Wan and Ling Zhan",
title = "{D$^2$Comp}: Efficient Offload of {LSM}-tree
Compaction with Data Processing Units on Disaggregated
Storage",
journal = j-TACO,
volume = "21",
number = "3",
pages = "46:1--46:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3656584",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3656584",
abstract = "LSM-based key-value stores suffer from sub-optimal
performance due to their slow and heavy background
compactions. The compaction brings severe CPU and
network overhead on high-speed disaggregated storage.
This article further reveals that data-intensive
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "46",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2024:INM,
author = "Zhuohao Wang and Lei Liu and Limin Xiao",
title = "{iSwap}: a New Memory Page Swap Mechanism for Reducing
Ineffective {I/O} Operations in Cloud Environments",
journal = j-TACO,
volume = "21",
number = "3",
pages = "47:1--47:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3653302",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3653302",
abstract = "This article proposes iSwap, a new memory page swap
mechanism that reduces the ineffective I/O swap
operations and improves the QoS for applications with a
high priority in cloud environments. iSwap works in the
OS kernel. iSwap accurately learns the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "47",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2024:GDA,
author = "Junkaixuan Li and Yi Kang",
title = "{GraphSER}: Distance-Aware Stream-Based Edge
Repartition for Many-Core Systems",
journal = j-TACO,
volume = "21",
number = "3",
pages = "48:1--48:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3661998",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3661998",
abstract = "With the explosive growth of graph data, distributed
graph processing has become popular, and many graph
hardware accelerators use distributed frameworks. Graph
partitioning is foundation in distributed graph
processing. However, dynamic changes in graph
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "48",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wu:2024:CNI,
author = "Ke Wu and Dezun Dong and Weixia Xu",
title = "{COER}: a Network Interface Offloading Architecture
for {RDMA} and Congestion Control Protocol Codesign",
journal = j-TACO,
volume = "21",
number = "3",
pages = "49:1--49:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3660525",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3660525",
abstract = "RDMA (Remote Direct Memory Access) networks require
efficient congestion control to maintain their high
throughput and low latency characteristics. However,
congestion control protocols deployed at the software
layer suffer from slow response times due to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "49",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Liu:2024:IAS,
author = "Qunyou Liu and Darong Huang and Luis Costero and
Marina Zapater and David Atienza",
title = "Intermediate Address Space: virtual memory
optimization of heterogeneous architectures for
cache-resident workloads",
journal = j-TACO,
volume = "21",
number = "3",
pages = "50:1--50:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3659207",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3659207",
abstract = "The increasing demand for computing power and the
emergence of heterogeneous computing architectures have
driven the exploration of innovative techniques to
address current limitations in both the compute and
memory subsystems. One such solution is the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "50",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Min:2024:CCE,
author = "Dongmoon Min and Ilkwon Byun and Gyu-Hyeon Lee and
Jangwoo Kim",
title = "{CoolDC}: a Cost-Effective Immersion-Cooled Datacenter
with Workload-Aware Temperature Scaling",
journal = j-TACO,
volume = "21",
number = "3",
pages = "51:1--51:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3664925",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3664925",
abstract = "For datacenter architects, it is the most important
goal to minimize the datacenter's total cost of
ownership for the target performance (i.e.,
TCO/performance). As the major component of a
datacenter is a server farm, the most effective way of
reducing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "51",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhou:2024:SSA,
author = "Hai Zhou and Dan Feng",
title = "Stripe-schedule Aware Repair in Erasure-coded Clusters
with Heterogeneous Star Networks",
journal = j-TACO,
volume = "21",
number = "3",
pages = "52:1--52:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3664926",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3664926",
abstract = "More and more storage systems use erasure code to
tolerate faults. It takes pieces of data blocks as
input and encodes a small number of parity blocks as
output, where these blocks form a stripe. When
reconsidering the recovery problem in the multi-stripe
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Deng:2024:FPE,
author = "Bobin Deng and Bhargava Nadendla and Kun Suo and Yixin
Xie and Dan Chia-Tien Lo",
title = "Fixed-point Encoding and Architecture Exploration for
Residue Number Systems",
journal = j-TACO,
volume = "21",
number = "3",
pages = "53:1--53:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3664923",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3664923",
abstract = "Residue Number Systems (RNS) demonstrate the
fascinating potential to serve integer addition\slash
multiplication-intensive applications. The complexity
of Artificial Intelligence (AI) models has grown
enormously in recent years. From a computer system's
perspective, ensuring the training of these large-scale
AI models within an adequate time and energy
consumption has become a big concern. Matrix
multiplication is a dominant subroutine in many
prevailing AI models, with an addition\slash
multiplication-intensive attribute. However, the data
type of matrix multiplication within machine learning
training typically requires real numbers, which
indicates that RNS benefits for integer applications
cannot be directly gained by AI training. The
state-of-the-art RNS real-number encodings, including
floating-point and fixed-point, have defects and can be
further enhanced. To transform default RNS benefits to
the efficiency of large-scale AI training, we propose a
low-cost and high-accuracy RNS fixed-point
representation: Single RNS Logical Partition
(S-RNS-Logic-P) representation with Scaling-down
Postprocessing Multiplication (SD-Post-Mul). Moreover,
we extend the implementation details of the other two
RNS fixed-point methods: Double RNS Concatenation and
S-RNS-Logic-P representation with Scaling-down
Preprocessing Multiplication. We also design the
architectures of these three fixed-point multipliers.
In empirical experiments, our S-RNS-Logic-P
representation with SD-Post-Mul method achieves less
latency and energy overhead while maintaining good
accuracy. Furthermore, this method can easily extend to
the Redundant Residue Number System to raise the
efficiency of error-tolerant domains, such as improving
the error correction efficiency of quantum computing.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "53",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2024:OSM,
author = "Yizhuo Wang and Fangli Chang and Bingxin Wei and
Jianhua Gao and Weixing Ji",
title = "Optimization of Sparse Matrix Computation for
Algebraic Multigrid on {GPUs}",
journal = j-TACO,
volume = "21",
number = "3",
pages = "54:1--54:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3664924",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3664924",
abstract = "AMG is one of the most efficient and widely used
methods for solving sparse linear systems. The
computational process of AMG mainly consists of a
series of iterative calculations of generalized sparse
matrix-matrix multiplication (SpGEMM) and sparse
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "54",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2024:AMA,
author = "Luming Wang and Xu Zhang and Songyue Wang and Zhuolun
Jiang and Tianyue Lu and Mingyu Chen and Siwei Luo and
Keji Huang",
title = "Asynchronous Memory Access Unit: Exploiting Massive
Parallelism for Far Memory Access",
journal = j-TACO,
volume = "21",
number = "3",
pages = "55:1--55:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3663479",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3663479",
abstract = "The growing memory demands of modern applications have
driven the adoption of far memory technologies in data
centers to provide cost-effective, high-capacity memory
solutions. However, far memory presents new performance
challenges because its access \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhao:2024:SOD,
author = "Yunping Zhao and Sheng Ma and Hengzhu Liu and
Dongsheng Li",
title = "{SAL}: Optimizing the Dataflow of Spin-based
Architectures for Lightweight Neural Networks",
journal = j-TACO,
volume = "21",
number = "3",
pages = "56:1--56:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3673654",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3673654",
abstract = "As the Convolutional Neural Network (CNN) goes deeper
and more complex, the network becomes memory-intensive
and computation-intensive. To address this issue, the
lightweight neural network reduces parameters and
Multiplication-and-Accumulation (MAC) \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "56",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lu:2024:SLL,
author = "Kai Lu and Siqi Zhao and Haikang Shan and Qiang Wei
and Guokuan Li and Jiguang Wan and Ting Yao and Huatao
Wu and Daohui Wang",
title = "{Scythe}: a Low-latency {RDMA}-enabled Distributed
Transaction System for Disaggregated Memory",
journal = j-TACO,
volume = "21",
number = "3",
pages = "57:1--57:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3666004",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3666004",
abstract = "Disaggregated memory separates compute and memory
resources into independent pools connected by RDMA
(Remote Direct Memory Access) networks, which can
improve memory utilization, reduce cost, and enable
elastic scaling of compute and memory resources.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "57",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Peng:2024:LER,
author = "Wangqi Peng and Yusen Li and Xiaoguang Liu and Gang
Wang",
title = "{Lavender}: an Efficient Resource Partitioning
Framework for Large-Scale Job Colocation",
journal = j-TACO,
volume = "21",
number = "3",
pages = "58:1--58:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3674736",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3674736",
abstract = "Workload consolidation is a widely used approach to
enhance resource utilization in modern data centers.
However, the concurrent execution of multiple jobs on a
shared server introduces contention for essential
shared resources such as CPU cores, Last \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "58",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2024:ATE,
author = "Feng Zhang and Fulin Nan and Binbin Xu and Zhirong
Shen and Jiebin Zhai and Dmitrii Kalplun and Jiwu Shu",
title = "Achieving Tunable Erasure Coding with Cluster-Aware
Redundancy Transitioning",
journal = j-TACO,
volume = "21",
number = "3",
pages = "59:1--59:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3672077",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3672077",
abstract = "Erasure coding has been demonstrated as a
storage-efficient means against failures, yet its
tunability remains a challenging issue in data centers,
which is prone to induce substantial cross-cluster
traffic. In this article, we present ClusterRT, a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "59",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Olgun:2024:SDP,
author = "Ataberk Olgun and F. Nisa Bostanci and Geraldo
Francisco de Oliveira Junior and Yahya Can Tugrul and
Rahul Bera and Abdullah Giray Yaglikci and Hasan Hassan
and Oguz Ergin and Onur Mutlu",
title = "Sectored {DRAM}: a Practical Energy-Efficient and
High-Performance Fine-Grained {DRAM} Architecture",
journal = j-TACO,
volume = "21",
number = "3",
pages = "60:1--60:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3673653",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3673653",
abstract = "Modern computing systems access data in main memory at
coarse granularity (e.g., at 512-bit cache block
granularity). Coarse-grained access leads to wasted
energy because the system does not use all individually
accessed small portions (e.g., words, each \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "60",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wei:2024:RRI,
author = "Xiaohui Wei and Chenyang Wang and Hengshan Yue and
Jingweijia Tan and Zeyu Guan and Nan Jiang and Xinyang
Zheng and Jianpeng Zhao and Meikang Qiu",
title = "{ReIPE}: Recycling Idle {PEs} in {CNN} Accelerator for
Vulnerable Filters Soft-Error Detection",
journal = j-TACO,
volume = "21",
number = "3",
pages = "61:1--61:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3674909",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3674909",
abstract = "To satisfy prohibitively massive computational
requirements of current deep Convolutional Neural
Networks (CNNs), CNN-specific accelerators are widely
deployed in large-scale systems. Caused by high-energy
neutrons and $ \alpha $-particle strikes, soft error
may \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "61",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2024:COL,
author = "Qiao Li and Yu Chen and Guanyu Wu and Yajuan Du and
Min Ye and Xinbiao Gan and Jie Zhang and Zhirong Shen
and Jiwu Shu and Chun Xue",
title = "Characterizing and Optimizing {LDPC} Performance on
{$3$D} {NAND} Flash Memories",
journal = j-TACO,
volume = "21",
number = "3",
pages = "62:1--62:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3663478",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3663478",
abstract = "With the development of NAND flash memories' bit
density and stacking technologies, while storage
capacity keeps increasing, the issue of reliability
becomes increasingly prominent. Low-density parity
check (LDPC) code, as a robust error-correcting code,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "62",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xu:2024:RAR,
author = "Jiahong Xu and Haikun Liu and Zhuohui Duan and Xiaofei
Liao and Hai Jin and Xiaokang Yang and Huize Li and
Cong Liu and Fubing Mao and Yu Zhang",
title = "{ReHarvest}: an {ADC} Resource-Harvesting Crossbar
Architecture for {ReRAM}-Based {DNN} Accelerators",
journal = j-TACO,
volume = "21",
number = "3",
pages = "63:1--63:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3659208",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3659208",
abstract = "ReRAM-based Processing-In-Memory (PIM) architectures
have been increasingly explored to accelerate various
Deep Neural Network (DNN) applications because they can
achieve extremely high performance and
energy-efficiency for in-situ analog Matrix-Vector
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "63",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wu:2024:TAS,
author = "Jiang Wu and Zhuo Zhang and Deheng Yang and Jianjun Xu
and Jiayu He and Xiaoguang Mao",
title = "Time-Aware Spectrum-Based Bug Localization for
Hardware Design Code with Data Purification",
journal = j-TACO,
volume = "21",
number = "3",
pages = "64:1--64:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3678009",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 21 06:05:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/10.1145/3678009",
abstract = "The verification of hardware design code is a critical
aspect in ensuring the quality and reliability of
hardware products. Finding bugs in hardware design code
is important for hardware development and is frequently
considered as a notoriously \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "64",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}