%%% -*-BibTeX-*-
%%% ====================================================================
%%% BibTeX-file{
%%% author = "Nelson H. F. Beebe",
%%% version = "1.14",
%%% date = "28 August 2008",
%%% time = "13:26:22 MDT",
%%% filename = "taco.bib",
%%% address = "University of Utah
%%% Department of Mathematics, 110 LCB
%%% 155 S 1400 E RM 233
%%% Salt Lake City, UT 84112-0090
%%% USA",
%%% telephone = "+1 801 581 5254",
%%% FAX = "+1 801 581 4148",
%%% URL = "http://www.math.utah.edu/~beebe",
%%% checksum = "59331 2653 13011 118493",
%%% email = "beebe at math.utah.edu, beebe at acm.org,
%%% beebe at computer.org (Internet)",
%%% codetable = "ISO/ASCII",
%%% keywords = "ACM Transactions on Architecture and Code
%%% Optimization; bibliography; TACO",
%%% license = "public domain",
%%% supported = "yes",
%%% docstring = "This is a COMPLETE BibTeX bibliography for
%%% ACM Transactions on Architecture and Code
%%% Optimization (CODEN ????, ISSN 1544-3566),
%%% covering all journal issues from 2004 to
%%% date.
%%%
%%% At version 1.14, the COMPLETE journal
%%% coverage looked like this:
%%%
%%% 2004 ( 17) 2006 ( 19) 2008 ( 17)
%%% 2005 ( 17) 2007 ( 19)
%%%
%%% Article: 89
%%%
%%% Total entries: 89
%%%
%%% The journal Web page can be found at:
%%%
%%% http://www.acm.org/pubs/taco.html
%%%
%%% The journal table of contents page is at:
%%%
%%% http://www.acm.org/taco/
%%% http://portal.acm.org/browse_dl.cfm?linked=1&part=transaction&idx=J924
%%%
%%% Qualified subscribers can retrieve the full
%%% text of recent articles in PDF form.
%%%
%%% The initial draft was extracted from the ACM
%%% Web pages.
%%%
%%% ACM copyrights explicitly permit abstracting
%%% with credit, so article abstracts, keywords,
%%% and subject classifications have been
%%% included in this bibliography wherever
%%% available. Article reviews have been
%%% omitted, until their copyright status has
%%% been clarified.
%%%
%%% bibsource keys in the bibliography entries
%%% below indicate the entry originally came
%%% from the computer science bibliography
%%% archive, even though it has likely since
%%% been corrected and updated.
%%%
%%% URL keys in the bibliography point to
%%% World Wide Web locations of additional
%%% information about the entry.
%%%
%%% BibTeX citation tags are uniformly chosen
%%% as name:year:abbrev, where name is the
%%% family name of the first author or editor,
%%% year is a 4-digit number, and abbrev is a
%%% 3-letter condensation of important title
%%% words. Citation tags were automatically
%%% generated by software developed for the
%%% BibNet Project.
%%%
%%% In this bibliography, entries are sorted in
%%% publication order, using ``bibsort -byvolume.''
%%%
%%% The checksum field above contains a CRC-16
%%% checksum as the first value, followed by the
%%% equivalent of the standard UNIX wc (word
%%% count) utility output of lines, words, and
%%% characters. This is produced by Robert
%%% Solovay's checksum utility."
%%% }
%%% ====================================================================
%%% Preamble text: inputs bibnames.sty and defines \TM as a superscript
%%% small-caps TM mark.
@preamble{
  {\input bibnames.sty} #
  {\def \TM {${}^{\sc TM}$}}
}
%%% ====================================================================
%%% Acknowledgement abbreviations:
@string{ack-nhfb =
  "Nelson H. F. Beebe, University of Utah,
   Department of Mathematics, 110 LCB, 155 S 1400 E RM 233,
   Salt Lake City, UT 84112-0090, USA,
   Tel: +1 801 581 5254, FAX: +1 801 581 4148,
   e-mail: \path|beebe@math.utah.edu|, \path|beebe@acm.org|,
   \path|beebe@computer.org| (Internet),
   URL: \path|http://www.math.utah.edu/~beebe/|"
}
%%% ====================================================================
%%% Journal abbreviations:
@string{j-TACO =
  {ACM Transactions on Architecture and Code Optimization}
}
%%% ====================================================================
%%% Bibliography entries:
@article{Calder:2004:I,
  author          = {Brad Calder and Dean Tullsen},
  title           = {Introduction},
  journal         = j-TACO,
  volume          = {1},
  number          = {1},
  pages           = {1--2},
  month           = mar,
  year            = {2004},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Aug 5 07:08:09 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Zhang:2004:RIC,
  author          = {W. Zhang and J. S. Hu and V. Degalahal and
                     M. Kandemir and N. Vijaykrishnan and M. J. Irwin},
  title           = {Reducing instruction cache energy consumption
                     using a compiler-based strategy},
  journal         = j-TACO,
  volume          = {1},
  number          = {1},
  pages           = {3--33},
  month           = mar,
  year            = {2004},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Aug 5 07:08:09 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Isailovic:2004:DCQ,
  author          = {Nemanja Isailovic and Mark Whitney and
                     Yatish Patel and John Kubiatowicz and
                     Dean Copsey and Frederic T. Chong and
                     Isaac L. Chuang and Mark Oskin},
  title           = {Datapath and control for quantum wires},
  journal         = j-TACO,
  volume          = {1},
  number          = {1},
  pages           = {34--61},
  month           = mar,
  year            = {2004},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Aug 5 07:08:09 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Sankaralingam:2004:TPA,
  author          = {Karthikeyan Sankaralingam and Ramadass Nagarajan and
                     Haiming Liu and Changkyu Kim and Jaehyuk Huh and
                     Nitya Ranganathan and Doug Burger and
                     Stephen W. Keckler and Robert G. McDonald and
                     Charles R. Moore},
  title           = {{TRIPS}: {A} polymorphous architecture for
                     exploiting {ILP}, {TLP}, and {DLP}},
  journal         = j-TACO,
  volume          = {1},
  number          = {1},
  pages           = {62--93},
  month           = mar,
  year            = {2004},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Aug 5 07:08:09 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Skadron:2004:TAM,
  author          = {Kevin Skadron and Mircea R. Stan and
                     Karthik Sankaranarayanan and Wei Huang and
                     Sivakumar Velusamy and David Tarjan},
  title           = {Temperature-aware microarchitecture: {Modeling}
                     and implementation},
  journal         = j-TACO,
  volume          = {1},
  number          = {1},
  pages           = {94--125},
  month           = mar,
  year            = {2004},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Aug 5 07:08:09 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Aleta:2004:RCC,
  author          = {Alex Alet{\`a} and Josep M. Codina and
                     Antonio Gonz{\'a}lez and David Kaeli},
  title           = {Removing communications in clustered
                     microarchitectures through instruction
                     replication},
  journal         = j-TACO,
  volume          = {1},
  number          = {2},
  pages           = {127--151},
  month           = jun,
  year            = {2004},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Aug 5 07:08:10 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Bai:2004:LPO,
  author          = {Yu Bai and R. Iris Bahar},
  title           = {A low-power in-order\slash out-of-order issue
                     queue},
  journal         = j-TACO,
  volume          = {1},
  number          = {2},
  pages           = {152--179},
  month           = jun,
  year            = {2004},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Aug 5 07:08:10 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Juang:2004:IBP,
  author          = {Philo Juang and Kevin Skadron and
                     Margaret Martonosi and Zhigang Hu and
                     Douglas W. Clark and Philip W. Diodato and
                     Stefanos Kaxiras},
  title           = {Implementing branch-predictor decay using
                     quasi-static memory cells},
  journal         = j-TACO,
  volume          = {1},
  number          = {2},
  pages           = {180--219},
  month           = jun,
  year            = {2004},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Aug 5 07:08:10 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Santana:2004:LCF,
  author          = {Oliverio J. Santana and Alex Ramirez and
                     Josep L. Larriba-Pey and Mateo Valero},
  title           = {A low-complexity fetch architecture for
                     high-performance superscalar processors},
  journal         = j-TACO,
  volume          = {1},
  number          = {2},
  pages           = {220--245},
  month           = jun,
  year            = {2004},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Aug 5 07:08:10 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Lin:2004:CFS,
  author          = {Jin Lin and Tong Chen and Wei-Chung Hsu and
                     Pen-Chung Yew and Roy Dz-Ching Ju and
                     Tin-Fook Ngai and Sun Chan},
  title           = {A compiler framework for speculative
                     optimizations},
  journal         = j-TACO,
  volume          = {1},
  number          = {3},
  pages           = {247--271},
  month           = sep,
  year            = {2004},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Fri Oct 29 06:39:45 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Fields:2004:ICS,
  author          = {Brian A. Fields and Rastislav Bodik and
                     Mark D. Hill and Chris J. Newburn},
  title           = {Interaction cost and shotgun profiling},
  journal         = j-TACO,
  volume          = {1},
  number          = {3},
  pages           = {272--304},
  month           = sep,
  year            = {2004},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Fri Oct 29 06:39:45 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Sankaranarayanan:2004:PBA,
  author          = {Karthik Sankaranarayanan and Kevin Skadron},
  title           = {Profile-based adaptation for cache decay},
  journal         = j-TACO,
  volume          = {1},
  number          = {3},
  pages           = {305--322},
  month           = sep,
  year            = {2004},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Fri Oct 29 06:39:45 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Xie:2004:IDV,
  author          = {Fen Xie and Margaret Martonosi and Sharad Malik},
  title           = {Intraprogram dynamic voltage scaling: {Bounding}
                     opportunities with analytic modeling},
  journal         = j-TACO,
  volume          = {1},
  number          = {3},
  pages           = {323--367},
  month           = sep,
  year            = {2004},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Fri Oct 29 06:39:45 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Hartstein:2004:OPD,
  author          = {A. Hartstein and Thomas R. Puzak},
  title           = {The optimum pipeline depth considering both
                     power and performance},
  journal         = j-TACO,
  volume          = {1},
  number          = {4},
  pages           = {369--388},
  month           = dec,
  year            = {2004},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Apr 14 12:17:47 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Cristal:2004:TKI,
  author          = {Adri{\'a}n Cristal and Oliverio J. Santana and
                     Mateo Valero and Jos{\'e} F. Mart{\'\i}nez},
  title           = {Toward kilo-instruction processors},
  journal         = j-TACO,
  volume          = {1},
  number          = {4},
  pages           = {389--417},
  month           = dec,
  year            = {2004},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Apr 14 12:17:47 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Akkary:2004:ARE,
  author          = {Haitham Akkary and Ravi Rajwar and
                     Srikanth T. Srinivasan},
  title           = {An analysis of a resource efficient checkpoint
                     architecture},
  journal         = j-TACO,
  volume          = {1},
  number          = {4},
  pages           = {418--444},
  month           = dec,
  year            = {2004},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Apr 14 12:17:47 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Yang:2004:TML,
  author          = {Chia-Lin Yang and Alvin R. Lebeck and
                     Hung-Wei Tseng and Chien-Hao Lee},
  title           = {Tolerating memory latency through push
                     prefetching for pointer-intensive applications},
  journal         = j-TACO,
  volume          = {1},
  number          = {4},
  pages           = {445--475},
  month           = dec,
  year            = {2004},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Apr 14 12:17:47 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Calder:2005:I,
  author          = {Brad Calder and Dean Tullsen},
  title           = {Introduction},
  journal         = j-TACO,
  volume          = {2},
  number          = {1},
  pages           = {1--2},
  month           = mar,
  year            = {2005},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Mon May 2 11:13:58 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Zhou:2005:EFA,
  author          = {Yuanyuan Zhou and Pin Zhou and Feng Qin and
                     Wei Liu and Josep Torrellas},
  title           = {Efficient and flexible architectural support for
                     dynamic monitoring},
  journal         = j-TACO,
  volume          = {2},
  number          = {1},
  pages           = {3--33},
  month           = mar,
  year            = {2005},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Mon May 2 11:13:58 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Zhang:2005:WHC,
  author          = {Chuanjun Zhang and Frank Vahid and Jun Yang and
                     Walid Najjar},
  title           = {A way-halting cache for low-energy
                     high-performance systems},
  journal         = j-TACO,
  volume          = {2},
  number          = {1},
  pages           = {34--54},
  month           = mar,
  year            = {2005},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Mon May 2 11:13:58 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Abella:2005:ISP,
  author          = {Jaume Abella and Antonio Gonz{\'a}lez and
                     Xavier Vera and Michael F. P. O'Boyle},
  title           = {{IATAC}: {A} smart predictor to turn-off {L2}
                     cache lines},
  journal         = j-TACO,
  volume          = {2},
  number          = {1},
  pages           = {55--77},
  month           = mar,
  year            = {2005},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Mon May 2 11:13:58 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Haskins:2005:AWS,
  author          = {John W. {Haskins, Jr.} and Kevin Skadron},
  title           = {Accelerated warmup for sampled microarchitecture
                     simulation},
  journal         = j-TACO,
  volume          = {2},
  number          = {1},
  pages           = {78--108},
  month           = mar,
  year            = {2005},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Mon May 2 11:13:58 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Li:2005:ABT,
  author          = {Tao Li and Ravi Bhargava and Lizy Kurian John},
  title           = {Adapting branch-target buffer to improve the
                     target predictability of {Java} code},
  journal         = j-TACO,
  volume          = {2},
  number          = {2},
  pages           = {109--130},
  month           = jun,
  year            = {2005},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Jul 7 14:09:53 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Zhang:2005:DIE,
  author          = {Lingli Zhang and Chandra Krintz},
  title           = {The design, implementation, and evaluation of
                     adaptive code unloading for resource-constrained
                     devices},
  journal         = j-TACO,
  volume          = {2},
  number          = {2},
  pages           = {131--164},
  month           = jun,
  year            = {2005},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Jul 7 14:09:53 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Kulkarni:2005:FES,
  author          = {Prasad A. Kulkarni and Stephen R. Hines and
                     David B. Whalley and Jason D. Hiser and
                     Jack W. Davidson and Douglas L. Jones},
  title           = {Fast and efficient searches for effective
                     optimization-phase sequences},
  journal         = j-TACO,
  volume          = {2},
  number          = {2},
  pages           = {165--198},
  month           = jun,
  year            = {2005},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Jul 7 14:09:53 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Salami:2005:DMI,
  author          = {Esther Salam{\'\i} and Mateo Valero},
  title           = {Dynamic memory interval test vs.\ interprocedural
                     pointer analysis in multimedia applications},
  journal         = j-TACO,
  volume          = {2},
  number          = {2},
  pages           = {199--219},
  month           = jun,
  year            = {2005},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Jul 7 14:09:53 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Meng:2005:ELL,
  author          = {Yan Meng and Timothy Sherwood and Ryan Kastner},
  title           = {Exploring the limits of leakage power reduction
                     in caches},
  journal         = j-TACO,
  volume          = {2},
  number          = {3},
  pages           = {221--246},
  month           = sep,
  year            = {2005},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Wed Oct 5 07:42:22 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Garzaran:2005:TBS,
  author          = {Mar{\'\i}a Jes{\'u}s Garzar{\'a}n and
                     Milos Prvulovic and
                     Jos{\'e} Mar{\'\i}a Llaber{\'\i}a and
                     V{\'\i}ctor Vi{\~n}als and
                     Lawrence Rauchwerger and Josep Torrellas},
  title           = {Tradeoffs in buffering speculative memory state
                     for thread-level speculation in multiprocessors},
  journal         = j-TACO,
  volume          = {2},
  number          = {3},
  pages           = {247--279},
  month           = sep,
  year            = {2005},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Wed Oct 5 07:42:22 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Tarjan:2005:MPG,
  author          = {David Tarjan and Kevin Skadron},
  title           = {Merging path and gshare indexing in perceptron
                     branch prediction},
  journal         = j-TACO,
  volume          = {2},
  number          = {3},
  pages           = {280--300},
  month           = sep,
  year            = {2005},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Wed Oct 5 07:42:22 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Zhang:2005:WET,
  author          = {Xiangyu Zhang and Rajiv Gupta},
  title           = {Whole execution traces and their applications},
  journal         = j-TACO,
  volume          = {2},
  number          = {3},
  pages           = {301--334},
  month           = sep,
  year            = {2005},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Wed Oct 5 07:42:22 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Zhao:2005:IWA,
  author          = {Wankang Zhao and David Whalley and
                     Christopher Healy and Frank Mueller},
  title           = {Improving {WCET} by applying a {WC}
                     code-positioning optimization},
  journal         = j-TACO,
  volume          = {2},
  number          = {4},
  pages           = {335--365},
  month           = dec,
  year            = {2005},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Feb 16 11:03:13 MST 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
  keywords        = {WCET (worst case execution time); WC (worst case)},
}
@article{Reis:2005:SCF,
  author          = {George A. Reis and Jonathan Chang and
                     Neil Vachharajani and Ram Rangan and
                     David I. August and Shubhendu S. Mukherjee},
  title           = {Software-controlled fault tolerance},
  journal         = j-TACO,
  volume          = {2},
  number          = {4},
  pages           = {366--396},
  month           = dec,
  year            = {2005},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Feb 16 11:03:13 MST 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Li:2005:PPC,
  author          = {Jian Li and Jos{\'e} F. Mart{\'\i}nez},
  title           = {Power-performance considerations of parallel
                     computing on chip multiprocessors},
  journal         = j-TACO,
  volume          = {2},
  number          = {4},
  pages           = {397--422},
  month           = dec,
  year            = {2005},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Feb 16 11:03:13 MST 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Sharma:2005:SPE,
  author          = {Saurabh Sharma and Jesse G. Beu and
                     Thomas M. Conte},
  title           = {Spectral prefetcher: {An} effective mechanism
                     for {L2} cache prefetching},
  journal         = j-TACO,
  volume          = {2},
  number          = {4},
  pages           = {423--450},
  month           = dec,
  year            = {2005},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu Feb 16 11:03:13 MST 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Calder:2006:I,
  author          = {Brad Calder and Dean Tullsen},
  title           = {Introduction},
  journal         = j-TACO,
  volume          = {3},
  number          = {1},
  pages           = {1--2},
  month           = mar,
  year            = {2006},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Tan:2006:BSS,
  author          = {Lin Tan and Brett Brotherton and
                     Timothy Sherwood},
  title           = {Bit-split string-matching engines for intrusion
                     detection and prevention},
  journal         = j-TACO,
  volume          = {3},
  number          = {1},
  pages           = {3--34},
  month           = mar,
  year            = {2006},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Nagpurkar:2006:ERP,
  author          = {Priya Nagpurkar and Hussam Mousa and
                     Chandra Krintz and Timothy Sherwood},
  title           = {Efficient remote profiling for
                     resource-constrained devices},
  journal         = j-TACO,
  volume          = {3},
  number          = {1},
  pages           = {35--66},
  month           = mar,
  year            = {2006},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Lin:2006:RCG,
  author          = {Jin Lin and Wei-Chung Hsu and Pen-Chung Yew and
                     Roy Dz-Ching Ju and Tin-Fook Ngai},
  title           = {Recovery code generation for general speculative
                     optimizations},
  journal         = j-TACO,
  volume          = {3},
  number          = {1},
  pages           = {67--89},
  month           = mar,
  year            = {2006},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Choi:2006:ORR,
  author          = {Yoonseo Choi and Hwansoo Han},
  title           = {Optimal register reassignment for register stack
                     overflow minimization},
  journal         = j-TACO,
  volume          = {3},
  number          = {1},
  pages           = {90--114},
  month           = mar,
  year            = {2006},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Xue:2006:LOA,
  author          = {Jingling Xue and Qiong Cai},
  title           = {A lifetime optimal algorithm for speculative {PRE}},
  journal         = j-TACO,
  volume          = {3},
  number          = {2},
  pages           = {115--155},
  month           = jun,
  year            = {2006},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Fri Jun 9 06:47:22 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Sharkey:2006:IPT,
  author          = {Joseph J. Sharkey and Dmitry V. Ponomarev and
                     Kanad Ghose and Oguz Ergin},
  title           = {Instruction packing: {Toward} fast and
                     energy-efficient instruction scheduling},
  journal         = j-TACO,
  volume          = {3},
  number          = {2},
  pages           = {156--181},
  month           = jun,
  year            = {2006},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Fri Jun 9 06:47:22 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Ceze:2006:CUC,
  author          = {Luis Ceze and Karin Strauss and James Tuck and
                     Josep Torrellas and Jose Renau},
  title           = {{CAVA}: {Using} checkpoint-assisted value
                     prediction to hide {L2} misses},
  journal         = j-TACO,
  volume          = {3},
  number          = {2},
  pages           = {182--208},
  month           = jun,
  year            = {2006},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Fri Jun 9 06:47:22 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Zhang:2006:EAR,
  author          = {Lixin Zhang and Mike Parker and John Carter},
  title           = {Efficient address remapping in distributed
                     shared-memory systems},
  journal         = j-TACO,
  volume          = {3},
  number          = {2},
  pages           = {209--229},
  month           = jun,
  year            = {2006},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Fri Jun 9 06:47:22 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Zhao:2006:ATP,
  author          = {Min Zhao and Bruce R. Childers and
                     Mary Lou Soffa},
  title           = {An approach toward profit-driven optimization},
  journal         = j-TACO,
  volume          = {3},
  number          = {3},
  pages           = {231--262},
  month           = sep,
  year            = {2006},
  coden           = {????},
  doi             = {10.1145/1162690.1162691},
  issn            = {1544-3566},
  bibdate         = {Sat Sep 23 07:54:36 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  abstract        = {Although optimizations have been applied for a number
                     of years to improve the performance of software,
                     problems with respect to the application of
                     optimizations have not been adequately addressed. For
                     example, in certain circumstances, optimizations may
                     degrade performance. However, there is no efficient way
                     to know when a degradation will occur. In this
                     research, we investigate the profitability of
                     optimizations, which is useful for determining the
                     benefit of applying optimizations. We develop a
                     framework that enables us to predict profitability
                     using analytic models. The profitability of an
                     optimization depends on code context, the particular
                     optimization, and machine resources. Thus, our
                     framework has analytic models for each of these
                     components. As part of the framework, there is also a
                     profitability engine that uses models to predict the
                     profit. In this paper, we target scalar optimizations
                     and, in particular, describe the models for partial
                     redundancy elimination (PRE), loop invariant code
                     motion (LICM), and value numbering (VN). We implemented
                     the framework for predicting the profitability of these
                     optimizations. Based on the predictions, we can
                     selectively apply profitable optimizations. We compared
                     the profit-driven approach with an approach that uses a
                     heuristic in deciding when optimizations should be
                     applied. Our experiments demonstrate that the
                     profitability of scalar optimizations can be accurately
                     predicted by using models. That is, without actually
                     applying a scalar optimization, we can determine if an
                     optimization is beneficial and should be applied.},
  acknowledgement = ack-nhfb,
}
@article{Hazelwood:2006:MBC,
  author          = {Kim Hazelwood and Michael D. Smith},
  title           = {Managing bounded code caches in dynamic binary
                     optimization systems},
  journal         = j-TACO,
  volume          = {3},
  number          = {3},
  pages           = {263--294},
  month           = sep,
  year            = {2006},
  coden           = {????},
  doi             = {10.1145/1162690.1162692},
  issn            = {1544-3566},
  bibdate         = {Sat Sep 23 07:54:36 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  abstract        = {Dynamic binary optimizers store altered copies of
                     original program instructions in software-managed code
                     caches in order to maximize reuse of transformed code.
                     Code caches store code blocks that may vary in size,
                     reference other code blocks, and carry a high
                     replacement overhead. These unique constraints reduce
                     the effectiveness of conventional cache management
                     policies. Our work directly addresses these unique
                     constraints and presents several contributions to the
                     code-cache management problem. First, we show that
                     evicting more than the minimum number of code blocks
                     from the code cache results in less run-time overhead
                     than the existing alternatives. Such granular evictions
                     reduce overall execution time, as the fixed costs of
                     invoking the eviction mechanism are amortized across
                     multiple cache insertions. Second, a study of the ideal
                     lifetimes of dynamically generated code blocks
                     illustrates the benefit of a replacement algorithm
                     based on a generational heuristic. We describe and
                     evaluate a generational approach to code cache
                     management that makes it easy to identify long-lived
                     code blocks and simultaneously avoid any fragmentation
                     because of the eviction of short-lived blocks. Finally,
                     we present results from an implementation of our
                     generational approach in the DynamoRIO framework and
                     illustrate that, as dynamic optimization systems become
                     more prevalent, effective code cache-management
                     policies will be essential for reliable, scalable
                     performance of modern applications.},
  acknowledgement = ack-nhfb,
}
@article{Rochecouste:2006:CCE,
  author          = {Olivier Rochecouste and Gilles Pokam and
                     Andr{\'e} Seznec},
  title           = {A case for a complexity-effective,
                     width-partitioned microarchitecture},
  journal         = j-TACO,
  volume          = {3},
  number          = {3},
  pages           = {295--326},
  month           = sep,
  year            = {2006},
  coden           = {????},
  doi             = {10.1145/1162690.1162693},
  issn            = {1544-3566},
  bibdate         = {Sat Sep 23 07:54:36 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  abstract        = {The analysis of program executions reveals that most
                     integer and multimedia applications make heavy use of
                     narrow-width operations, i.e., instructions exclusively
                     using narrow-width operands and producing a
                     narrow-width result. Moreover, this usage is relatively
                     well distributed over the application. We observed this
                     program property on the MediaBench and SPEC2000
                     benchmarks with about 40\% of the instructions being
                     narrow-width operations. Current superscalar processors
                     use 64-bit datapaths to execute all the instructions of
                     the applications. In this paper, we suggest the use of
                     a width-partitioned microarchitecture (WPM) to master
                     the hardware complexity of a superscalar processor. For
                     a four-way issue machine, we split the processor in two
                     two-way clusters: the main cluster executing 64-bit
                     operations, load/store, and complex operations and a
                     narrow cluster executing the 16-bit operations. We
                     resort to partitioning to decouple the treatment of the
                     narrow-width operations from that of the other program
                     instructions. This provides the benefit of greatly
                     simplifying the design of the critical processor
                     components in each cluster (e.g., the register file and
                     the bypass network). The dynamic interleaving of the
                     two instruction types allows maintaining the workload
                     balanced among clusters. WPM also helps to reduce the
                     complexity of the interconnection fabric and of the
                     issue logic. In fact, since the 16-bit cluster can only
                     communicate narrow-width data, the datapath-width of
                     the interconnect fabric can be significantly reduced,
                     yielding a corresponding saving of the interconnect
                     power and area. We explore different possible
                     configurations of WPM, discussing the various
                     implementation tradeoffs. We also examine a speculative
                     steering heuristic to distribute the narrow-width
                     operations among clusters. A detailed analysis of the
                     complexity factors shows using WPM instead of a
                     classical 64-bit two-cluster microarchitecture can save
                     power and silicon area with a minimal impact on the
                     overall performance.},
  acknowledgement = ack-nhfb,
}
@article{Zmily:2006:BAI,
  author          = {Ahmad Zmily and Christos Kozyrakis},
  title           = {Block-aware instruction set architecture},
  journal         = j-TACO,
  volume          = {3},
  number          = {3},
  pages           = {327--357},
  month           = sep,
  year            = {2006},
  coden           = {????},
  doi             = {10.1145/1162690.1162694},
  issn            = {1544-3566},
  bibdate         = {Sat Sep 23 07:54:36 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  abstract        = {Instruction delivery is a critical component for
                     wide-issue, high-frequency processors since its
                     bandwidth and accuracy place an upper limit on
                     performance. The processor front-end accuracy and
                     bandwidth are limited by instruction-cache misses,
                     multicycle instruction-cache accesses, and target or
                     direction mispredictions for control-flow operations.
                     This paper presents a block-aware instruction set
                     (BLISS) that allows software to assist with front-end
                     challenges. BLISS defines basic block descriptors that
                     are stored separately from the actual instructions in a
                     program. We show that BLISS allows for a decoupled
                     front-end that tolerates instruction-cache latency,
                     facilitates instruction prefetching, and leads to
                     higher prediction accuracy.},
  acknowledgement = ack-nhfb,
}
@article{Crandall:2006:MAS,
  author          = {Jedidiah R. Crandall and S. Felix Wu and
                     Frederic T. Chong},
  title           = {{Minos}: {Architectural} support for protecting
                     control data},
  journal         = j-TACO,
  volume          = {3},
  number          = {4},
  pages           = {359--389},
  month           = dec,
  year            = {2006},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Marathe:2006:ACC,
  author          = {Jaydeep Marathe and Frank Mueller and
                     Bronis R. de Supinski},
  title           = {Analysis of cache-coherence bottlenecks with
                     hybrid hardware\slash software techniques},
  journal         = j-TACO,
  volume          = {3},
  number          = {4},
  pages           = {390--423},
  month           = dec,
  year            = {2006},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Ganusov:2006:FEP,
  author          = {Ilya Ganusov and Martin Burtscher},
  title           = {Future execution: {A} prefetching mechanism that
                     uses multiple cores to speed up single threads},
  journal         = j-TACO,
  volume          = {3},
  number          = {4},
  pages           = {424--449},
  month           = dec,
  year            = {2006},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Co:2006:ETC,
  author          = {Michele Co and Dee A. B. Weikle and
                     Kevin Skadron},
  title           = {Evaluating trace cache energy efficiency},
  journal         = j-TACO,
  volume          = {3},
  number          = {4},
  pages           = {450--476},
  month           = dec,
  year            = {2006},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Hu:2006:EMM,
  author          = {Shiwen Hu and Madhavi Valluri and
                     Lizy Kurian John},
  title           = {Effective management of multiple configurable
                     units using dynamic optimization},
  journal         = j-TACO,
  volume          = {3},
  number          = {4},
  pages           = {477--501},
  month           = dec,
  year            = {2006},
  coden           = {????},
  issn            = {1544-3566},
  bibdate         = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
@article{Bentley:2006:IAB,
  author = {Chris Bentley and Scott A. Watterson and David K.
    Lowenthal and Barry Rountree},
  title = {Implicit array bounds checking on 64-bit
    architectures},
  journal = j-TACO,
  volume = {3},
  number = {4},
  pages = {502--527},
  month = dec,
  year = {2006},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1187976.1187982},
  issn = {1544-3566},
  bibdate = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource = {http://portal.acm.org/},
  abstract = {Several programming languages guarantee that array
    subscripts are checked to ensure they are within the
    bounds of the array. While this guarantee improves the
    correctness and security of array-based code, it adds
    overhead to array references. This has been an obstacle
    to using higher-level languages, such as Java, for
    high-performance parallel computing, where the language
    specification requires that all array accesses must be
    checked to ensure they are within bounds. This is
    because, in practice, array-bounds checking in
    scientific applications may increase execution time by
    more than a factor of 2. Previous research has explored
    optimizations to statically eliminate bounds checks,
    but the dynamic nature of many scientific codes makes
    this difficult or impossible. Our approach is, instead,
    to create a compiler and operating system
    infrastructure that does not generate explicit bounds
    checks. It instead places arrays inside of Index
    Confinement Regions (ICRs), which are large, isolated,
    mostly unmapped virtual memory regions. Any array
    reference outside of its bounds will cause a protection
    violation; this provides implicit bounds checking. Our
    results show that when applying this infrastructure to
    high-performance computing programs written in Java,
    the overhead of bounds checking relative to a program
    with no bounds checks is reduced from an average of
    63\% to an average of 9\%.},
  acknowledgement = ack-nhfb,
}
@Article{Calder:2007:I,
author = "Brad Calder and Dean Tullsen",
title = "Introduction",
journal = j-TACO,
volume = "4",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
articleno = "1",
}
@Article{Constantinides:2007:ARC,
author = "Kypros Constantinides and Stephen Plaza and Jason
Blome and Valeria Bertacco and Scott Mahlke and Todd
Austin and Bin Zhang and Michael Orshansky",
title = "Architecting a reliable {CMP} switch architecture",
journal = j-TACO,
volume = "4",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
articleno = "2",
}
@Article{Hwang:2007:SSA,
author = "Yuan-Shin Hwang and Jia-Jhe Li",
title = "Snug set-associative caches: {Reducing} leakage power
of instruction and data caches with no performance
penalties",
journal = j-TACO,
volume = "4",
number = "1",
pages = "6:1--6:??",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
articleno = "6",
}
@Article{Luo:2007:CNP,
author = "Yan Luo and Jia Yu and Jun Yang and Laxmi N. Bhuyan",
title = "Conserving network processor power consumption by
exploiting traffic variability",
journal = j-TACO,
volume = "4",
number = "1",
pages = "4:1--4:??",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
articleno = "4",
}
@Article{Rong:2007:SDS,
author = "Hongbo Rong and Zhizhong Tang and R. Govindarajan and
Alban Douillet and Guang R. Gao",
title = "Single-dimension software pipelining for
multidimensional loops",
journal = j-TACO,
volume = "4",
number = "1",
pages = "7:1--7:??",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
articleno = "7",
}
@Article{Sasanka:2007:AES,
author = "Ruchira Sasanka and Man-Lap Li and Sarita V. Adve and
Yen-Kuang Chen and Eric Debes",
title = "{ALP}: {Efficient} support for all levels of
parallelism for complex media applications",
journal = j-TACO,
volume = "4",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
articleno = "3",
}
@Article{Soteriou:2007:SDP,
author = "Vassos Soteriou and Noel Eisley and Li-Shiuan Peh",
title = "Software-directed power-aware interconnection
networks",
journal = j-TACO,
volume = "4",
number = "1",
pages = "5:1--5:??",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
articleno = "5",
}
@article{Bower:2007:ODH,
  author = {Fred A. Bower and Daniel J. Sorin and Sule Ozev},
  title = {Online diagnosis of hard faults in microprocessors},
  journal = j-TACO,
  volume = {4},
  number = {2},
  pages = {8:1--8:??},
  month = jun,
  year = {2007},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1250727.1250728},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:40:54 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {We develop a microprocessor design that tolerates hard
    faults, including fabrication defects and in-field
    faults, by leveraging existing microprocessor
    redundancy. To do this, we must: detect and correct
    errors, diagnose hard faults at the field
    deconfigurable unit (FDU) granularity, and deconfigure
    FDUs with hard faults. In our reliable microprocessor
    design, we use DIVA dynamic verification to detect and
    correct errors. Our new scheme for diagnosing hard
    faults tracks instructions' core structure occupancy
    from decode until commit. If a DIVA checker detects an
    error in an instruction, it increments a small
    saturating error counter for every FDU used by that
    instruction, including that DIVA checker. A hard fault
    in an FDU quickly leads to an above-threshold error
    counter for that FDU and thus diagnoses the fault. For
    deconfiguration, we use previously developed schemes
    for functional units and buffers and present a scheme
    for deconfiguring DIVA checkers. Experimental results
    show that our reliable microprocessor quickly and
    accurately diagnoses each hard fault that is injected
    and continues to function, albeit with somewhat
    degraded performance.},
  acknowledgement = ack-nhfb,
  articleno = {8},
  keywords = {fine-grained diagnosis; hard fault tolerance;
    processor microarchitecture},
}
@article{Michaud:2007:STM,
  author = {Pierre Michaud and Andr{\'e} Seznec and Damien Fetis
    and Yiannakis Sazeides and Theofanis Constantinou},
  title = {A study of thread migration in temperature-constrained
    multicores},
  journal = j-TACO,
  volume = {4},
  number = {2},
  pages = {9:1--9:??},
  month = jun,
  year = {2007},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1250727.1250729},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:40:54 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {Temperature has become an important constraint in
    high-performance processors, especially multicores.
    Thread migration will be essential to exploit the full
    potential of future thermally constrained multicores.
    We propose and study a thread migration method that
    maximizes performance under a temperature constraint,
    while minimizing the number of migrations and ensuring
    fairness between threads. We show that thread migration
    brings important performance gains and that it is most
    effective during the first tens of seconds following a
    decrease of the number of running threads.},
  acknowledgement = ack-nhfb,
  articleno = {9},
  keywords = {multicore processor; power density; temperature;
    thermal management; thread migration},
}
@article{Chen:2007:CRL,
  author = {Yu Chen and Fuxin Zhang},
  title = {Code reordering on limited branch offset},
  journal = j-TACO,
  volume = {4},
  number = {2},
  pages = {10:1--10:??},
  month = jun,
  year = {2007},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1250727.1250730},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:40:54 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {Since the 1980's code reordering has gained popularity
    as an important way to improve the spatial locality of
    programs. While the effect of the processor's
    microarchitecture and memory hierarchy on this
    optimization technique has been investigated, little
    research has focused on the impact of the instruction
    set. In this paper, we analyze the effect of limited
    branch offset of the MIPS-like instruction set [Hwu et
    al. 2004, 2005] on code reordering, explore two simple
    methods to handle the exceeded branches, and propose
    the bidirectional code layout (BCL) algorithm to reduce
    the number of branches exceeding the offset limit. The
    BCL algorithm sorts the chains according to the
    position of related chains, avoids cache conflict
    misses deliberately and lays out the code
    bidirectionally. It strikes a balance among the
    distance of related blocks, the instruction cache miss
    rate, the memory size required, and the control flow
    transfer. Experimental results show that BCL can
    effectively reduce exceeded branches by 50.1\%, on
    average, with up to 100\% for some programs. Except for
    some programs with little spatial locality, the BCL
    algorithm can achieve the performance, as the case with
    no branch offset limitation.},
  acknowledgement = ack-nhfb,
  articleno = {10},
  keywords = {code reordering; Godson Processor; link-time
    optimization},
}
@article{Terechko:2007:ICC,
  author = {A. S. Terechko and H. Corporaal},
  title = {Inter-cluster communication in {VLIW} architectures},
  journal = j-TACO,
  volume = {4},
  number = {2},
  pages = {11:1--11:??},
  month = jun,
  year = {2007},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1250727.1250731},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:40:54 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {The traditional VLIW (very long instruction word)
    architecture with a single register file does not scale
    up well to address growing performance demands on
    embedded media processors. However, splitting a VLIW
    processor in smaller clusters, which are comprised of
    function units fully connected to local register files,
    can significantly improve VLSI implementation
    characteristics of the processor, such as speed, energy
    consumption, and area. In our paper we reveal that
    achieving the best characteristics of a clustered VLIW
    requires a thorough selection of an Inter-cluster
    Communication (ICC) model, which is the way clustering
    is exposed in the Instruction Set Architecture. For our
    study we, first, define a taxonomy of ICC models
    including copy operations, dedicated issue slots,
    extended operands, extended results, and multicast.
    Evaluation of the execution time of the models requires
    both the dynamic cycle count and clock period. We
    developed an advanced instruction scheduler for all the
    five ICC models in order to quantify the dynamic cycle
    counts of our multimedia C benchmarks. To assess the
    clock period of the ICC models we designed and laid out
    VLIW datapaths using the RTL hardware descriptions
    derived from a deeply pipelined commercial TriMedia
    processor. In contrast to prior art, our research shows
    that fully distributed register file architectures
    (with eight clusters in our study) often underperform
    compared to moderately clustered machines with two or
    four clusters because of explosion of the cycle count
    overhead in the former. Among the evaluated ICC models,
    performance of the copy operation model, popular both
    in academia and industry, is severely limited by the
    copy operations hampering scheduling of regular
    operations in high ILP (instruction-level parallelism)
    code. The dedicated issue slots model combats this
    limitation by dedicating extra VLIW issue slots purely
    for ICC, reaching the highest 1.74 execution time
    speedup relative to the unicluster. Furthermore, our
    VLSI experiments show that the lowest area and energy
    consumption of 42 and 57\% relative to the unicluster,
    respectively, are achieved by the extended operands
    model, which, nevertheless, provides higher performance
    than the copy operation model.},
  acknowledgement = ack-nhfb,
  articleno = {11},
  keywords = {clock frequency; cluster assignment; instruction-level
    parallelism; instruction scheduler; intercluster
    communication; optimizing compiler; pipelining;
    register allocation; VLIW},
}
@article{Dou:2007:CCM,
  author = {Jialin Dou and Marcelo Cintra},
  title = {A compiler cost model for speculative
    parallelization},
  journal = j-TACO,
  volume = {4},
  number = {2},
  pages = {12:1--12:??},
  month = jun,
  year = {2007},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1250727.1250732},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:40:54 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {Speculative parallelization is a technique that allows
    code sections that cannot be fully analyzed by the
    compiler to be aggressively executed in parallel.
    However, while speculative parallelization can
    potentially deliver significant speedups, several
    overheads associated with this technique can limit
    these speedups in practice. This paper proposes a novel
    compiler static cost model of speculative multithreaded
    execution that can be used to predict the resulting
    performance. This model attempts to predict the
    expected speedups, or slowdowns, of the candidate
    speculative sections based on the estimation of the
    combined runtime effects of various overheads, and
    taking into account the scheduling restrictions of most
    speculative execution environments. The model is based
    on estimating the likely execution duration of threads
    and considers all the possible permutations of these
    threads. This model also produces a quantitative
    estimate of the speedup, which is different from prior
    heuristics that only qualitatively estimate the
    benefits of speculative multithreaded execution. In
    previous work, a limited version of the framework was
    evaluated on a number of loops from a collection of
    SPEC benchmarks that suffer mainly from load imbalance
    and thread dispatch and commit overheads. In this work,
    an extended framework is also evaluated on loops that
    may suffer from data-dependence violations.
    Experimental results show that prediction accuracy is
    lower when loops with violations are included.
    Nevertheless, accuracy is still very high for a static
    model: the framework can identify, on average, 45\% of
    the loops that cause slowdowns and, on average, 96\% of
    the loops that lead to speedups; it predicts the
    speedups or slowdowns with an error of less than 20\%
    for an average of 28\% of the loops across the
    benchmarks and with an error of less than 50\% for an
    average of 80\% of the loops. Overall, the framework
    often outperforms, by as much as 25\%, a naive approach
    that attempts to speculatively parallelize all the
    loops considered, and is able to curb the large
    slowdowns caused in many cases by this naive
    approach.},
  acknowledgement = ack-nhfb,
  articleno = {12},
  keywords = {speculative multithreading; speculative
    parallelization; thread-level speculation},
}
@article{Amme:2007:SBM,
  author = {Wolfram Amme and Jeffery von Ronne and Michael Franz},
  title = {{SSA}-based mobile code: {Implementation} and
    empirical evaluation},
  journal = j-TACO,
  volume = {4},
  number = {2},
  pages = {13:1--13:??},
  month = jun,
  year = {2007},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1250727.1250733},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:40:54 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {Although one might expect transportation formats based
    on static single-assignment form (SSA) to yield faster
    just-in-time compilation times than those based on
    stack-based virtual machines, this claim has not
    previously been validated, in practice. We attempt to
    quantify the effect of using an SSA-based mobile code
    representation by integrating support for a verifiable
    SSA-based IR into Jikes RVM. Performance results,
    measured with various optimizations and on both the
    IA32 and PowerPC, show improvements in both compilation
    time and code quality.},
  acknowledgement = ack-nhfb,
  articleno = {13},
  keywords = {SafeTSA; static single-assignment form; virtual
    machines},
}
@article{Li:2007:CCE,
  author = {Xiaodong Li and Ritu Gupta and Sarita V. Adve and
    Yuanyuan Zhou},
  title = {Cross-component energy management: {Joint} adaptation
    of processor and memory},
  journal = j-TACO,
  volume = {4},
  number = {3},
  pages = {14:1--14:??},
  month = sep,
  year = {2007},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1275937.1275938},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:41:20 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {Researchers have proposed the use of adaptation to
    reduce the energy consumption of different hardware
    components, such as the processor, memory, disk, and
    display for general-purpose applications. Previous
    algorithms to control these adaptations, however, have
    focused on a single component. This work takes the
    first step toward developing algorithms that can
    jointly control adaptations in multiple interacting
    components for general-purpose applications, with the
    goal of minimizing the total energy consumed within a
    specified performance loss. Specifically, we develop a
    joint-adaptation algorithm for processor and memory
    adaptations. We identify two properties that enable
    per-component algorithms to be easily used in a
    cross-component context---the algorithms' performance
    impact must be guaranteed and composable. We then
    modify a current processor and a memory algorithm to
    obey these properties. This allows the cross-component
    problem to be reduced to determine an appropriate
    (energy-optimal) allocation of the target performance
    loss (slack) between the two components. We develop
    such an optimal slack allocation algorithm that
    exploits the above properties. The result is an
    efficient cross-component adaptation framework that
    minimizes the total energy of the processor and memory
    without exceeding the target performance loss, while
    substantially leveraging current per-component
    algorithms. Our experiments show that joint processor
    and memory adaptation provides significantly more
    energy savings than adapting either component alone;
    intelligent slack distribution is specifically
    effective for highly compute- or memory-intensive
    applications; and the performance slowdown never
    exceeds the specification.},
  acknowledgement = ack-nhfb,
  articleno = {14},
  keywords = {adaptive systems; control algorithms; energy
    management; low-power design; memory; performance
    guarantee; processor},
}
@article{Gabor:2007:FES,
  author = {Ron Gabor and Shlomo Weiss and Avi Mendelson},
  title = {Fairness enforcement in switch on event
    multithreading},
  journal = j-TACO,
  volume = {4},
  number = {3},
  pages = {15:1--15:??},
  month = sep,
  year = {2007},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1275937.1275939},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:41:20 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {The need to reduce power and complexity will increase
    the interest in Switch On Event multithreading
    (coarse-grained multithreading). Switch On Event
    multithreading is a low-power and low-complexity
    mechanism to improve processor throughput by switching
    threads on execution stalls. Fairness may, however,
    become a problem in a multithreaded processor. Unless
    fairness is properly handled, some threads may starve
    while others consume all of the processor cycles.
    Heuristics that were devised in order to improve
    fairness in simultaneous multithreading are not
    applicable to Switch On Event multithreading. This
    paper defines the fairness metric using the ratio of
    the individual threads' speedups and shows how it can
    be enforced in Switch On Event multithreading. Fairness
    is controlled by forcing additional thread switch
    points. These switch points are determined dynamically
    by runtime estimation of the single threaded
    performance of each of the individual threads. We
    analyze the impact of the fairness enforcement
    mechanism on aggregate IPC and weighted speedup. We
    present simulation results of the performance of Switch
    On Event multithreading. Switch On Event multithreading
    achieves an average aggregate IPC increase of 26\% over
    single thread and 12\% weighted speedup when no
    fairness is enforced. In this case, a sixth of our runs
    resulted in poor fairness in which one thread ran
    extremely slowly (10 to 100 times slower than its
    single-thread performance), while the other thread's
    performance was hardly affected. By using the proposed
    mechanism, we can guarantee fairness at different
    levels of strictness and, in most cases, even improve
    the weighted speedup.},
  acknowledgement = ack-nhfb,
  articleno = {15},
  keywords = {coarse-grained multithreading; fairness;
    multithreading; performance; SOE; Switch on Event
    multithreading; throughput; weighted speedup},
}
@article{Andrade:2007:PAA,
  author = {Diego Andrade and Basilio B. Fraguela and Ram{\'o}n
    Doallo},
  title = {Precise automatable analytical modeling of the cache
    behavior of codes with indirections},
  journal = j-TACO,
  volume = {4},
  number = {3},
  pages = {16:1--16:??},
  month = sep,
  year = {2007},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1275937.1275940},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:41:20 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {The performance of memory hierarchies, in which caches
    play an essential role, is critical in nowadays
    general-purpose and embedded computing systems because
    of the growing memory bottleneck problem.
    Unfortunately, cache behavior is very unstable and
    difficult to predict. This is particularly true in the
    presence of irregular access patterns, which exhibit
    little locality. Such patterns are very common, for
    example, in applications in which pointers or
    compressed sparse matrices give place to indirections.
    Nevertheless, cache behavior in the presence of
    irregular access patterns has not been widely studied.
    In this paper we present an extension of a systematic
    analytical modeling technique based on PMEs
    (probabilistic miss equations), previously developed by
    the authors, that allows the automated analysis of the
    cache behavior for codes with irregular access patterns
    resulting from indirections. The model generates very
    accurate predictions despite the irregularities and has
    very low computing requirements, being the first model
    that gathers these desirable characteristics that can
    automatically analyze this kind of codes. These
    properties enable this model to help drive compiler
    optimizations, as we show with an example.},
  acknowledgement = ack-nhfb,
  articleno = {16},
  keywords = {analytical modeling; irregular access patterns; memory
    hierarchy; performance prediction},
}
%%% NOTE: "De Bosschere" is a two-word surname with a capitalized
%%% particle, so it is braced to stop BibTeX from parsing "De" as part
%%% of the given name (which would sort and abbreviate the author
%%% incorrectly).
@Article{Venstermans:2007:JOH,
author = "Kris Venstermans and Lieven Eeckhout and Koen
{De Bosschere}",
title = "{Java} object header elimination for reduced memory
consumption in 64-bit virtual machines",
journal = j-TACO,
volume = "4",
number = "3",
pages = "17:1--17:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1275937.1275941",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Memory performance is an important design issue for
contemporary computer systems given the huge
processor/memory speed gap. This paper proposes a
space-efficient Java object model for reducing the
memory consumption of 64-bit Java virtual machines. We
completely eliminate the object header through typed
virtual addressing (TVA) or implicit typing. TVA
encodes the object type in the object's virtual address
by allocating all objects of a given type in a
contiguous memory segment. This allows for removing the
type information as well as the status field from the
object header. Whenever type and status information is
needed, masking is applied to the object's virtual
address for obtaining an offset into type and status
information structures. Unlike previous work on
implicit typing, we apply TVA to a selected number of
frequently allocated object types, hence, the name
selective TVA (STVA); this limits the amount of memory
fragmentation. In addition to applying STVA, we also
compress the type information block (TIB) pointers for
all objects that do not fall under TVA. We implement
the space-efficient Java object model in the 64-bit
version of the Jikes RVM on an AIX IBM platform and
compare its performance against the traditionally used
Java object model using a multitude of Java benchmarks.
We conclude that the space-efficient Java object model
reduces memory consumption by on average 15\% (and up
to 45\% for some benchmarks). About one-half the
reduction comes from TIB pointer compression; the other
one-half comes from STVA. In terms of performance, the
space-efficient object model generally does not affect
performance; however, for some benchmarks we observe
statistically significant performance speedups, up to
20\%.",
acknowledgement = ack-nhfb,
articleno = "17",
keywords = "64-bit implementation; implicit typing; Java object
model; typed virtual addressing; Virtual machine",
}
@article{Xiao:2007:VIS,
  author = {Shu Xiao and Edmund M.-K. Lai},
  title = {{VLIW} instruction scheduling for minimal power
    variation},
  journal = j-TACO,
  volume = {4},
  number = {3},
  pages = {18:1--18:??},
  month = sep,
  year = {2007},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1275937.1275942},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:41:20 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {The focus of this paper is on the minimization of the
    variation in power consumed by a VLIW processor during
    the execution of a target program through instruction
    scheduling. The problem is formulated as a
    mixed-integer program (MIP) and a problem-specific
    branch-and-bound algorithm has been developed to solve
    it more efficiently than generic MIP solvers.
    Simulation results based on the TMS320C6711 VLIW
    digital signal processor using benchmarks from
    Mediabench and Trimaran showed that over 40\% average
    reduction in power variation can be achieved without
    sacrificing execution speed of these benchmarks.
    Computational requirements and convergence rates of our
    algorithm are also analyzed.},
  acknowledgement = ack-nhfb,
  articleno = {18},
  keywords = {instruction scheduling; power variation reduction;
    VLIW processors},
}
@Article{Tallam:2007:UCF,
author = "Sriraman Tallam and Rajiv Gupta",
title = "Unified control flow and data dependence traces",
journal = j-TACO,
volume = "4",
number = "3",
pages = "19:1--19:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1275937.1275943",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "We describe the design, generation, and compression of
the extended whole program path (eWPP), representation
that not only captures the control flow history of a
program execution but also its data dependence history.
This representation is motivated by the observation
that, typically, a significant fraction of data
dependence history can be recovered from the control
flow trace. To capture the remainder of the data
dependence history, we introduce disambiguation checks
in the program whose control flow signatures capture
the results of the checks. The resulting extended
control flow trace enables the recovery of otherwise
irrecoverable data dependences. The code for the checks
is designed to minimize the increase in program
execution time and the extended control flow trace size
when compared to directly collecting control flow and
address traces. Our experiments show that compressed
eWPPs are only one-quarter of the size of combined
compressed control flow and address traces. However,
their collection incurs a $5 \times$ increase in runtime
overhead relative to the overhead required for directly
collecting the control flow and address traces,
respectively.",
acknowledgement = ack-nhfb,
articleno = "19",
keywords = "address trace; control flow trace; dynamic data
dependence trace; profiling",
}
@article{Ipek:2008:EAD,
  author = {Engin Ipek and Sally A. McKee and Karan Singh and Rich
    Caruana and Bronis R. de Supinski and Martin Schulz},
  title = {Efficient architectural design space exploration via
    predictive modeling},
  journal = j-TACO,
  volume = {4},
  number = {4},
  pages = {1:1--1:??},
  month = jan,
  year = {2008},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1328195.1328196},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:41:35 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {Efficiently exploring exponential-size architectural
    design spaces with many interacting parameters remains
    an open problem: the sheer number of experiments
    required renders detailed simulation intractable. We
    attack this via an automated approach that builds
    accurate predictive models. We simulate sampled points,
    using results to teach our models the function
    describing relationships among design parameters. The
    models can be queried and are very fast, enabling
    efficient design tradeoff discovery. We validate our
    approach via two uniprocessor sensitivity studies,
    predicting IPC with only 1--2\% error. In an
    experimental study using the approach, training on 1\%
    of a 250-K-point CMP design space allows our models to
    predict performance with only 4--5\% error. Our
    predictive modeling combines well with techniques that
    reduce the time taken by each simulation experiment,
    achieving net time savings of three-four orders of
    magnitude.},
  acknowledgement = ack-nhfb,
  articleno = {1},
  keywords = {artificial neural networks; design space exploration;
    performance prediction; sensitivity studies},
}
@article{Shi:2008:VMS,
  author = {Yunhe Shi and Kevin Casey and M. Anton Ertl and David
    Gregg},
  title = {Virtual machine showdown: {Stack} versus registers},
  journal = j-TACO,
  volume = {4},
  number = {4},
  pages = {2:1--2:??},
  month = jan,
  year = {2008},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1328195.1328197},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:41:35 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {Virtual machines (VMs) enable the distribution of
    programs in an architecture-neutral format, which can
    easily be interpreted or compiled. A long-running
    question in the design of VMs is whether a stack
    architecture or register architecture can be
    implemented more efficiently with an interpreter. We
    extend existing work on comparing virtual stack and
    virtual register architectures in three ways. First,
    our translation from stack to register code and
    optimization are much more sophisticated. The result is
    that we eliminate an average of more than 46\% of
    executed VM instructions, with the bytecode size of the
    register machine being only 26\% larger than that of
    the corresponding stack one. Second, we present a fully
    functional virtual-register implementation of the Java
    virtual machine (JVM), which supports Intel, AMD64,
    PowerPC and Alpha processors. This register VM supports
    inline-threaded, direct-threaded, token-threaded, and
    switch dispatch. Third, we present experimental results
    on a range of additional optimizations such as register
    allocation and elimination of redundant heap loads. On
    the AMD64 architecture the register machine using
    switch dispatch achieves an average speedup of 1.48
    over the corresponding stack machine. Even using the
    more efficient inline-threaded dispatch, the register
    VM achieves a speedup of 1.15 over the equivalent
    stack-based VM.},
  acknowledgement = ack-nhfb,
  articleno = {2},
  keywords = {interpreter; register architecture; stack
    architecture; virtual machine},
}
%%% TACO 4(4), article 3, January 2008.
@article{Yan:2008:EVR,
  author          = {Jun Yan and Wei Zhang},
  title           = {Exploiting virtual registers to reduce pressure on
                     real registers},
  journal         = j-TACO,
  volume          = 4,
  number          = 4,
  pages           = {3:1--3:??},
  articleno       = 3,
  month           = jan,
  year            = 2008,
  coden           = {????},
  doi             = {http://doi.acm.org/10.1145/1328195.1328198},
  issn            = {1544-3566},
  bibdate         = {Mon Jun 16 11:41:35 MDT 2008},
  bibsource       = {http://portal.acm.org/},
  keywords        = {data forwarding; register allocation; register file;
                     short-lived variables; virtual register},
  abstract        = {It is well known that a large fraction of variables
                     are short-lived. This paper proposes a novel approach
                     to exploiting this fact to reduce the register pressure
                     for pipelined processors with data-forwarding network.
                     The idea is that the compiler can allocate virtual
                     registers (i.e., place holders to identify dependences
                     among instructions) to short-lived variables, which do
                     not need to be stored to physical storage locations. As
                     a result, real registers (i.e., physically existed
                     registers) can be reserved for long-lived variables for
                     mitigating the register pressure and decreasing the
                     register spills, leading to performance improvement. In
                     this paper, we develop the architectural and compiler
                     support for exploiting virtual registers for statically
                     scheduled processors. Our experimental results show
                     that virtual registers are very effective at reducing
                     the register spills, which, in many cases, can achieve
                     the performance close to the processor with twice
                     number of real registers. Our results also indicate
                     that, for some applications, using 24 virtual, in
                     addition to 8 real registers, can attain even higher
                     performance than that of 16 real without any virtual
                     registers.},
  acknowledgement = ack-nhfb,
}
%%% TACO 4(4), article 4, January 2008.
@article{Yu:2008:OCL,
  author          = {Zoe C. H. Yu and Francis C. M. Lau and Cho-Li Wang},
  title           = {Object co-location and memory reuse for {Java}
                     programs},
  journal         = j-TACO,
  volume          = 4,
  number          = 4,
  pages           = {4:1--4:??},
  articleno       = 4,
  month           = jan,
  year            = 2008,
  coden           = {????},
  doi             = {http://doi.acm.org/10.1145/1328195.1328199},
  issn            = {1544-3566},
  bibdate         = {Mon Jun 16 11:41:35 MDT 2008},
  bibsource       = {http://portal.acm.org/},
  keywords        = {garbage collector; Java; memory allocator; memory
                     reuse; mutator; object co-location},
  abstract        = {We introduce a new memory management system, STEMA,
                     which can improve the execution time of Java programs.
                     STEMA detects prolific types on-the-fly and co-locates
                     their objects in a special memory space which supports
                     reuse of memory. We argue and show that memory reuse
                     and co-location of prolific objects can result in
                     improved cache locality, reduced memory fragmentation,
                     reduced GC time, and faster object allocation. We
                     evaluate STEMA using 16 benchmarks. Experimental
                     results show that STEMA performs 2.7\%, 4.0\%, and
                     8.2\% on average better than MarkSweep, CopyMS, and
                     SemiSpace.},
  acknowledgement = ack-nhfb,
}
%%% TACO 4(4), article 5, January 2008.
@article{Zhang:2008:RCM,
  author          = {Chuanjun Zhang},
  title           = {Reducing cache misses through programmable decoders},
  journal         = j-TACO,
  volume          = 4,
  number          = 4,
  pages           = {5:1--5:??},
  articleno       = 5,
  month           = jan,
  year            = 2008,
  coden           = {????},
  doi             = {http://doi.acm.org/10.1145/1328195.1328200},
  issn            = {1544-3566},
  bibdate         = {Mon Jun 16 11:41:35 MDT 2008},
  bibsource       = {http://portal.acm.org/},
  keywords        = {cache; dynamic optimization; low power},
  abstract        = {Level-one caches normally reside on a processor's
                     critical path, which determines clock frequency.
                     Therefore, fast access to level-one cache is important.
                     Direct-mapped caches exhibit faster access time, but
                     poor hit rates, compared with same sized
                     set-associative caches because of nonuniform accesses
                     to the cache sets. The nonuniform accesses generate
                     more cache misses in some sets, while other sets are
                     underutilized. We propose to increase the decoder
                     length and, hence, reduce the accesses to heavily used
                     sets without dynamically detecting the cache set usage
                     information. We increase the access to the
                     underutilized cache sets by incorporating a replacement
                     policy into the cache design using programmable
                     decoders. On average, the proposed techniques achieve
                     as low a miss rate as a traditional 4-way cache on all
                     26 SPEC2K benchmarks for the instruction and data
                     caches, respectively. This translates into an average
                     IPC improvement of 21.5 and 42.4\% for SPEC2K integer
                     and floating-point benchmarks, respectively. The
                     B-Cache consumes 10.5\% more power per access, but
                     exhibits a 12\% total memory access-related energy
                     savings as a result of the miss rate reductions, and,
                     hence, the reduction to applications' execution time.
                     Compared with previous techniques that aim at reducing
                     the miss rate of direct-mapped caches, our technique
                     requires only one cycle to access all cache hits and
                     has the same access time of a direct-mapped cache.},
  acknowledgement = ack-nhfb,
}
%%% TACO 4(4), article 6, January 2008.
@article{Golander:2008:HMP,
  author          = {Amit Golander and Shlomo Weiss},
  title           = {Hiding the misprediction penalty of a
                     resource-efficient high-performance processor},
  journal         = j-TACO,
  volume          = 4,
  number          = 4,
  pages           = {6:1--6:??},
  articleno       = 6,
  month           = jan,
  year            = 2008,
  coden           = {????},
  doi             = {http://doi.acm.org/10.1145/1328195.1328201},
  issn            = {1544-3566},
  bibdate         = {Mon Jun 16 11:41:35 MDT 2008},
  bibsource       = {http://portal.acm.org/},
  keywords        = {checkpoints; misprediction; out-of-order execution;
                     rollback; scalable architecture},
  abstract        = {Misprediction is a major obstacle for increasing
                     speculative out-of-order processors performance.
                     Performance degradation depends on both the number of
                     misprediction events and the recovery time associated
                     with each one of them. In recent years a few checkpoint
                     based microarchitectures have been proposed. In
                     comparison with ROB-based processors, checkpoint
                     processors are scalable and highly resource efficient.
                     Unfortunately, in these proposals the misprediction
                     recovery time is proportional to the instruction queue
                     size.\par
                     In this paper we analyze methods to reduce the
                     misprediction recovery time. We propose a new register
                     file management scheme and techniques to selectively
                     flush the instruction queue and the load store queue,
                     and to isolate deeply pipelined execution units. The
                     result is a novel checkpoint processor with Constant
                     misprediction RollBack time (CRB). We further present a
                     streamlined, cost-efficient solution, which saves
                     complexity at the price of slightly lower
                     performance.},
  acknowledgement = ack-nhfb,
}
%%% TACO 5(1), article 1, May 2008 (editorial; no abstract or keywords).
@article{Calder:2008:E,
  author          = {Brad Calder and Dean Tullsen},
  title           = {Editorial},
  journal         = j-TACO,
  volume          = 5,
  number          = 1,
  pages           = {1:1--1:??},
  articleno       = 1,
  month           = may,
  year            = 2008,
  coden           = {????},
  doi             = {http://doi.acm.org/10.1145/1369396.1369397},
  issn            = {1544-3566},
  bibdate         = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}
%%% TACO 5(1), article 2, May 2008.
@article{Mysore:2008:FIP,
  author          = {Shashidhar Mysore and Banit Agrawal and Rodolfo Neuber
                     and Timothy Sherwood and Nisheeth Shrivastava and
                     Subhash Suri},
  title           = {Formulating and implementing profiling over adaptive
                     ranges},
  journal         = j-TACO,
  volume          = 5,
  number          = 1,
  pages           = {2:1--2:??},
  articleno       = 2,
  month           = may,
  year            = 2008,
  coden           = {????},
  doi             = {http://doi.acm.org/10.1145/1369396.1369398},
  issn            = {1544-3566},
  bibdate         = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource       = {http://portal.acm.org/},
  keywords        = {profiling hardware; range adaptive; value locality},
  abstract        = {Modern computer systems are called on to deal with
                     billions of events every second, whether they are
                     executed instructions, accessed memory locations, or
                     forwarded packets. This presents a serious challenge to
                     those who seek to quantify, analyze, or optimize such
                     systems, because important trends and behaviors may
                     easily be lost in a sea of data. We present
                     range-adaptive profiling (RAP) as a new and
                     general-purpose profiling method capable of
                     hierarchically efficiently classifying streams of data
                     in hardware. Through the use of RAP, events in an input
                     stream are dynamically classified into increasingly
                     precise categories, based on the frequency with which
                     they occur. The more important a class, or range of
                     events, the more precisely it is quantified. Despite
                     the dynamic nature of our technique, we build upon
                     tight theoretic bounds covering both worst-case error,
                     as well as the required memory. In the limit, it is
                     known that error and the memory bounds can be
                     independent of the stream size and grow only linearly
                     with the level of precision desired. Significantly, we
                     expose the critical constants in these algorithms and
                     through careful engineering, algorithm redesign, and
                     use of heuristics, we show how a high-performance
                     profile system can be implemented for range-adaptive
                     profiling. RAP can be used on various profiles, such as
                     PCs, load values, and memory addresses, and has a broad
                     range of uses, from hot-region profiling to quantifying
                     cache miss value locality. We propose two methods of
                     implementation of RAP, one in software and the other
                     with specialized hardware, for which we also describe
                     our prototype FPGA implementation. We show that with
                     just 8KB of memory, range profiles can be gathered with
                     an average accuracy of 98\%.},
  acknowledgement = ack-nhfb,
}
%%% TACO 5(1), article 3, May 2008.
@article{Zhai:2008:CHS,
  author          = {Antonia Zhai and J. Gregory Steffan and Christopher B.
                     Colohan and Todd C. Mowry},
  title           = {Compiler and hardware support for reducing the
                     synchronization of speculative threads},
  journal         = j-TACO,
  volume          = 5,
  number          = 1,
  pages           = {3:1--3:??},
  articleno       = 3,
  month           = may,
  year            = 2008,
  coden           = {????},
  doi             = {http://doi.acm.org/10.1145/1369396.1369399},
  issn            = {1544-3566},
  bibdate         = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource       = {http://portal.acm.org/},
  keywords        = {automatic parallelization; chip-multiprocessing;
                     instruction scheduling; thread-level speculation},
  abstract        = {Thread-level speculation (TLS) allows us to
                     automatically parallelize general-purpose programs by
                     supporting parallel execution of threads that might not
                     actually be independent. In this article, we focus on
                     one important limitation of program performance under
                     TLS, which stalls as a result of synchronizing and
                     forwarding scalar values between speculative threads
                     that would otherwise cause frequent data dependences
                     and, hence, failed speculation. Using SPECint
                     benchmarks that have been automatically transformed by
                     our compiler to exploit TLS, we present, evaluate in
                     detail, and compare both compiler and hardware
                     techniques for improving the communication of scalar
                     values. We find that through our dataflow algorithms
                     for three increasingly aggressive instruction
                     scheduling techniques, the compiler can drastically
                     reduce the critical forwarding path introduced by the
                     synchronization and forwarding of scalar values. We
                     also show that hardware techniques for reducing
                     synchronization can be complementary to compiler
                     scheduling, but that the additional performance
                     benefits are minimal and are generally not worth the
                     cost.},
  acknowledgement = ack-nhfb,
}
%%% TACO 5(1), article 4, May 2008.
@article{Winter:2008:ATN,
  author          = {Jonathan A. Winter and David H. Albonesi},
  title           = {Addressing thermal nonuniformity in {SMT} workloads},
  journal         = j-TACO,
  volume          = 5,
  number          = 1,
  pages           = {4:1--4:??},
  articleno       = 4,
  month           = may,
  year            = 2008,
  coden           = {????},
  doi             = {http://doi.acm.org/10.1145/1369396.1369400},
  issn            = {1544-3566},
  bibdate         = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource       = {http://portal.acm.org/},
  keywords        = {adaptive microarchitectures; clustered
                     microarchitectures; dynamic thermal management; dynamic
                     voltage scaling; simultaneous multithreading},
  abstract        = {We explore DTM techniques within the context of
                     uniform and nonuniform SMT workloads. While DVS is
                     suitable for addressing workloads with uniformly high
                     temperatures, for nonuniform workloads, performance
                     loss occurs because of the slowdown of the cooler
                     thread. To address this, we propose and evaluate DTM
                     mechanisms that exploit the steering-based thread
                     management mechanisms inherent in a clustered SMT
                     architecture. We show that in contrast to DVS, which
                     operates globally, our techniques are more effective at
                     controlling temperature for nonuniform workloads.
                     Furthermore, we devise a DTM technique that combines
                     steering and DVS to achieve consistently good
                     performance across all workloads.},
  acknowledgement = ack-nhfb,
}
%%% TACO 5(1), article 5, May 2008.
@article{Shahbahrami:2008:VES,
  author          = {Asadollah Shahbahrami and Ben Juurlink and Stamatis
                     Vassiliadis},
  title           = {Versatility of extended subwords and the matrix
                     register file},
  journal         = j-TACO,
  volume          = 5,
  number          = 1,
  pages           = {5:1--5:??},
  articleno       = 5,
  month           = may,
  year            = 2008,
  coden           = {????},
  doi             = {http://doi.acm.org/10.1145/1369396.1369401},
  issn            = {1544-3566},
  bibdate         = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource       = {http://portal.acm.org/},
  keywords        = {multimedia standards; SIMD architectures; SIMD
                     programming},
  abstract        = {Extended subwords and the matrix register file (MRF)
                     are two micro architectural techniques that address
                     some of the limitations of existing SIMD architectures.
                     Extended subwords are wider than the data stored in
                     memory. Specifically, for every byte of data stored in
                     memory, there are four extra bits in the media register
                     file. This avoids the need for data-type conversion
                     instructions. The MRF is a register file organization
                     that provides both conventional row-wise, as well as
                     column-wise, access to the register file. In other
                     words, it allows to view the register file as a matrix
                     in which corresponding subwords in different registers
                     corresponds to a column of the matrix. It was
                     introduced to accelerate matrix transposition which is
                     a very common operation in multimedia applications. In
                     this paper, we show that the MRF is very versatile,
                     since it can also be used for other permutations than
                     matrix transposition. Specifically, it is shown how it
                     can be used to provide efficient access to strided
                     data, as is needed in, e.g., color space conversion.
                     Furthermore, it is shown that special-purpose
                     instructions (SPIs), such as the sum-of-absolute
                     differences (SAD) instruction, have limited usefulness
                     when extended subwords and a few general SIMD
                     instructions that we propose are supported, for the
                     following reasons. First, when extended subwords are
                     supported, the SAD instruction provides only a
                     relatively small performance improvement. Second, the
                     SAD instruction processes 8-bit subwords only, which is
                     not sufficient for quarter-pixel resolution nor for
                     cost functions used in image and video retrieval.
                     Results obtained by extending the SimpleScalar toolset
                     show that the proposed techniques provide a speedup of
                     up to 3.00 over the MMX architecture. The results also
                     show that using, at most, 13 extra media registers
                     yields an additional performance improvement ranging
                     from 1.38 to 1.57.},
  acknowledgement = ack-nhfb,
}
%%% TACO 5(1), article 6, May 2008.
@article{Guo:2008:EHC,
  author          = {Zhi Guo and Walid Najjar and Betul Buyukkurt},
  title           = {Efficient hardware code generation for {FPGAs}},
  journal         = j-TACO,
  volume          = 5,
  number          = 1,
  pages           = {6:1--6:??},
  articleno       = 6,
  month           = may,
  year            = 2008,
  coden           = {????},
  doi             = {http://doi.acm.org/10.1145/1369396.1369402},
  issn            = {1544-3566},
  bibdate         = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource       = {http://portal.acm.org/},
  keywords        = {data reuse; FPGA; high-level synthesis; reconfigurable
                     computing; VHDL},
  abstract        = {The wider acceptance of FPGAs as a computing device
                     requires a higher level of programming abstraction.
                     ROCCC is an optimizing C to HDL compiler. We describe
                     the code generation approach in ROCCC. The smart buffer
                     is a component that reuses input data between adjacent
                     iterations. It significantly improves the performance
                     of the circuit and simplifies loop control. The
                     ROCCC-generated datapath can execute one loop iteration
                     per clock cycle when there is no loop dependency or
                     there is only scalar recurrence variable dependency.
                     ROCCC's approach to supporting while-loops operating on
                     scalars makes the compiler able to move scalar
                     iterative computation into hardware.},
  acknowledgement = ack-nhfb,
}
%%% TACO 5(1), article 7, May 2008.
@article{Kotzmann:2008:DJH,
  author          = {Thomas Kotzmann and Christian Wimmer and Hanspeter
                     M{\"o}ssenb{\"o}ck and Thomas Rodriguez and Kenneth
                     Russell and David Cox},
  title           = {Design of the {Java HotSpot\TM} client compiler for
                     {Java 6}},
  journal         = j-TACO,
  volume          = 5,
  number          = 1,
  pages           = {7:1--7:??},
  articleno       = 7,
  month           = may,
  year            = 2008,
  coden           = {????},
  doi             = {http://doi.acm.org/10.1145/1369396.1370017},
  issn            = {1544-3566},
  bibdate         = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource       = {http://portal.acm.org/},
  keywords        = {compiler; deoptimization; intermediate representation;
                     Java; just-in-time compilation; optimization; register
                     allocation},
  abstract        = {Version 6 of Sun Microsystems' Java HotSpot{\TM} VM
                     ships with a redesigned version of the client
                     just-in-time compiler that includes several research
                     results of the last years. The client compiler is at
                     the heart of the VM configuration used by default for
                     interactive desktop applications. For such
                     applications, low startup and pause times are more
                     important than peak performance. This paper outlines
                     the new architecture of the client compiler and shows
                     how it interacts with the VM. It presents the
                     intermediate representation that now uses static
                     single-assignment (SSA) form and the linear scan
                     algorithm for global register allocation. Efficient
                     support for exception handling and deoptimization
                     fulfills the demands that are imposed by the dynamic
                     features of the Java programming language. The
                     evaluation shows that the new client compiler generates
                     better code in less time. The popular SPECjvm98
                     benchmark suite is executed 45\% faster, while the
                     compilation speed is also up to 40\% better. This
                     indicates that a carefully selected set of global
                     optimizations can also be integrated in just-in-time
                     compilers that focus on compilation speed and not on
                     peak performance. In addition, the paper presents the
                     impact of several optimizations on execution and
                     compilation speed. As the source code is freely
                     available, the Java HotSpot{\TM} VM and the client
                     compiler are the ideal basis for experiments with new
                     feedback-directed optimizations in a production-level
                     Java just-in-time compiler. The paper outlines research
                     projects that add fast algorithms for escape analysis,
                     automatic object inlining, and array bounds check
                     elimination.},
  acknowledgement = ack-nhfb,
}
@Article{Rangan:2008:PSD,
author = "Ram Rangan and Neil Vachharajani and Guilherme Ottoni
and David I. August",
title = "Performance scalability of decoupled software
pipelining",
journal = j-TACO,
volume = "5",
number = "2",
pages = "8:1--8:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1400112.1400113",
ISSN = "1544-3566",
bibdate = "Thu Aug 28 13:25:00 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Any successful solution to using multicore processors
to scale general-purpose program performance will have
to contend with rising intercore communication costs
while exposing coarse-grained parallelism. Recently
proposed pipelined multithreading (PMT) techniques have
been demonstrated to have general-purpose applicability
and are also able to effectively tolerate inter-core
latencies through pipelined interthread communication.
T