%%% -*-BibTeX-*-
%%% ====================================================================
%%%  BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.14",
%%%     date            = "28 August 2008",
%%%     time            = "13:26:22 MDT",
%%%     filename        = "taco.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "http://www.math.utah.edu/~beebe",
%%%     checksum        = "59331 2653 13011 118493",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "ACM Transactions on Architecture and Code
%%%                        Optimization; bibliography; TACO",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        ACM Transactions on Architecture and Code
%%%                        Optimization (CODEN ????, ISSN 1544-3566),
%%%                        covering all journal issues from 2004 --
%%%                        date.
%%%
%%%                        At version 1.14, the COMPLETE journal
%%%                        coverage looked like this:
%%%
%%%                             2004 (  17)    2006 (  19)    2008 (  17)
%%%                             2005 (  17)    2007 (  19)
%%%
%%%                             Article:         89
%%%
%%%                             Total entries:   89
%%%
%%%                        The journal Web page can be found at:
%%%
%%%                            http://www.acm.org/pubs/taco.html
%%%
%%%                        The journal table of contents page is at:
%%%
%%%                            http://www.acm.org/taco/
%%%                            http://portal.acm.org/browse_dl.cfm?linked=1&part=transaction&idx=J924
%%%
%%%                        Qualified subscribers can retrieve the full
%%%                        text of recent articles in PDF form.
%%%
%%%                        The initial draft was extracted from the ACM
%%%                        Web pages.
%%%
%%%                        ACM copyrights explicitly permit abstracting
%%%                        with credit, so article abstracts, keywords,
%%%                        and subject classifications have been
%%%                        included in this bibliography wherever
%%%                        available. Article reviews have been
%%%                        omitted, until their copyright status has
%%%                        been clarified.
%%%
%%%                        bibsource keys in the bibliography entries
%%%                        below indicate the entry originally came
%%%                        from the computer science bibliography
%%%                        archive, even though it has likely since
%%%                        been corrected and updated.
%%%
%%%                        URL keys in the bibliography point to
%%%                        World Wide Web locations of additional
%%%                        information about the entry.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, using ``bibsort -byvolume.''
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters. This is produced by Robert
%%%                        Solovay's checksum utility."
%%%  }
%%% ====================================================================

%%% TeX code emitted ahead of the formatted bibliography: it loads
%%% bibnames.sty and defines the \TM trademark macro.
@Preamble{
    "\input bibnames.sty" #
    "\def \TM {${}^{\sc TM}$}"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:

@String{ack-nhfb = "Nelson H. F.
Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                    \path|beebe@acm.org|,
                    \path|beebe@computer.org| (Internet),
                    URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:

@String{j-TACO = "ACM Transactions on Architecture and Code
                 Optimization"}

%%% ====================================================================
%%% Bibliography entries:

%%% Volume 1, number 1 (March 2004)

@Article{Calder:2004:I,
  author =       "Brad Calder and Dean Tullsen",
  title =        "Introduction",
  journal =      j-TACO,
  volume =       "1",
  number =       "1",
  pages =        "1--2",
  month =        mar,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Aug 5 07:08:09 MDT 2004",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2004:RIC,
  author =       "W. Zhang and J. S. Hu and V. Degalahal and M. Kandemir
                 and N. Vijaykrishnan and M. J. Irwin",
  title =        "Reducing instruction cache energy consumption using a
                 compiler-based strategy",
  journal =      j-TACO,
  volume =       "1",
  number =       "1",
  pages =        "3--33",
  month =        mar,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Aug 5 07:08:09 MDT 2004",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Isailovic:2004:DCQ,
  author =       "Nemanja Isailovic and Mark Whitney and Yatish Patel and
                 John Kubiatowicz and Dean Copsey and Frederic T. Chong
                 and Isaac L.
Chuang and Mark Oskin",
  title =        "Datapath and control for quantum wires",
  journal =      j-TACO,
  volume =       "1",
  number =       "1",
  pages =        "34--61",
  month =        mar,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Aug 5 07:08:09 MDT 2004",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Sankaralingam:2004:TPA,
  author =       "Karthikeyan Sankaralingam and Ramadass Nagarajan and
                 Haiming Liu and Changkyu Kim and Jaehyuk Huh and Nitya
                 Ranganathan and Doug Burger and Stephen W. Keckler and
                 Robert G. McDonald and Charles R. Moore",
  title =        "{TRIPS}: {A} polymorphous architecture for exploiting
                 {ILP}, {TLP}, and {DLP}",
  journal =      j-TACO,
  volume =       "1",
  number =       "1",
  pages =        "62--93",
  month =        mar,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Aug 5 07:08:09 MDT 2004",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Skadron:2004:TAM,
  author =       "Kevin Skadron and Mircea R. Stan and Karthik
                 Sankaranarayanan and Wei Huang and Sivakumar Velusamy
                 and David Tarjan",
  title =        "Temperature-aware microarchitecture: {Modeling} and
                 implementation",
  journal =      j-TACO,
  volume =       "1",
  number =       "1",
  pages =        "94--125",
  month =        mar,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Aug 5 07:08:09 MDT 2004",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

%%% Volume 1, number 2 (June 2004)

@Article{Aleta:2004:RCC,
  author =       "Alex Alet{\`a} and Josep M. Codina and Antonio
                 Gonz{\'a}lez and David Kaeli",
  title =        "Removing communications in clustered
                 microarchitectures through instruction replication",
  journal =      j-TACO,
  volume =       "1",
  number =       "2",
  pages =        "127--151",
  month =        jun,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Aug 5 07:08:10 MDT 2004",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Bai:2004:LPO,
  author =       "Yu Bai and R.
Iris Bahar",
  title =        "A low-power in-order\slash out-of-order issue queue",
  journal =      j-TACO,
  volume =       "1",
  number =       "2",
  pages =        "152--179",
  month =        jun,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Aug 5 07:08:10 MDT 2004",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Juang:2004:IBP,
  author =       "Philo Juang and Kevin Skadron and Margaret Martonosi
                 and Zhigang Hu and Douglas W. Clark and Philip W.
                 Diodato and Stefanos Kaxiras",
  title =        "Implementing branch-predictor decay using quasi-static
                 memory cells",
  journal =      j-TACO,
  volume =       "1",
  number =       "2",
  pages =        "180--219",
  month =        jun,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Aug 5 07:08:10 MDT 2004",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Santana:2004:LCF,
  author =       "Oliverio J. Santana and Alex Ramirez and Josep L.
                 Larriba-Pey and Mateo Valero",
  title =        "A low-complexity fetch architecture for
                 high-performance superscalar processors",
  journal =      j-TACO,
  volume =       "1",
  number =       "2",
  pages =        "220--245",
  month =        jun,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Aug 5 07:08:10 MDT 2004",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

%%% Volume 1, number 3 (September 2004)

@Article{Lin:2004:CFS,
  author =       "Jin Lin and Tong Chen and Wei-Chung Hsu and Pen-Chung
                 Yew and Roy Dz-Ching Ju and Tin-Fook Ngai and Sun
                 Chan",
  title =        "A compiler framework for speculative optimizations",
  journal =      j-TACO,
  volume =       "1",
  number =       "3",
  pages =        "247--271",
  month =        sep,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Fri Oct 29 06:39:45 MDT 2004",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Fields:2004:ICS,
  author =       "Brian A. Fields and Rastislav Bodik and Mark D. Hill
                 and Chris J.
Newburn",
  title =        "Interaction cost and shotgun profiling",
  journal =      j-TACO,
  volume =       "1",
  number =       "3",
  pages =        "272--304",
  month =        sep,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Fri Oct 29 06:39:45 MDT 2004",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Sankaranarayanan:2004:PBA,
  author =       "Karthik Sankaranarayanan and Kevin Skadron",
  title =        "Profile-based adaptation for cache decay",
  journal =      j-TACO,
  volume =       "1",
  number =       "3",
  pages =        "305--322",
  month =        sep,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Fri Oct 29 06:39:45 MDT 2004",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Xie:2004:IDV,
  author =       "Fen Xie and Margaret Martonosi and Sharad Malik",
  title =        "Intraprogram dynamic voltage scaling: {Bounding}
                 opportunities with analytic modeling",
  journal =      j-TACO,
  volume =       "1",
  number =       "3",
  pages =        "323--367",
  month =        sep,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Fri Oct 29 06:39:45 MDT 2004",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

%%% Volume 1, number 4 (December 2004)

@Article{Hartstein:2004:OPD,
  author =       "A. Hartstein and Thomas R. Puzak",
  title =        "The optimum pipeline depth considering both power and
                 performance",
  journal =      j-TACO,
  volume =       "1",
  number =       "4",
  pages =        "369--388",
  month =        dec,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Apr 14 12:17:47 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Cristal:2004:TKI,
  author =       "Adri{\'a}n Cristal and Oliverio J. Santana and Mateo
                 Valero and Jos{\'e} F. Mart{\'\i}nez",
  title =        "Toward kilo-instruction processors",
  journal =      j-TACO,
  volume =       "1",
  number =       "4",
  pages =        "389--417",
  month =        dec,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Apr 14 12:17:47 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Akkary:2004:ARE,
  author =       "Haitham Akkary and Ravi Rajwar and Srikanth T.
Srinivasan",
  title =        "An analysis of a resource efficient checkpoint
                 architecture",
  journal =      j-TACO,
  volume =       "1",
  number =       "4",
  pages =        "418--444",
  month =        dec,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Apr 14 12:17:47 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Yang:2004:TML,
  author =       "Chia-Lin Yang and Alvin R. Lebeck and Hung-Wei Tseng
                 and Chien-Hao Lee",
  title =        "Tolerating memory latency through push prefetching for
                 pointer-intensive applications",
  journal =      j-TACO,
  volume =       "1",
  number =       "4",
  pages =        "445--475",
  month =        dec,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Apr 14 12:17:47 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

%%% Volume 2, number 1 (March 2005)

@Article{Calder:2005:I,
  author =       "Brad Calder and Dean Tullsen",
  title =        "Introduction",
  journal =      j-TACO,
  volume =       "2",
  number =       "1",
  pages =        "1--2",
  month =        mar,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Mon May 2 11:13:58 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Zhou:2005:EFA,
  author =       "Yuanyuan Zhou and Pin Zhou and Feng Qin and Wei Liu and
                 Josep Torrellas",
  title =        "Efficient and flexible architectural support for
                 dynamic monitoring",
  journal =      j-TACO,
  volume =       "2",
  number =       "1",
  pages =        "3--33",
  month =        mar,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Mon May 2 11:13:58 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2005:WHC,
  author =       "Chuanjun Zhang and Frank Vahid and Jun Yang and Walid
                 Najjar",
  title =        "A way-halting cache for low-energy high-performance
                 systems",
  journal =      j-TACO,
  volume =       "2",
  number =       "1",
  pages =        "34--54",
  month =        mar,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Mon May 2 11:13:58 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Abella:2005:ISP,
  author =       "Jaume Abella and Antonio Gonz{\'a}lez and
Xavier Vera and Michael F. P. O'Boyle",
  title =        "{IATAC}: a smart predictor to turn-off {L2} cache
                 lines",
  journal =      j-TACO,
  volume =       "2",
  number =       "1",
  pages =        "55--77",
  month =        mar,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Mon May 2 11:13:58 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Haskins:2005:AWS,
  author =       "John W. {Haskins, Jr.} and Kevin Skadron",
  title =        "Accelerated warmup for sampled microarchitecture
                 simulation",
  journal =      j-TACO,
  volume =       "2",
  number =       "1",
  pages =        "78--108",
  month =        mar,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Mon May 2 11:13:58 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

%%% Volume 2, number 2 (June 2005)

@Article{Li:2005:ABT,
  author =       "Tao Li and Ravi Bhargava and Lizy Kurian John",
  title =        "Adapting branch-target buffer to improve the target
                 predictability of {Java} code",
  journal =      j-TACO,
  volume =       "2",
  number =       "2",
  pages =        "109--130",
  month =        jun,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Jul 7 14:09:53 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2005:DIE,
  author =       "Lingli Zhang and Chandra Krintz",
  title =        "The design, implementation, and evaluation of adaptive
                 code unloading for resource-constrained devices",
  journal =      j-TACO,
  volume =       "2",
  number =       "2",
  pages =        "131--164",
  month =        jun,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Jul 7 14:09:53 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Kulkarni:2005:FES,
  author =       "Prasad A. Kulkarni and Stephen R. Hines and David B.
                 Whalley and Jason D. Hiser and Jack W. Davidson and
                 Douglas L.
Jones",
  title =        "Fast and efficient searches for effective
                 optimization-phase sequences",
  journal =      j-TACO,
  volume =       "2",
  number =       "2",
  pages =        "165--198",
  month =        jun,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Jul 7 14:09:53 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

%%% The control space after the abbreviation ``vs.'' in the next title
%%% keeps TeX from treating its period as the end of a sentence and
%%% applying inter-sentence spacing.

@Article{Salami:2005:DMI,
  author =       "Esther Salam{\'\i} and Mateo Valero",
  title =        "Dynamic memory interval test vs.\ interprocedural
                 pointer analysis in multimedia applications",
  journal =      j-TACO,
  volume =       "2",
  number =       "2",
  pages =        "199--219",
  month =        jun,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Jul 7 14:09:53 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

%%% Volume 2, number 3 (September 2005)

@Article{Meng:2005:ELL,
  author =       "Yan Meng and Timothy Sherwood and Ryan Kastner",
  title =        "Exploring the limits of leakage power reduction in
                 caches",
  journal =      j-TACO,
  volume =       "2",
  number =       "3",
  pages =        "221--246",
  month =        sep,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Wed Oct 5 07:42:22 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Garzaran:2005:TBS,
  author =       "Mar{\'\i}a Jes{\'u}s Garzar{\'a}n and Milos Prvulovic
                 and Jos{\'e} Mar{\'\i}a Llaber{\'\i}a and V{\'\i}ctor
                 Vi{\~n}als and Lawrence Rauchwerger and Josep
                 Torrellas",
  title =        "Tradeoffs in buffering speculative memory state for
                 thread-level speculation in multiprocessors",
  journal =      j-TACO,
  volume =       "2",
  number =       "3",
  pages =        "247--279",
  month =        sep,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Wed Oct 5 07:42:22 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Tarjan:2005:MPG,
  author =       "David Tarjan and Kevin Skadron",
  title =        "Merging path and gshare indexing in perceptron branch
                 prediction",
  journal =      j-TACO,
  volume =       "2",
  number =       "3",
  pages =        "280--300",
  month =        sep,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Wed Oct 5 07:42:22 MDT
2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2005:WET,
  author =       "Xiangyu Zhang and Rajiv Gupta",
  title =        "Whole execution traces and their applications",
  journal =      j-TACO,
  volume =       "2",
  number =       "3",
  pages =        "301--334",
  month =        sep,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Wed Oct 5 07:42:22 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

%%% Volume 2, number 4 (December 2005)

@Article{Zhao:2005:IWA,
  author =       "Wankang Zhao and David Whalley and Christopher Healy
                 and Frank Mueller",
  title =        "Improving {WCET} by applying a {WC} code-positioning
                 optimization",
  journal =      j-TACO,
  volume =       "2",
  number =       "4",
  pages =        "335--365",
  month =        dec,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Feb 16 11:03:13 MST 2006",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
  keywords =     "WCET (worst case execution time); WC (worst case)",
}

@Article{Reis:2005:SCF,
  author =       "George A. Reis and Jonathan Chang and Neil Vachharajani
                 and Ram Rangan and David I. August and Shubhendu S.
                 Mukherjee",
  title =        "Software-controlled fault tolerance",
  journal =      j-TACO,
  volume =       "2",
  number =       "4",
  pages =        "366--396",
  month =        dec,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Feb 16 11:03:13 MST 2006",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Li:2005:PPC,
  author =       "Jian Li and Jos{\'e} F. Mart{\'\i}nez",
  title =        "Power-performance considerations of parallel computing
                 on chip multiprocessors",
  journal =      j-TACO,
  volume =       "2",
  number =       "4",
  pages =        "397--422",
  month =        dec,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Feb 16 11:03:13 MST 2006",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Sharma:2005:SPE,
  author =       "Saurabh Sharma and Jesse G. Beu and Thomas M.
Conte",
  title =        "Spectral prefetcher: {An} effective mechanism for {L2}
                 cache prefetching",
  journal =      j-TACO,
  volume =       "2",
  number =       "4",
  pages =        "423--450",
  month =        dec,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Feb 16 11:03:13 MST 2006",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

%%% Volume 3, number 1 (March 2006)

@Article{Calder:2006:I,
  author =       "Brad Calder and Dean Tullsen",
  title =        "Introduction",
  journal =      j-TACO,
  volume =       "3",
  number =       "1",
  pages =        "1--2",
  month =        mar,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu May 18 08:38:26 MDT 2006",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Tan:2006:BSS,
  author =       "Lin Tan and Brett Brotherton and Timothy Sherwood",
  title =        "Bit-split string-matching engines for intrusion
                 detection and prevention",
  journal =      j-TACO,
  volume =       "3",
  number =       "1",
  pages =        "3--34",
  month =        mar,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu May 18 08:38:26 MDT 2006",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Nagpurkar:2006:ERP,
  author =       "Priya Nagpurkar and Hussam Mousa and Chandra Krintz and
                 Timothy Sherwood",
  title =        "Efficient remote profiling for resource-constrained
                 devices",
  journal =      j-TACO,
  volume =       "3",
  number =       "1",
  pages =        "35--66",
  month =        mar,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu May 18 08:38:26 MDT 2006",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Lin:2006:RCG,
  author =       "Jin Lin and Wei-Chung Hsu and Pen-Chung Yew and Roy
                 Dz-Ching Ju and Tin-Fook Ngai",
  title =        "Recovery code generation for general speculative
                 optimizations",
  journal =      j-TACO,
  volume =       "3",
  number =       "1",
  pages =        "67--89",
  month =        mar,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu May 18 08:38:26 MDT 2006",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Choi:2006:ORR,
  author =       "Yoonseo Choi and Hwansoo Han",
  title =        "Optimal
register reassignment for register stack overflow
                 minimization",
  journal =      j-TACO,
  volume =       "3",
  number =       "1",
  pages =        "90--114",
  month =        mar,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu May 18 08:38:26 MDT 2006",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

%%% Volume 3, number 2 (June 2006)

@Article{Xue:2006:LOA,
  author =       "Jingling Xue and Qiong Cai",
  title =        "A lifetime optimal algorithm for speculative {PRE}",
  journal =      j-TACO,
  volume =       "3",
  number =       "2",
  pages =        "115--155",
  month =        jun,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Fri Jun 9 06:47:22 MDT 2006",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Sharkey:2006:IPT,
  author =       "Joseph J. Sharkey and Dmitry V. Ponomarev and Kanad
                 Ghose and Oguz Ergin",
  title =        "Instruction packing: {Toward} fast and energy-efficient
                 instruction scheduling",
  journal =      j-TACO,
  volume =       "3",
  number =       "2",
  pages =        "156--181",
  month =        jun,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Fri Jun 9 06:47:22 MDT 2006",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Ceze:2006:CUC,
  author =       "Luis Ceze and Karin Strauss and James Tuck and Josep
                 Torrellas and Jose Renau",
  title =        "{CAVA}: {Using} checkpoint-assisted value prediction to
                 hide {L2} misses",
  journal =      j-TACO,
  volume =       "3",
  number =       "2",
  pages =        "182--208",
  month =        jun,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Fri Jun 9 06:47:22 MDT 2006",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2006:EAR,
  author =       "Lixin Zhang and Mike Parker and John Carter",
  title =        "Efficient address remapping in distributed
                 shared-memory systems",
  journal =      j-TACO,
  volume =       "3",
  number =       "2",
  pages =        "209--229",
  month =        jun,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Fri Jun 9 06:47:22 MDT 2006",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Zhao:2006:ATP,
  author =       "Min Zhao and Bruce R.
Childers and Mary Lou Soffa",
  title =        "An approach toward profit-driven optimization",
  journal =      j-TACO,
  volume =       "3",
  number =       "3",
  pages =        "231--262",
  month =        sep,
  year =         "2006",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1162690.1162691",
  ISSN =         "1544-3566",
  bibdate =      "Sat Sep 23 07:54:36 MDT 2006",
  bibsource =    "http://portal.acm.org/",
  abstract =     "Although optimizations have been applied for a number
                 of years to improve the performance of software,
                 problems with respect to the application of
                 optimizations have not been adequately addressed. For
                 example, in certain circumstances, optimizations may
                 degrade performance. However, there is no efficient way
                 to know when a degradation will occur. In this
                 research, we investigate the profitability of
                 optimizations, which is useful for determining the
                 benefit of applying optimizations. We develop a
                 framework that enables us to predict profitability
                 using analytic models. The profitability of an
                 optimization depends on code context, the particular
                 optimization, and machine resources. Thus, our
                 framework has analytic models for each of these
                 components. As part of the framework, there is also a
                 profitability engine that uses models to predict the
                 profit. In this paper, we target scalar optimizations
                 and, in particular, describe the models for partial
                 redundancy elimination (PRE), loop invariant code
                 motion (LICM), and value numbering (VN). We implemented
                 the framework for predicting the profitability of these
                 optimizations. Based on the predictions, we can
                 selectively apply profitable optimizations. We compared
                 the profit-driven approach with an approach that uses a
                 heuristic in deciding when optimizations should be
                 applied. Our experiments demonstrate that the
                 profitability of scalar optimizations can be accurately
                 predicted by using models.
That is, without actually applying a scalar optimization, we can
                 determine if an optimization is beneficial and should
                 be applied.",
  acknowledgement = ack-nhfb,
}

@Article{Hazelwood:2006:MBC,
  author =       "Kim Hazelwood and Michael D. Smith",
  title =        "Managing bounded code caches in dynamic binary
                 optimization systems",
  journal =      j-TACO,
  volume =       "3",
  number =       "3",
  pages =        "263--294",
  month =        sep,
  year =         "2006",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1162690.1162692",
  ISSN =         "1544-3566",
  bibdate =      "Sat Sep 23 07:54:36 MDT 2006",
  bibsource =    "http://portal.acm.org/",
  abstract =     "Dynamic binary optimizers store altered copies of
                 original program instructions in software-managed code
                 caches in order to maximize reuse of transformed code.
                 Code caches store code blocks that may vary in size,
                 reference other code blocks, and carry a high
                 replacement overhead. These unique constraints reduce
                 the effectiveness of conventional cache management
                 policies. Our work directly addresses these unique
                 constraints and presents several contributions to the
                 code-cache management problem. First, we show that
                 evicting more than the minimum number of code blocks
                 from the code cache results in less run-time overhead
                 than the existing alternatives. Such granular evictions
                 reduce overall execution time, as the fixed costs of
                 invoking the eviction mechanism are amortized across
                 multiple cache insertions. Second, a study of the ideal
                 lifetimes of dynamically generated code blocks
                 illustrates the benefit of a replacement algorithm
                 based on a generational heuristic. We describe and
                 evaluate a generational approach to code cache
                 management that makes it easy to identify long-lived
                 code blocks and simultaneously avoid any fragmentation
                 because of the eviction of short-lived blocks.
Finally, we present results from an implementation of our generational
                 approach in the DynamoRIO framework and illustrate
                 that, as dynamic optimization systems become more
                 prevalent, effective code cache-management policies
                 will be essential for reliable, scalable performance of
                 modern applications.",
  acknowledgement = ack-nhfb,
}

@Article{Rochecouste:2006:CCE,
  author =       "Olivier Rochecouste and Gilles Pokam and Andr{\'e}
                 Seznec",
  title =        "A case for a complexity-effective, width-partitioned
                 microarchitecture",
  journal =      j-TACO,
  volume =       "3",
  number =       "3",
  pages =        "295--326",
  month =        sep,
  year =         "2006",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1162690.1162693",
  ISSN =         "1544-3566",
  bibdate =      "Sat Sep 23 07:54:36 MDT 2006",
  bibsource =    "http://portal.acm.org/",
  abstract =     "The analysis of program executions reveals that most
                 integer and multimedia applications make heavy use of
                 narrow-width operations, i.e., instructions exclusively
                 using narrow-width operands and producing a
                 narrow-width result. Moreover, this usage is relatively
                 well distributed over the application. We observed this
                 program property on the MediaBench and SPEC2000
                 benchmarks with about 40\% of the instructions being
                 narrow-width operations. Current superscalar processors
                 use 64-bit datapaths to execute all the instructions of
                 the applications. In this paper, we suggest the use of
                 a width-partitioned microarchitecture (WPM) to master
                 the hardware complexity of a superscalar processor. For
                 a four-way issue machine, we split the processor in two
                 two-way clusters: the main cluster executing 64-bit
                 operations, load/store, and complex operations and a
                 narrow cluster executing the 16-bit operations. We
                 resort to partitioning to decouple the treatment of the
                 narrow-width operations from that of the other program
                 instructions. This provides the benefit of greatly
                 simplifying the design of the critical processor
                 components in each cluster (e.g., the register file and
                 the bypass network).
The dynamic interleaving of the two instruction types allows maintaining
                 the workload balanced among clusters. WPM also helps to
                 reduce the complexity of the interconnection fabric and
                 of the issue logic. In fact, since the 16-bit cluster
                 can only communicate narrow-width data, the
                 datapath-width of the interconnect fabric can be
                 significantly reduced, yielding a corresponding saving
                 of the interconnect power and area. We explore
                 different possible configurations of WPM, discussing
                 the various implementation tradeoffs. We also examine a
                 speculative steering heuristic to distribute the
                 narrow-width operations among clusters. A detailed
                 analysis of the complexity factors shows using WPM
                 instead of a classical 64-bit two-cluster
                 microarchitecture can save power and silicon area with
                 a minimal impact on the overall performance.",
  acknowledgement = ack-nhfb,
}

@Article{Zmily:2006:BAI,
  author =       "Ahmad Zmily and Christos Kozyrakis",
  title =        "Block-aware instruction set architecture",
  journal =      j-TACO,
  volume =       "3",
  number =       "3",
  pages =        "327--357",
  month =        sep,
  year =         "2006",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1162690.1162694",
  ISSN =         "1544-3566",
  bibdate =      "Sat Sep 23 07:54:36 MDT 2006",
  bibsource =    "http://portal.acm.org/",
  abstract =     "Instruction delivery is a critical component for
                 wide-issue, high-frequency processors since its
                 bandwidth and accuracy place an upper limit on
                 performance. The processor front-end accuracy and
                 bandwidth are limited by instruction-cache misses,
                 multicycle instruction-cache accesses, and target or
                 direction mispredictions for control-flow operations.
                 This paper presents a block-aware instruction set
                 (BLISS) that allows software to assist with front-end
                 challenges. BLISS defines basic block descriptors that
                 are stored separately from the actual instructions in a
                 program.
We show that BLISS allows for a decoupled front-end that tolerates
                 instruction-cache latency, facilitates instruction
                 prefetching, and leads to higher prediction accuracy.",
  acknowledgement = ack-nhfb,
}

%%% Volume 3, number 4 (December 2006)

@Article{Crandall:2006:MAS,
  author =       "Jedidiah R. Crandall and S. Felix Wu and Frederic T.
                 Chong",
  title =        "{Minos}: {Architectural} support for protecting control
                 data",
  journal =      j-TACO,
  volume =       "3",
  number =       "4",
  pages =        "359--389",
  month =        dec,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Marathe:2006:ACC,
  author =       "Jaydeep Marathe and Frank Mueller and Bronis R. de
                 Supinski",
  title =        "Analysis of cache-coherence bottlenecks with hybrid
                 hardware\slash software techniques",
  journal =      j-TACO,
  volume =       "3",
  number =       "4",
  pages =        "390--423",
  month =        dec,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Ganusov:2006:FEP,
  author =       "Ilya Ganusov and Martin Burtscher",
  title =        "Future execution: {A} prefetching mechanism that uses
                 multiple cores to speed up single threads",
  journal =      j-TACO,
  volume =       "3",
  number =       "4",
  pages =        "424--449",
  month =        dec,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Co:2006:ETC,
  author =       "Michele Co and Dee A. B.
Weikle and Kevin Skadron",
  title =        "Evaluating trace cache energy efficiency",
  journal =      j-TACO,
  volume =       "3",
  number =       "4",
  pages =        "450--476",
  month =        dec,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Hu:2006:EMM,
  author =       "Shiwen Hu and Madhavi Valluri and Lizy Kurian John",
  title =        "Effective management of multiple configurable units
                 using dynamic optimization",
  journal =      j-TACO,
  volume =       "3",
  number =       "4",
  pages =        "477--501",
  month =        dec,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Bentley:2006:IAB,
  author =       "Chris Bentley and Scott A. Watterson and David K.
                 Lowenthal and Barry Rountree",
  title =        "Implicit array bounds checking on 64-bit
                 architectures",
  journal =      j-TACO,
  volume =       "3",
  number =       "4",
  pages =        "502--527",
  month =        dec,
  year =         "2006",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1187976.1187982",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  abstract =     "Several programming languages guarantee that array
                 subscripts are checked to ensure they are within the
                 bounds of the array. While this guarantee improves the
                 correctness and security of array-based code, it adds
                 overhead to array references. This has been an obstacle
                 to using higher-level languages, such as Java, for
                 high-performance parallel computing, where the language
                 specification requires that all array accesses must be
                 checked to ensure they are within bounds. This is
                 because, in practice, array-bounds checking in
                 scientific applications may increase execution time by
                 more than a factor of 2. Previous research has explored
                 optimizations to statically eliminate bounds checks,
                 but the dynamic nature of many scientific codes makes
                 this difficult or impossible.
Our approach is, instead, to create a compiler and operating system
                 infrastructure that does not generate explicit bounds
                 checks. It instead places arrays inside of Index
                 Confinement Regions (ICRs), which are large, isolated,
                 mostly unmapped virtual memory regions. Any array
                 reference outside of its bounds will cause a
                 protection violation; this provides implicit bounds
                 checking. Our results show that when applying this
                 infrastructure to high-performance computing programs
                 written in Java, the overhead of bounds checking
                 relative to a program with no bounds checks is reduced
                 from an average of 63\% to an average of 9\%.",
  acknowledgement = ack-nhfb,
}

%%% Volume 4, number 1 (March 2007).  Entries are listed in
%%% article-number order, and pages use the articleno:1--articleno:??
%%% form that the later issues of volume 4 in this file already use.

@Article{Calder:2007:I,
  author =       "Brad Calder and Dean Tullsen",
  title =        "Introduction",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
  articleno =    "1",
}

@Article{Constantinides:2007:ARC,
  author =       "Kypros Constantinides and Stephen Plaza and Jason
                 Blome and Valeria Bertacco and Scott Mahlke and Todd
                 Austin and Bin Zhang and Michael Orshansky",
  title =        "Architecting a reliable {CMP} switch architecture",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
  articleno =    "2",
}

@Article{Sasanka:2007:AES,
  author =       "Ruchira Sasanka and Man-Lap Li and Sarita V. Adve and
                 Yen-Kuang Chen and Eric Debes",
  title =        "{ALP}: {Efficient} support for all levels of
                 parallelism for complex media applications",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
  articleno =    "3",
}

@Article{Luo:2007:CNP,
  author =       "Yan Luo and Jia Yu and Jun Yang and Laxmi N. Bhuyan",
  title =        "Conserving network processor power consumption by
                 exploiting traffic variability",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
  articleno =    "4",
}

@Article{Soteriou:2007:SDP,
  author =       "Vassos Soteriou and Noel Eisley and Li-Shiuan Peh",
  title =        "Software-directed power-aware interconnection
                 networks",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
  articleno =    "5",
}

@Article{Hwang:2007:SSA,
  author =       "Yuan-Shin Hwang and Jia-Jhe Li",
  title =        "Snug set-associative caches: {Reducing} leakage power
                 of instruction and data caches with no performance
                 penalties",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
  articleno =    "6",
}

@Article{Rong:2007:SDS,
  author =       "Hongbo Rong and Zhizhong Tang and R. Govindarajan and
                 Alban Douillet and Guang R. Gao",
  title =        "Single-dimension software pipelining for
                 multidimensional loops",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "7:1--7:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
  articleno =    "7",
}

@Article{Bower:2007:ODH,
  author =       "Fred A. Bower and Daniel J.
Sorin and Sule Ozev",
  title =        "Online diagnosis of hard faults in microprocessors",
  journal =      j-TACO,
  volume =       "4",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1250727.1250728",
  ISSN =         "1544-3566",
  bibdate =      "Mon Jun 16 11:40:54 MDT 2008",
  bibsource =    "http://portal.acm.org/",
  abstract =     "We develop a microprocessor design that tolerates hard
                 faults, including fabrication defects and in-field
                 faults, by leveraging existing microprocessor
                 redundancy. To do this, we must: detect and correct
                 errors, diagnose hard faults at the field
                 deconfigurable unit (FDU) granularity, and deconfigure
                 FDUs with hard faults. In our reliable microprocessor
                 design, we use DIVA dynamic verification to detect and
                 correct errors. Our new scheme for diagnosing hard
                 faults tracks instructions' core structure occupancy
                 from decode until commit. If a DIVA checker detects an
                 error in an instruction, it increments a small
                 saturating error counter for every FDU used by that
                 instruction, including that DIVA checker. A hard fault
                 in an FDU quickly leads to an above-threshold error
                 counter for that FDU and thus diagnoses the fault. For
                 deconfiguration, we use previously developed schemes
                 for functional units and buffers and present a scheme
                 for deconfiguring DIVA checkers. Experimental results
                 show that our reliable microprocessor quickly and
                 accurately diagnoses each hard fault that is injected
                 and continues to function, albeit with somewhat
                 degraded performance.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  keywords =     "fine-grained diagnosis; hard fault tolerance;
                 processor microarchitecture",
}

@Article{Michaud:2007:STM,
  author =       "Pierre Michaud and Andr{\'e} Seznec and Damien Fetis
                 and Yiannakis Sazeides and Theofanis Constantinou",
  title =        "A study of thread migration in temperature-constrained
                 multicores",
  journal =      j-TACO,
  volume =       "4",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jun,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1250727.1250729",
  ISSN =         "1544-3566",
  bibdate =      "Mon Jun 16 11:40:54 MDT 2008",
  bibsource =    "http://portal.acm.org/",
  abstract =     "Temperature has become an important constraint in
                 high-performance processors, especially multicores.
                 Thread migration will be essential to exploit the full
                 potential of future thermally constrained multicores.
                 We propose and study a thread migration method that
                 maximizes performance under a temperature constraint,
                 while minimizing the number of migrations and ensuring
                 fairness between threads.
We show that thread migration brings important performance gains and
                 that it is most effective during the first tens of
                 seconds following a decrease of the number of running
                 threads.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  keywords =     "multicore processor; power density; temperature;
                 thermal management; thread migration",
}

@Article{Chen:2007:CRL,
  author =       "Yu Chen and Fuxin Zhang",
  title =        "Code reordering on limited branch offset",
  journal =      j-TACO,
  volume =       "4",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jun,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1250727.1250730",
  ISSN =         "1544-3566",
  bibdate =      "Mon Jun 16 11:40:54 MDT 2008",
  bibsource =    "http://portal.acm.org/",
  abstract =     "Since the 1980's code reordering has gained popularity
                 as an important way to improve the spatial locality of
                 programs. While the effect of the processor's
                 microarchitecture and memory hierarchy on this
                 optimization technique has been investigated, little
                 research has focused on the impact of the instruction
                 set. In this paper, we analyze the effect of limited
                 branch offset of the MIPS-like instruction set [Hwu et
                 al. 2004, 2005] on code reordering, explore two simple
                 methods to handle the exceeded branches, and propose
                 the bidirectional code layout (BCL) algorithm to
                 reduce the number of branches exceeding the offset
                 limit. The BCL algorithm sorts the chains according to
                 the position of related chains, avoids cache conflict
                 misses deliberately and lays out the code
                 bidirectionally. It strikes a balance among the
                 distance of related blocks, the instruction cache miss
                 rate, the memory size required, and the control flow
                 transfer. Experimental results show that BCL can
                 effectively reduce exceeded branches by 50.1\%, on
                 average, with up to 100\% for some programs. Except
                 for some programs with little spatial locality, the
                 BCL algorithm can achieve the performance, as the case
                 with no branch offset limitation.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  keywords =     "code reordering; Godson Processor; link-time
                 optimization",
}

@Article{Terechko:2007:ICC,
  author =       "A. S. Terechko and H. Corporaal",
  title =        "Inter-cluster communication in {VLIW} architectures",
  journal =      j-TACO,
  volume =       "4",
  number =       "2",
  pages =        "11:1--11:??",
  month =        jun,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1250727.1250731",
  ISSN =         "1544-3566",
  bibdate =      "Mon Jun 16 11:40:54 MDT 2008",
  bibsource =    "http://portal.acm.org/",
  abstract =     "The traditional VLIW (very long instruction word)
                 architecture with a single register file does not
                 scale up well to address growing performance demands
                 on embedded media processors. However, splitting a
                 VLIW processor in smaller clusters, which are
                 comprised of function units fully connected to local
                 register files, can significantly improve VLSI
                 implementation characteristics of the processor, such
                 as speed, energy consumption, and area. In our paper
                 we reveal that achieving the best characteristics of a
                 clustered VLIW requires a thorough selection of an
                 Inter-cluster Communication (ICC) model, which is the
                 way clustering is exposed in the Instruction Set
                 Architecture. For our study we, first, define a
                 taxonomy of ICC models including copy operations,
                 dedicated issue slots, extended operands, extended
                 results, and multicast. Evaluation of the execution
                 time of the models requires both the dynamic cycle
                 count and clock period. We developed an advanced
                 instruction scheduler for all the five ICC models in
                 order to quantify the dynamic cycle counts of our
                 multimedia C benchmarks. To assess the clock period of
                 the ICC models we designed and laid out VLIW datapaths
                 using the RTL hardware descriptions derived from a
                 deeply pipelined commercial TriMedia processor.
In contrast to prior art, our research shows that fully distributed
                 register file architectures (with eight clusters in
                 our study) often underperform compared to moderately
                 clustered machines with two or four clusters because
                 of explosion of the cycle count overhead in the
                 former. Among the evaluated ICC models, performance of
                 the copy operation model, popular both in academia and
                 industry, is severely limited by the copy operations
                 hampering scheduling of regular operations in high ILP
                 (instruction-level parallelism) code. The dedicated
                 issue slots model combats this limitation by
                 dedicating extra VLIW issue slots purely for ICC,
                 reaching the highest 1.74 execution time speedup
                 relative to the unicluster. Furthermore, our VLSI
                 experiments show that the lowest area and energy
                 consumption of 42 and 57\% relative to the unicluster,
                 respectively, are achieved by the extended operands
                 model, which, nevertheless, provides higher
                 performance than the copy operation model.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  keywords =     "clock frequency; cluster assignment; instruction-level
                 parallelism; instruction scheduler; intercluster
                 communication; optimizing compiler; pipelining;
                 register allocation; VLIW",
}

@Article{Dou:2007:CCM,
  author =       "Jialin Dou and Marcelo Cintra",
  title =        "A compiler cost model for speculative
                 parallelization",
  journal =      j-TACO,
  volume =       "4",
  number =       "2",
  pages =        "12:1--12:??",
  month =        jun,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1250727.1250732",
  ISSN =         "1544-3566",
  bibdate =      "Mon Jun 16 11:40:54 MDT 2008",
  bibsource =    "http://portal.acm.org/",
  abstract =     "Speculative parallelization is a technique that allows
                 code sections that cannot be fully analyzed by the
                 compiler to be aggressively executed in parallel.
                 However, while speculative parallelization can
                 potentially deliver significant speedups, several
                 overheads associated with this technique can limit
                 these speedups in practice. This paper proposes a
                 novel compiler static cost model of speculative
                 multithreaded execution that can be used to predict
                 the resulting performance. This model attempts to
                 predict the expected speedups, or slowdowns, of the
                 candidate speculative sections based on the estimation
                 of the combined runtime effects of various overheads,
                 and taking into account the scheduling restrictions of
                 most speculative execution environments. The model is
                 based on estimating the likely execution duration of
                 threads and considers all the possible permutations of
                 these threads. This model also produces a quantitative
                 estimate of the speedup, which is different from prior
                 heuristics that only qualitatively estimate the
                 benefits of speculative multithreaded execution. In
                 previous work, a limited version of the framework was
                 evaluated on a number of loops from a collection of
                 SPEC benchmarks that suffer mainly from load imbalance
                 and thread dispatch and commit overheads. In this
                 work, an extended framework is also evaluated on loops
                 that may suffer from data-dependence violations.
                 Experimental results show that prediction accuracy is
                 lower when loops with violations are included.
                 Nevertheless, accuracy is still very high for a static
                 model: the framework can identify, on average, 45\% of
                 the loops that cause slowdowns and, on average, 96\%
                 of the loops that lead to speedups; it predicts the
                 speedups or slowdowns with an error of less than 20\%
                 for an average of 28\% of the loops across the
                 benchmarks and with an error of less than 50\% for an
                 average of 80\% of the loops. Overall, the framework
                 often outperforms, by as much as 25\%, a naive
                 approach that attempts to speculatively parallelize
                 all the loops considered, and is able to curb the
                 large slowdowns caused in many cases by this naive
                 approach.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  keywords =     "speculative multithreading; speculative
                 parallelization; thread-level speculation",
}

@Article{Amme:2007:SBM,
  author =       "Wolfram Amme and Jeffery von Ronne and Michael Franz",
  title =        "{SSA}-based mobile code: {Implementation} and
                 empirical evaluation",
  journal =      j-TACO,
  volume =       "4",
  number =       "2",
  pages =        "13:1--13:??",
  month =        jun,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1250727.1250733",
  ISSN =         "1544-3566",
  bibdate =      "Mon Jun 16 11:40:54 MDT 2008",
  bibsource =    "http://portal.acm.org/",
  abstract =     "Although one might expect transportation formats based
                 on static single-assignment form (SSA) to yield faster
                 just-in-time compilation times than those based on
                 stack-based virtual machines, this claim has not
                 previously been validated, in practice. We attempt to
                 quantify the effect of using an SSA-based mobile code
                 representation by integrating support for a verifiable
                 SSA-based IR into Jikes RVM. Performance results,
                 measured with various optimizations and on both the
                 IA32 and PowerPC, show improvements in both
                 compilation time and code quality.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  keywords =     "SafeTSA; static single-assignment form; virtual
                 machines",
}

@Article{Li:2007:CCE,
  author =       "Xiaodong Li and Ritu Gupta and Sarita V.
Adve and Yuanyuan Zhou",
  title =        "Cross-component energy management: {Joint} adaptation
                 of processor and memory",
  journal =      j-TACO,
  volume =       "4",
  number =       "3",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1275937.1275938",
  ISSN =         "1544-3566",
  bibdate =      "Mon Jun 16 11:41:20 MDT 2008",
  bibsource =    "http://portal.acm.org/",
  abstract =     "Researchers have proposed the use of adaptation to
                 reduce the energy consumption of different hardware
                 components, such as the processor, memory, disk, and
                 display for general-purpose applications. Previous
                 algorithms to control these adaptations, however, have
                 focused on a single component. This work takes the
                 first step toward developing algorithms that can
                 jointly control adaptations in multiple interacting
                 components for general-purpose applications, with the
                 goal of minimizing the total energy consumed within a
                 specified performance loss. Specifically, we develop a
                 joint-adaptation algorithm for processor and memory
                 adaptations. We identify two properties that enable
                 per-component algorithms to be easily used in a
                 cross-component context---the algorithms' performance
                 impact must be guaranteed and composable. We then
                 modify a current processor and a memory algorithm to
                 obey these properties. This allows the cross-component
                 problem to be reduced to determine an appropriate
                 (energy-optimal) allocation of the target performance
                 loss (slack) between the two components. We develop
                 such an optimal slack allocation algorithm that
                 exploits the above properties. The result is an
                 efficient cross-component adaptation framework that
                 minimizes the total energy of the processor and memory
                 without exceeding the target performance loss, while
                 substantially leveraging current per-component
                 algorithms. Our experiments show that joint processor
                 and memory adaptation provides significantly more
                 energy savings than adapting either component alone;
                 intelligent slack distribution is specifically
                 effective for highly compute- or memory-intensive
                 applications; and the performance slowdown never
                 exceeds the specification.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  keywords =     "adaptive systems; control algorithms; energy
                 management; low-power design; memory; performance
                 guarantee; processor",
}

@Article{Gabor:2007:FES,
  author =       "Ron Gabor and Shlomo Weiss and Avi Mendelson",
  title =        "Fairness enforcement in switch on event
                 multithreading",
  journal =      j-TACO,
  volume =       "4",
  number =       "3",
  pages =        "15:1--15:??",
  month =        sep,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1275937.1275939",
  ISSN =         "1544-3566",
  bibdate =      "Mon Jun 16 11:41:20 MDT 2008",
  bibsource =    "http://portal.acm.org/",
  abstract =     "The need to reduce power and complexity will increase
                 the interest in Switch On Event multithreading
                 (coarse-grained multithreading). Switch On Event
                 multithreading is a low-power and low-complexity
                 mechanism to improve processor throughput by switching
                 threads on execution stalls. Fairness may, however,
                 become a problem in a multithreaded processor. Unless
                 fairness is properly handled, some threads may starve
                 while others consume all of the processor cycles.
                 Heuristics that were devised in order to improve
                 fairness in simultaneous multithreading are not
                 applicable to Switch On Event multithreading. This
                 paper defines the fairness metric using the ratio of
                 the individual threads' speedups and shows how it can
                 be enforced in Switch On Event multithreading.
                 Fairness is controlled by forcing additional thread
                 switch points. These switch points are determined
                 dynamically by runtime estimation of the single
                 threaded performance of each of the individual
                 threads. We analyze the impact of the fairness
                 enforcement mechanism on aggregate IPC and weighted
                 speedup. We present simulation results of the
                 performance of Switch On Event multithreading. Switch
                 On Event multithreading achieves an average aggregate
                 IPC increase of 26\% over single thread and 12\%
                 weighted speedup when no fairness is enforced. In this
                 case, a sixth of our runs resulted in poor fairness in
                 which one thread ran extremely slowly (10 to 100 times
                 slower than its single-thread performance), while the
                 other thread's performance was hardly affected. By
                 using the proposed mechanism, we can guarantee
                 fairness at different levels of strictness and, in
                 most cases, even improve the weighted speedup.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  keywords =     "coarse-grained multithreading; fairness;
                 multithreading; performance; SOE; Switch on Event
                 multithreading; throughput; weighted speedup",
}

@Article{Andrade:2007:PAA,
  author =       "Diego Andrade and Basilio B. Fraguela and Ram{\'o}n
                 Doallo",
  title =        "Precise automatable analytical modeling of the cache
                 behavior of codes with indirections",
  journal =      j-TACO,
  volume =       "4",
  number =       "3",
  pages =        "16:1--16:??",
  month =        sep,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1275937.1275940",
  ISSN =         "1544-3566",
  bibdate =      "Mon Jun 16 11:41:20 MDT 2008",
  bibsource =    "http://portal.acm.org/",
  abstract =     "The performance of memory hierarchies, in which caches
                 play an essential role, is critical in nowadays
                 general-purpose and embedded computing systems because
                 of the growing memory bottleneck problem.
                 Unfortunately, cache behavior is very unstable and
                 difficult to predict. This is particularly true in the
                 presence of irregular access patterns, which exhibit
                 little locality. Such patterns are very common, for
                 example, in applications in which pointers or
                 compressed sparse matrices give place to indirections.
                 Nevertheless, cache behavior in the presence of
                 irregular access patterns has not been widely studied.
In this paper we present an extension of a systematic analytical
                 modeling technique based on PMEs (probabilistic miss
                 equations), previously developed by the authors, that
                 allows the automated analysis of the cache behavior
                 for codes with irregular access patterns resulting
                 from indirections. The model generates very accurate
                 predictions despite the irregularities and has very
                 low computing requirements, being the first model that
                 gathers these desirable characteristics that can
                 automatically analyze this kind of codes. These
                 properties enable this model to help drive compiler
                 optimizations, as we show with an example.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  keywords =     "analytical modeling; irregular access patterns; memory
                 hierarchy; performance prediction",
}

%%% The family name ``De Bosschere'' is braced in the author list below
%%% so that BibTeX keeps both words together as the last name.

@Article{Venstermans:2007:JOH,
  author =       "Kris Venstermans and Lieven Eeckhout and Koen {De
                 Bosschere}",
  title =        "{Java} object header elimination for reduced memory
                 consumption in 64-bit virtual machines",
  journal =      j-TACO,
  volume =       "4",
  number =       "3",
  pages =        "17:1--17:??",
  month =        sep,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1275937.1275941",
  ISSN =         "1544-3566",
  bibdate =      "Mon Jun 16 11:41:20 MDT 2008",
  bibsource =    "http://portal.acm.org/",
  abstract =     "Memory performance is an important design issue for
                 contemporary computer systems given the huge
                 processor/memory speed gap. This paper proposes a
                 space-efficient Java object model for reducing the
                 memory consumption of 64-bit Java virtual machines. We
                 completely eliminate the object header through typed
                 virtual addressing (TVA) or implicit typing. TVA
                 encodes the object type in the object's virtual
                 address by allocating all objects of a given type in a
                 contiguous memory segment. This allows for removing
                 the type information as well as the status field from
                 the object header. Whenever type and status
                 information is needed, masking is applied to the
                 object's virtual address for obtaining an offset into
                 type and status information structures. Unlike
                 previous work on implicit typing, we apply TVA to a
                 selected number of frequently allocated object types,
                 hence, the name selective TVA (STVA); this limits the
                 amount of memory fragmentation. In addition to
                 applying STVA, we also compress the type information
                 block (TIB) pointers for all objects that do not fall
                 under TVA. We implement the space-efficient Java
                 object model in the 64-bit version of the Jikes RVM on
                 an AIX IBM platform and compare its performance
                 against the traditionally used Java object model using
                 a multitude of Java benchmarks. We conclude that the
                 space-efficient Java object model reduces memory
                 consumption by on average 15\% (and up to 45\% for
                 some benchmarks). About one-half the reduction comes
                 from TIB pointer compression; the other one-half comes
                 from STVA. In terms of performance, the
                 space-efficient object model generally does not affect
                 performance; however, for some benchmarks we observe
                 statistically significant performance speedups, up to
                 20\%.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  keywords =     "64-bit implementation; implicit typing; Java object
                 model; typed virtual addressing; Virtual machine",
}

@Article{Xiao:2007:VIS,
  author =       "Shu Xiao and Edmund M.-K. Lai",
  title =        "{VLIW} instruction scheduling for minimal power
                 variation",
  journal =      j-TACO,
  volume =       "4",
  number =       "3",
  pages =        "18:1--18:??",
  month =        sep,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1275937.1275942",
  ISSN =         "1544-3566",
  bibdate =      "Mon Jun 16 11:41:20 MDT 2008",
  bibsource =    "http://portal.acm.org/",
  abstract =     "The focus of this paper is on the minimization of the
                 variation in power consumed by a VLIW processor during
                 the execution of a target program through instruction
                 scheduling.
The problem is formulated as a mixed-integer program (MIP) and a
                 problem-specific branch-and-bound algorithm has been
                 developed to solve it more efficiently than generic
                 MIP solvers. Simulation results based on the
                 TMS320C6711 VLIW digital signal processor using
                 benchmarks from Mediabench and Trimaran showed that
                 over 40\% average reduction in power variation can be
                 achieved without sacrificing execution speed of these
                 benchmarks. Computational requirements and convergence
                 rates of our algorithm are also analyzed.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  keywords =     "instruction scheduling; power variation reduction;
                 VLIW processors",
}

%%% In the abstract below, \times is set inside $...$ because it is a
%%% math-mode-only command.

@Article{Tallam:2007:UCF,
  author =       "Sriraman Tallam and Rajiv Gupta",
  title =        "Unified control flow and data dependence traces",
  journal =      j-TACO,
  volume =       "4",
  number =       "3",
  pages =        "19:1--19:??",
  month =        sep,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1275937.1275943",
  ISSN =         "1544-3566",
  bibdate =      "Mon Jun 16 11:41:20 MDT 2008",
  bibsource =    "http://portal.acm.org/",
  abstract =     "We describe the design, generation, and compression of
                 the extended whole program path (eWPP), representation
                 that not only captures the control flow history of a
                 program execution but also its data dependence
                 history. This representation is motivated by the
                 observation that, typically, a significant fraction of
                 data dependence history can be recovered from the
                 control flow trace. To capture the remainder of the
                 data dependence history, we introduce disambiguation
                 checks in the program whose control flow signatures
                 capture the results of the checks. The resulting
                 extended control flow trace enables the recovery of
                 otherwise irrecoverable data dependences. The code for
                 the checks is designed to minimize the increase in
                 program execution time and the extended control flow
                 trace size when compared to directly collecting
                 control flow and address traces. Our experiments show
                 that compressed eWPPs are only one-quarter of the size
                 of combined compressed control flow and address
                 traces. However, their collection incurs a $ 5 \times $
                 increase in runtime overhead relative to the overhead
                 required for directly collecting the control flow and
                 address traces, respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  keywords =     "address trace; control flow trace; dynamic data
                 dependence trace; profiling",
}

@Article{Ipek:2008:EAD,
  author =       "Engin Ipek and Sally A. McKee and Karan Singh and Rich
                 Caruana and Bronis R. de Supinski and Martin Schulz",
  title =        "Efficient architectural design space exploration via
                 predictive modeling",
  journal =      j-TACO,
  volume =       "4",
  number =       "4",
  pages =        "1:1--1:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1328195.1328196",
  ISSN =         "1544-3566",
  bibdate =      "Mon Jun 16 11:41:35 MDT 2008",
  bibsource =    "http://portal.acm.org/",
  abstract =     "Efficiently exploring exponential-size architectural
                 design spaces with many interacting parameters remains
                 an open problem: the sheer number of experiments
                 required renders detailed simulation intractable. We
                 attack this via an automated approach that builds
                 accurate predictive models. We simulate sampled
                 points, using results to teach our models the function
                 describing relationships among design parameters. The
                 models can be queried and are very fast, enabling
                 efficient design tradeoff discovery. We validate our
                 approach via two uniprocessor sensitivity studies,
                 predicting IPC with only 1--2\% error. In an
                 experimental study using the approach, training on 1\%
                 of a 250-K-point CMP design space allows our models to
                 predict performance with only 4--5\% error.
Our predictive modeling combines well with techniques that reduce the
                 time taken by each simulation experiment, achieving
                 net time savings of three-four orders of magnitude.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  keywords =     "artificial neural networks; design space exploration;
                 performance prediction; sensitivity studies",
}

@Article{Shi:2008:VMS,
  author =       "Yunhe Shi and Kevin Casey and M. Anton Ertl and David
                 Gregg",
  title =        "Virtual machine showdown: {Stack} versus registers",
  journal =      j-TACO,
  volume =       "4",
  number =       "4",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1328195.1328197",
  ISSN =         "1544-3566",
  bibdate =      "Mon Jun 16 11:41:35 MDT 2008",
  bibsource =    "http://portal.acm.org/",
  abstract =     "Virtual machines (VMs) enable the distribution of
                 programs in an architecture-neutral format, which can
                 easily be interpreted or compiled. A long-running
                 question in the design of VMs is whether a stack
                 architecture or register architecture can be
                 implemented more efficiently with an interpreter. We
                 extend existing work on comparing virtual stack and
                 virtual register architectures in three ways. First,
                 our translation from stack to register code and
                 optimization are much more sophisticated. The result
                 is that we eliminate an average of more than 46\% of
                 executed VM instructions, with the bytecode size of
                 the register machine being only 26\% larger than that
                 of the corresponding stack one. Second, we present a
                 fully functional virtual-register implementation of
                 the Java virtual machine (JVM), which supports Intel,
                 AMD64, PowerPC and Alpha processors. This register VM
                 supports inline-threaded, direct-threaded,
                 token-threaded, and switch dispatch. Third, we present
                 experimental results on a range of additional
                 optimizations such as register allocation and
                 elimination of redundant heap loads. On the AMD64
                 architecture the register machine using switch
                 dispatch achieves an average speedup of 1.48 over the
                 corresponding stack machine. Even using the more
                 efficient inline-threaded dispatch, the register VM
                 achieves a speedup of 1.15 over the equivalent
                 stack-based VM.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  keywords =     "interpreter; register architecture; stack
                 architecture; virtual machine",
}

@Article{Yan:2008:EVR,
  author =       "Jun Yan and Wei Zhang",
  title =        "Exploiting virtual registers to reduce pressure on
                 real registers",
  journal =      j-TACO,
  volume =       "4",
  number =       "4",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1328195.1328198",
  ISSN =         "1544-3566",
  bibdate =      "Mon Jun 16 11:41:35 MDT 2008",
  bibsource =    "http://portal.acm.org/",
  abstract =     "It is well known that a large fraction of variables
                 are short-lived. This paper proposes a novel approach
                 to exploiting this fact to reduce the register
                 pressure for pipelined processors with data-forwarding
                 network. The idea is that the compiler can allocate
                 virtual registers (i.e., place holders to identify
                 dependences among instructions) to short-lived
                 variables, which do not need to be stored to physical
                 storage locations. As a result, real registers (i.e.,
                 physically existed registers) can be reserved for
                 long-lived variables for mitigating the register
                 pressure and decreasing the register spills, leading
                 to performance improvement. In this paper, we develop
                 the architectural and compiler support for exploiting
                 virtual registers for statically scheduled processors.
                 Our experimental results show that virtual registers
                 are very effective at reducing the register spills,
                 which, in many cases, can achieve the performance
                 close to the processor with twice number of real
                 registers.
Our results also indicate that, for some applications, using 24 virtual, in addition to 8 real registers, can attain even higher performance than that of 16 real without any virtual registers.", acknowledgement = ack-nhfb, articleno = "3", keywords = "data forwarding; register allocation; register file; short-lived variables; virtual register", } @Article{Yu:2008:OCL, author = "Zoe C. H. Yu and Francis C. M. Lau and Cho-Li Wang", title = "Object co-location and memory reuse for {Java} programs", journal = j-TACO, volume = "4", number = "4", pages = "4:1--4:??", month = jan, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1328195.1328199", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:35 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We introduce a new memory management system, STEMA, which can improve the execution time of Java programs. STEMA detects prolific types on-the-fly and co-locates their objects in a special memory space which supports reuse of memory. We argue and show that memory reuse and co-location of prolific objects can result in improved cache locality, reduced memory fragmentation, reduced GC time, and faster object allocation. We evaluate STEMA using 16 benchmarks. 
Experimental results show that STEMA performs 2.7\%, 4.0\%, and 8.2\% on average better than MarkSweep, CopyMS, and SemiSpace.", acknowledgement = ack-nhfb, articleno = "4", keywords = "garbage collector; Java; memory allocator; memory reuse; mutator; object co-location", } @Article{Zhang:2008:RCM, author = "Chuanjun Zhang", title = "Reducing cache misses through programmable decoders", journal = j-TACO, volume = "4", number = "4", pages = "5:1--5:??", month = jan, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1328195.1328200", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:35 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Level-one caches normally reside on a processor's critical path, which determines clock frequency. Therefore, fast access to level-one cache is important. Direct-mapped caches exhibit faster access time, but poor hit rates, compared with same sized set-associative caches because of nonuniform accesses to the cache sets. The nonuniform accesses generate more cache misses in some sets, while other sets are underutilized. We propose to increase the decoder length and, hence, reduce the accesses to heavily used sets without dynamically detecting the cache set usage information. We increase the access to the underutilized cache sets by incorporating a replacement policy into the cache design using programmable decoders. On average, the proposed techniques achieve as low a miss rate as a traditional 4-way cache on all 26 SPEC2K benchmarks for the instruction and data caches, respectively. This translates into an average IPC improvement of 21.5 and 42.4\% for SPEC2K integer and floating-point benchmarks, respectively. The B-Cache consumes 10.5\% more power per access, but exhibits a 12\% total memory access-related energy savings as a result of the miss rate reductions, and, hence, the reduction to applications' execution time. 
Compared with previous techniques that aim at reducing the miss rate of direct-mapped caches, our technique requires only one cycle to access all cache hits and has the same access time of a direct-mapped cache.", acknowledgement = ack-nhfb, articleno = "5", keywords = "cache; dynamic optimization; low power", } @Article{Golander:2008:HMP, author = "Amit Golander and Shlomo Weiss", title = "Hiding the misprediction penalty of a resource-efficient high-performance processor", journal = j-TACO, volume = "4", number = "4", pages = "6:1--6:??", month = jan, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1328195.1328201", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:35 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Misprediction is a major obstacle for increasing speculative out-of-order processors performance. Performance degradation depends on both the number of misprediction events and the recovery time associated with each one of them. In recent years a few checkpoint based microarchitectures have been proposed. In comparison with ROB-based processors, checkpoint processors are scalable and highly resource efficient. Unfortunately, in these proposals the misprediction recovery time is proportional to the instruction queue size.\par In this paper we analyze methods to reduce the misprediction recovery time. We propose a new register file management scheme and techniques to selectively flush the instruction queue and the load store queue, and to isolate deeply pipelined execution units. The result is a novel checkpoint processor with Constant misprediction RollBack time (CRB). 
We further present a streamlined, cost-efficient solution, which saves complexity at the price of slightly lower performance.", acknowledgement = ack-nhfb, articleno = "6", keywords = "checkpoints; misprediction; out-of-order execution; rollback; scalable architecture", } @Article{Calder:2008:E, author = "Brad Calder and Dean Tullsen", title = "Editorial", journal = j-TACO, volume = "5", number = "1", pages = "1:1--1:??", month = may, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1369396.1369397", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, articleno = "1", } @Article{Mysore:2008:FIP, author = "Shashidhar Mysore and Banit Agrawal and Rodolfo Neuber and Timothy Sherwood and Nisheeth Shrivastava and Subhash Suri", title = "Formulating and implementing profiling over adaptive ranges", journal = j-TACO, volume = "5", number = "1", pages = "2:1--2:??", month = may, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1369396.1369398", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Modern computer systems are called on to deal with billions of events every second, whether they are executed instructions, accessed memory locations, or forwarded packets. This presents a serious challenge to those who seek to quantify, analyze, or optimize such systems, because important trends and behaviors may easily be lost in a sea of data. We present range-adaptive profiling (RAP) as a new and general-purpose profiling method capable of hierarchically efficiently classifying streams of data in hardware. Through the use of RAP, events in an input stream are dynamically classified into increasingly precise categories, based on the frequency with which they occur. The more important a class, or range of events, the more precisely it is quantified. 
Despite the dynamic nature of our technique, we build upon tight theoretic bounds covering both worst-case error, as well as the required memory. In the limit, it is known that error and the memory bounds can be independent of the stream size and grow only linearly with the level of precision desired. Significantly, we expose the critical constants in these algorithms and through careful engineering, algorithm redesign, and use of heuristics, we show how a high-performance profile system can be implemented for range-adaptive profiling. RAP can be used on various profiles, such as PCs, load values, and memory addresses, and has a broad range of uses, from hot-region profiling to quantifying cache miss value locality. We propose two methods of implementation of RAP, one in software and the other with specialized hardware, for which we also describe our prototype FPGA implementation. We show that with just 8KB of memory, range profiles can be gathered with an average accuracy of 98\%.", acknowledgement = ack-nhfb, articleno = "2", keywords = "profiling hardware; range adaptive; value locality", } @Article{Zhai:2008:CHS, author = "Antonia Zhai and J. Gregory Steffan and Christopher B. Colohan and Todd C. Mowry", title = "Compiler and hardware support for reducing the synchronization of speculative threads", journal = j-TACO, volume = "5", number = "1", pages = "3:1--3:??", month = may, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1369396.1369399", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Thread-level speculation (TLS) allows us to automatically parallelize general-purpose programs by supporting parallel execution of threads that might not actually be independent. 
In this article, we focus on one important limitation of program performance under TLS, which stalls as a result of synchronizing and forwarding scalar values between speculative threads that would otherwise cause frequent data dependences and, hence, failed speculation. Using SPECint benchmarks that have been automatically transformed by our compiler to exploit TLS, we present, evaluate in detail, and compare both compiler and hardware techniques for improving the communication of scalar values. We find that through our dataflow algorithms for three increasingly aggressive instruction scheduling techniques, the compiler can drastically reduce the critical forwarding path introduced by the synchronization and forwarding of scalar values. We also show that hardware techniques for reducing synchronization can be complementary to compiler scheduling, but that the additional performance benefits are minimal and are generally not worth the cost.", acknowledgement = ack-nhfb, articleno = "3", keywords = "automatic parallelization; chip-multiprocessing; instruction scheduling; thread-level speculation", } @Article{Winter:2008:ATN, author = "Jonathan A. Winter and David H. Albonesi", title = "Addressing thermal nonuniformity in {SMT} workloads", journal = j-TACO, volume = "5", number = "1", pages = "4:1--4:??", month = may, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1369396.1369400", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We explore DTM techniques within the context of uniform and nonuniform SMT workloads. While DVS is suitable for addressing workloads with uniformly high temperatures, for nonuniform workloads, performance loss occurs because of the slowdown of the cooler thread. To address this, we propose and evaluate DTM mechanisms that exploit the steering-based thread management mechanisms inherent in a clustered SMT architecture. 
We show that in contrast to DVS, which operates globally, our techniques are more effective at controlling temperature for nonuniform workloads. Furthermore, we devise a DTM technique that combines steering and DVS to achieve consistently good performance across all workloads.", acknowledgement = ack-nhfb, articleno = "4", keywords = "adaptive microarchitectures; clustered microarchitectures; dynamic thermal management; dynamic voltage scaling; simultaneous multithreading", } @Article{Shahbahrami:2008:VES, author = "Asadollah Shahbahrami and Ben Juurlink and Stamatis Vassiliadis", title = "Versatility of extended subwords and the matrix register file", journal = j-TACO, volume = "5", number = "1", pages = "5:1--5:??", month = may, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1369396.1369401", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Extended subwords and the matrix register file (MRF) are two micro architectural techniques that address some of the limitations of existing SIMD architectures. Extended subwords are wider than the data stored in memory. Specifically, for every byte of data stored in memory, there are four extra bits in the media register file. This avoids the need for data-type conversion instructions. The MRF is a register file organization that provides both conventional row-wise, as well as column-wise, access to the register file. In other words, it allows to view the register file as a matrix in which corresponding subwords in different registers corresponds to a column of the matrix. It was introduced to accelerate matrix transposition which is a very common operation in multimedia applications. In this paper, we show that the MRF is very versatile, since it can also be used for other permutations than matrix transposition. 
Specifically, it is shown how it can be used to provide efficient access to strided data, as is needed in, e.g., color space conversion. Furthermore, it is shown that special-purpose instructions (SPIs), such as the sum-of-absolute differences (SAD) instruction, have limited usefulness when extended subwords and a few general SIMD instructions that we propose are supported, for the following reasons. First, when extended subwords are supported, the SAD instruction provides only a relatively small performance improvement. Second, the SAD instruction processes 8-bit subwords only, which is not sufficient for quarter-pixel resolution nor for cost functions used in image and video retrieval. Results obtained by extending the SimpleScalar toolset show that the proposed techniques provide a speedup of up to 3.00 over the MMX architecture. The results also show that using, at most, 13 extra media registers yields an additional performance improvement ranging from 1.38 to 1.57.", acknowledgement = ack-nhfb, articleno = "5", keywords = "multimedia standards; SIMD architectures; SIMD programming", } @Article{Guo:2008:EHC, author = "Zhi Guo and Walid Najjar and Betul Buyukkurt", title = "Efficient hardware code generation for {FPGAs}", journal = j-TACO, volume = "5", number = "1", pages = "6:1--6:??", month = may, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1369396.1369402", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The wider acceptance of FPGAs as a computing device requires a higher level of programming abstraction. ROCCC is an optimizing C to HDL compiler. We describe the code generation approach in ROCCC. The smart buffer is a component that reuses input data between adjacent iterations. It significantly improves the performance of the circuit and simplifies loop control. 
The ROCCC-generated datapath can execute one loop iteration per clock cycle when there is no loop dependency or there is only scalar recurrence variable dependency. ROCCC's approach to supporting while-loops operating on scalars makes the compiler able to move scalar iterative computation into hardware.", acknowledgement = ack-nhfb, articleno = "6", keywords = "data reuse; FPGA; high-level synthesis; reconfigurable computing; VHDL", } @Article{Kotzmann:2008:DJH, author = "Thomas Kotzmann and Christian Wimmer and Hanspeter M{\"o}ssenb{\"o}ck and Thomas Rodriguez and Kenneth Russell and David Cox", title = "Design of the {Java HotSpot\TM} client compiler for {Java 6}", journal = j-TACO, volume = "5", number = "1", pages = "7:1--7:??", month = may, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1369396.1370017", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Version 6 of Sun Microsystems' Java HotSpot{\TM} VM ships with a redesigned version of the client just-in-time compiler that includes several research results of the last years. The client compiler is at the heart of the VM configuration used by default for interactive desktop applications. For such applications, low startup and pause times are more important than peak performance. This paper outlines the new architecture of the client compiler and shows how it interacts with the VM. It presents the intermediate representation that now uses static single-assignment (SSA) form and the linear scan algorithm for global register allocation. Efficient support for exception handling and deoptimization fulfills the demands that are imposed by the dynamic features of the Java programming language. The evaluation shows that the new client compiler generates better code in less time. The popular SPECjvm98 benchmark suite is executed 45\% faster, while the compilation speed is also up to 40\% better. 
This indicates that a carefully selected set of global optimizations can also be integrated in just-in-time compilers that focus on compilation speed and not on peak performance. In addition, the paper presents the impact of several optimizations on execution and compilation speed. As the source code is freely available, the Java HotSpot{\TM} VM and the client compiler are the ideal basis for experiments with new feedback-directed optimizations in a production-level Java just-in-time compiler. The paper outlines research projects that add fast algorithms for escape analysis, automatic object inlining, and array bounds check elimination.", acknowledgement = ack-nhfb, articleno = "7", keywords = "compiler; deoptimization; intermediate representation; Java; just-in-time compilation; optimization; register allocation", } @Article{Rangan:2008:PSD, author = "Ram Rangan and Neil Vachharajani and Guilherme Ottoni and David I. August", title = "Performance scalability of decoupled software pipelining", journal = j-TACO, volume = "5", number = "2", pages = "8:1--8:??", month = aug, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1400112.1400113", ISSN = "1544-3566", bibdate = "Thu Aug 28 13:25:00 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Any successful solution to using multicore processors to scale general-purpose program performance will have to contend with rising intercore communication costs while exposing coarse-grained parallelism. Recently proposed pipelined multithreading (PMT) techniques have been demonstrated to have general-purpose applicability and are also able to effectively tolerate inter-core latencies through pipelined interthread communication. These desirable properties make PMT techniques strong candidates for program parallelization on current and future multicore processors and understanding their performance characteristics is critical to their deployment. 
To that end, this paper evaluates the performance scalability of a general-purpose PMT technique called decoupled software pipelining (DSWP) and presents a thorough analysis of the communication bottlenecks that must be overcome for optimal DSWP scalability.", acknowledgement = ack-nhfb, articleno = "8", keywords = "decoupled software pipelining; performance analysis", } @Article{Long:2008:TMM, author = "Jieyi Long and Seda Ogrenci Memik and Gokhan Memik and Rajarshi Mukherjee", title = "Thermal monitoring mechanisms for chip multiprocessors", journal = j-TACO, volume = "5", number = "2", pages = "9:1--9:??", month = aug, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1400112.1400114", ISSN = "1544-3566", bibdate = "Thu Aug 28 13:25:00 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "With large-scale integration and increasing power densities, thermal management has become an important tool to maintain performance and reliability in modern process technologies. In the core of dynamic thermal management schemes lies accurate reading of on-die temperatures. Therefore, careful planning and embedding of thermal monitoring mechanisms into high-performance systems becomes crucial. In this paper, we propose three techniques to create sensor infrastructures for monitoring the maximum temperature on a multicore system. Initially, we extend a nonuniform sensor placement methodology proposed in the literature to handle chip multiprocessors (CMPs) and show its limitations. We then analyze a grid-based approach where the sensors are placed on a static grid covering each core and show that the sensor readings can differ from the actual maximum core temperature by as much as 12.6$^\circ$C when using 16 sensors per core. Also, as large as 10.6\% of the thermal emergencies are not captured using the same number of sensors. 
Based on this observation, we first develop an interpolation scheme, which estimates the maximum core temperature through interpolation of the readings collected at the static grid points. We show that the interpolation scheme improves the measurement accuracy and emergency coverage compared to grid-based placement when using the same number of sensors. Second, we present a dynamic scheme where only a subset of the sensor readings is collected to predict the maximum temperature of each core. Our results indicate that, we can reduce the number of active sensors by as much as 50\%, while maintaining similar measurement accuracy and emergency coverage compared to the case where the entire sensor set on the grid is sampled at all times.", acknowledgement = ack-nhfb, articleno = "9", keywords = "nonuniform and uniform sensor placement; thermal sensor allocation", } @Article{Joshi:2008:DEP, author = "Ajay Joshi and Lieven Eeckhout and Robert H. {Bell, Jr.} and Lizy K. John", title = "Distilling the essence of proprietary workloads into miniature benchmarks", journal = j-TACO, volume = "5", number = "2", pages = "10:1--10:??", month = aug, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1400112.1400115", ISSN = "1544-3566", bibdate = "Thu Aug 28 13:25:00 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Benchmarks set standards for innovation in computer architecture research and industry product development. Consequently, it is of paramount importance that these workloads are representative of real-world applications. However, composing such representative workloads poses practical challenges to application analysis teams and benchmark developers --- (1) real-world workloads are intellectual property and vendors hesitate to share these proprietary applications; and (2) porting and reducing these applications to benchmarks that can be simulated in a tractable amount of time is a nontrivial task. 
In this paper, we address this problem by proposing a technique that automatically distills key inherent behavioral attributes of a proprietary workload and captures them into a miniature synthetic benchmark clone. The advantage of the benchmark clone is that it hides the functional meaning of the code but exhibits similar performance characteristics as the target application. Moreover, the dynamic instruction count of the synthetic benchmark clone is substantially shorter than the proprietary application, greatly reducing overall simulation time --- for SPEC CPU, the simulation time reduction is over five orders of magnitude compared to entire benchmark execution. Using a set of benchmarks representative of general-purpose, scientific, and embedded applications, we demonstrate that the power and performance characteristics of the synthetic benchmark clone correlate well with those of the original application across a wide range of microarchitecture configurations.", acknowledgement = ack-nhfb, articleno = "10", keywords = "benchmark cloning; benchmarks; workload characterization", } @Article{Catania:2008:RCM, author = "Vincenzo Catania and Maurizio Palesi and Davide Patti", title = "Reducing complexity of multiobjective design space exploration in {VLIW}-based embedded systems", journal = j-TACO, volume = "5", number = "2", pages = "11:1--11:??", month = aug, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1400112.1400116", ISSN = "1544-3566", bibdate = "Thu Aug 28 13:25:00 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Architectures based on very-long instruction word (VLIW) have found fertile ground in multimedia electronic appliances thanks to their ability to exploit high degrees of instruction level parallelism (ILP) with a reasonable trade-off in complexity and silicon cost. 
Specialization of such architectures involves the configuration of both hardware-related aspects (e.g., register files, functional units, memory subsystem) and software-related issues (e.g., the compilation strategy). The complex interactions between the components of such systems will force a human designer to rely on judgment and experience in designing them, possibly eliminating interesting configurations, and making tuning of the system, for either power, energy, or performance, difficult. In this paper we propose tools and methodologies to efficiently cope with this complexity from a multiobjective perspective. We first analyze the impact of ILP-oriented code transformations using two alternative compilation profiles to quantitatively show the effect of such transformations on typical design objectives like performance, power dissipation, and energy consumption. Next, by means of statistical analysis, we collect useful data to predict the effectiveness of a given compilation profiles for a specific application. Information gathered from such analysis can be exploited to drastically reduce the computational effort needed to perform the design space exploration.", acknowledgement = ack-nhfb, articleno = "11", keywords = "design space exploration; energy; genetic algorithms; hyperblock formation; ILP; multiobjective optimization; performances; power; statistical analysis; VLIW architectures", }