%%% -*-BibTeX-*-
%%% ====================================================================
%%% BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.14",
%%%     date            = "28 August 2008",
%%%     time            = "13:26:22 MDT",
%%%     filename        = "taco.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "http://www.math.utah.edu/~beebe",
%%%     checksum        = "59331 2653 13011 118493",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "ACM Transactions on Architecture and Code
%%%                        Optimization; bibliography; TACO",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        ACM Transactions on Architecture and Code
%%%                        Optimization (CODEN ????, ISSN 1544-3566),
%%%                        covering all journal issues from 2004 --
%%%                        date.
%%%
%%%                        At version 1.14, the COMPLETE journal
%%%                        coverage looked like this:
%%%
%%%                             2004 (  17)    2006 (  19)    2008 (  17)
%%%                             2005 (  17)    2007 (  19)
%%%
%%%                             Article:         89
%%%
%%%                             Total entries:   89
%%%
%%%                        The journal Web page can be found at:
%%%
%%%                            http://www.acm.org/pubs/taco.html
%%%
%%%                        The journal table of contents page is at:
%%%
%%%                            http://www.acm.org/taco/
%%%                            http://portal.acm.org/browse_dl.cfm?linked=1&part=transaction&idx=J924
%%%
%%%                        Qualified subscribers can retrieve the full
%%%                        text of recent articles in PDF form.
%%%
%%%                        The initial draft was extracted from the ACM
%%%                        Web pages.
%%%
%%%                        ACM copyrights explicitly permit abstracting
%%%                        with credit, so article abstracts, keywords,
%%%                        and subject classifications have been
%%%                        included in this bibliography wherever
%%%                        available.  Article reviews have been
%%%                        omitted, until their copyright status has
%%%                        been clarified.
%%%
%%%                        bibsource keys in the bibliography entries
%%%                        below indicate the entry originally came
%%%                        from the computer science bibliography
%%%                        archive, even though it has likely since
%%%                        been corrected and updated.
%%%
%%%                        URL keys in the bibliography point to
%%%                        World Wide Web locations of additional
%%%                        information about the entry.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, using ``bibsort -byvolume.''
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility."
%%%     }
%%% ====================================================================

%%% Preamble: raw TeX handed to the bibliography style, which normally
%%% writes it at the top of the generated .bbl file.  The two quoted
%%% pieces are concatenated with `#': the first reads in bibnames.sty,
%%% the second defines \TM as a superscript ``TM'' set with \sc.
@Preamble{"\input bibnames.sty" #
    "\def \TM {${}^{\sc TM}$}"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:

%%% ack-nhfb: postal, telephone, e-mail, and Web contact details for
%%% Nelson H. F. Beebe (named as this file's author in the header).
%%% Entries refer to it through their acknowledgement field.
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:

@String{j-TACO = {ACM Transactions on Architecture and Code Optimization}}

%%% ====================================================================
%%% Bibliography entries:

@Article{Calder:2004:I,
  author          = {Brad Calder and Dean Tullsen},
  title           = {Introduction},
  journal         = j-TACO,
  volume          = {1},
  number          = {1},
  pages           = {1--2},
  month           = mar,
  year            = {2004},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Aug 5 07:08:09 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2004:RIC,
  author          = {W. Zhang and J. S. Hu and V. Degalahal and
                     M. Kandemir and N. Vijaykrishnan and M. J. Irwin},
  title           = {Reducing instruction cache energy consumption
                     using a compiler-based strategy},
  journal         = j-TACO,
  volume          = {1},
  number          = {1},
  pages           = {3--33},
  month           = mar,
  year            = {2004},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Aug 5 07:08:09 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Isailovic:2004:DCQ,
  author          = {Nemanja Isailovic and Mark Whitney and
                     Yatish Patel and John Kubiatowicz and
                     Dean Copsey and Frederic T. Chong and
                     Isaac L. Chuang and Mark Oskin},
  title           = {Datapath and control for quantum wires},
  journal         = j-TACO,
  volume          = {1},
  number          = {1},
  pages           = {34--61},
  month           = mar,
  year            = {2004},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Aug 5 07:08:09 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Sankaralingam:2004:TPA,
  author          = {Karthikeyan Sankaralingam and Ramadass Nagarajan and
                     Haiming Liu and Changkyu Kim and Jaehyuk Huh and
                     Nitya Ranganathan and Doug Burger and
                     Stephen W. Keckler and Robert G. McDonald and
                     Charles R. Moore},
  title           = {{TRIPS}: {A} polymorphous architecture for
                     exploiting {ILP}, {TLP}, and {DLP}},
  journal         = j-TACO,
  volume          = {1},
  number          = {1},
  pages           = {62--93},
  month           = mar,
  year            = {2004},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Aug 5 07:08:09 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Skadron:2004:TAM,
  author          = {Kevin Skadron and Mircea R. Stan and
                     Karthik Sankaranarayanan and Wei Huang and
                     Sivakumar Velusamy and David Tarjan},
  title           = {Temperature-aware microarchitecture: {Modeling}
                     and implementation},
  journal         = j-TACO,
  volume          = {1},
  number          = {1},
  pages           = {94--125},
  month           = mar,
  year            = {2004},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Aug 5 07:08:09 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Aleta:2004:RCC,
  author          = {Alex Alet{\`a} and Josep M. Codina and
                     Antonio Gonz{\'a}lez and David Kaeli},
  title           = {Removing communications in clustered
                     microarchitectures through instruction
                     replication},
  journal         = j-TACO,
  volume          = {1},
  number          = {2},
  pages           = {127--151},
  month           = jun,
  year            = {2004},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Aug 5 07:08:10 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Bai:2004:LPO,
  author          = {Yu Bai and R. Iris Bahar},
  title           = {A low-power in-order\slash out-of-order issue queue},
  journal         = j-TACO,
  volume          = {1},
  number          = {2},
  pages           = {152--179},
  month           = jun,
  year            = {2004},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Aug 5 07:08:10 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Juang:2004:IBP,
  author          = {Philo Juang and Kevin Skadron and
                     Margaret Martonosi and Zhigang Hu and
                     Douglas W. Clark and Philip W. Diodato and
                     Stefanos Kaxiras},
  title           = {Implementing branch-predictor decay using
                     quasi-static memory cells},
  journal         = j-TACO,
  volume          = {1},
  number          = {2},
  pages           = {180--219},
  month           = jun,
  year            = {2004},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Aug 5 07:08:10 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Santana:2004:LCF,
  author          = {Oliverio J. Santana and Alex Ramirez and
                     Josep L. Larriba-Pey and Mateo Valero},
  title           = {A low-complexity fetch architecture for
                     high-performance superscalar processors},
  journal         = j-TACO,
  volume          = {1},
  number          = {2},
  pages           = {220--245},
  month           = jun,
  year            = {2004},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Aug 5 07:08:10 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Lin:2004:CFS,
  author          = {Jin Lin and Tong Chen and Wei-Chung Hsu and
                     Pen-Chung Yew and Roy Dz-Ching Ju and
                     Tin-Fook Ngai and Sun Chan},
  title           = {A compiler framework for speculative optimizations},
  journal         = j-TACO,
  volume          = {1},
  number          = {3},
  pages           = {247--271},
  month           = sep,
  year            = {2004},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Fri Oct 29 06:39:45 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Fields:2004:ICS,
  author          = {Brian A. Fields and Rastislav Bodik and
                     Mark D. Hill and Chris J. Newburn},
  title           = {Interaction cost and shotgun profiling},
  journal         = j-TACO,
  volume          = {1},
  number          = {3},
  pages           = {272--304},
  month           = sep,
  year            = {2004},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Fri Oct 29 06:39:45 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Sankaranarayanan:2004:PBA,
  author          = {Karthik Sankaranarayanan and Kevin Skadron},
  title           = {Profile-based adaptation for cache decay},
  journal         = j-TACO,
  volume          = {1},
  number          = {3},
  pages           = {305--322},
  month           = sep,
  year            = {2004},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Fri Oct 29 06:39:45 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Xie:2004:IDV,
  author          = {Fen Xie and Margaret Martonosi and Sharad Malik},
  title           = {Intraprogram dynamic voltage scaling: {Bounding}
                     opportunities with analytic modeling},
  journal         = j-TACO,
  volume          = {1},
  number          = {3},
  pages           = {323--367},
  month           = sep,
  year            = {2004},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Fri Oct 29 06:39:45 MDT 2004},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Hartstein:2004:OPD,
  author          = {A. Hartstein and Thomas R. Puzak},
  title           = {The optimum pipeline depth considering both
                     power and performance},
  journal         = j-TACO,
  volume          = {1},
  number          = {4},
  pages           = {369--388},
  month           = dec,
  year            = {2004},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Apr 14 12:17:47 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Cristal:2004:TKI,
  author          = {Adri{\'a}n Cristal and Oliverio J. Santana and
                     Mateo Valero and Jos{\'e} F. Mart{\'\i}nez},
  title           = {Toward kilo-instruction processors},
  journal         = j-TACO,
  volume          = {1},
  number          = {4},
  pages           = {389--417},
  month           = dec,
  year            = {2004},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Apr 14 12:17:47 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Akkary:2004:ARE,
  author          = {Haitham Akkary and Ravi Rajwar and
                     Srikanth T. Srinivasan},
  title           = {An analysis of a resource efficient checkpoint
                     architecture},
  journal         = j-TACO,
  volume          = {1},
  number          = {4},
  pages           = {418--444},
  month           = dec,
  year            = {2004},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Apr 14 12:17:47 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Yang:2004:TML,
  author          = {Chia-Lin Yang and Alvin R. Lebeck and
                     Hung-Wei Tseng and Chien-Hao Lee},
  title           = {Tolerating memory latency through push
                     prefetching for pointer-intensive applications},
  journal         = j-TACO,
  volume          = {1},
  number          = {4},
  pages           = {445--475},
  month           = dec,
  year            = {2004},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Apr 14 12:17:47 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Calder:2005:I,
  author          = {Brad Calder and Dean Tullsen},
  title           = {Introduction},
  journal         = j-TACO,
  volume          = {2},
  number          = {1},
  pages           = {1--2},
  month           = mar,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Mon May 2 11:13:58 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Zhou:2005:EFA,
  author          = {Yuanyuan Zhou and Pin Zhou and Feng Qin and
                     Wei Liu and Josep Torrellas},
  title           = {Efficient and flexible architectural support
                     for dynamic monitoring},
  journal         = j-TACO,
  volume          = {2},
  number          = {1},
  pages           = {3--33},
  month           = mar,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Mon May 2 11:13:58 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2005:WHC,
  author          = {Chuanjun Zhang and Frank Vahid and Jun Yang and
                     Walid Najjar},
  title           = {A way-halting cache for low-energy
                     high-performance systems},
  journal         = j-TACO,
  volume          = {2},
  number          = {1},
  pages           = {34--54},
  month           = mar,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Mon May 2 11:13:58 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

%%% The first word after the title colon is braced so that styles which
%%% downcase titles keep its capital letter.
@Article{Abella:2005:ISP,
  author =       "Jaume Abella and Antonio Gonz{\'a}lez and Xavier Vera
                 and Michael F. P. O'Boyle",
  title =        "{IATAC}: {A} smart predictor to turn-off {L2} cache
                 lines",
  journal =      j-TACO,
  volume =       "2",
  number =       "1",
  pages =        "55--77",
  month =        mar,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Mon May 2 11:13:58 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

%%% The first author is written in BibTeX's `Last, Jr., First' form so
%%% that the suffix is parsed as the Jr part of the name rather than as
%%% part of the family name.
@Article{Haskins:2005:AWS,
  author =       "Haskins, Jr., John W. and Kevin Skadron",
  title =        "Accelerated warmup for sampled microarchitecture
                 simulation",
  journal =      j-TACO,
  volume =       "2",
  number =       "1",
  pages =        "78--108",
  month =        mar,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Mon May 2 11:13:58 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Li:2005:ABT,
  author          = {Tao Li and Ravi Bhargava and Lizy Kurian John},
  title           = {Adapting branch-target buffer to improve the
                     target predictability of {Java} code},
  journal         = j-TACO,
  volume          = {2},
  number          = {2},
  pages           = {109--130},
  month           = jun,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Jul 7 14:09:53 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2005:DIE,
  author          = {Lingli Zhang and Chandra Krintz},
  title           = {The design, implementation, and evaluation of
                     adaptive code unloading for resource-constrained
                     devices},
  journal         = j-TACO,
  volume          = {2},
  number          = {2},
  pages           = {131--164},
  month           = jun,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Jul 7 14:09:53 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Kulkarni:2005:FES,
  author          = {Prasad A. Kulkarni and Stephen R. Hines and
                     David B. Whalley and Jason D. Hiser and
                     Jack W. Davidson and Douglas L. Jones},
  title           = {Fast and efficient searches for effective
                     optimization-phase sequences},
  journal         = j-TACO,
  volume          = {2},
  number          = {2},
  pages           = {165--198},
  month           = jun,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Jul 7 14:09:53 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

%%% The control space after ``vs.'' in the title tells TeX that the
%%% period ends an abbreviation, not a sentence, so an ordinary
%%% interword space is set there.
@Article{Salami:2005:DMI,
  author =       "Esther Salam{\'\i} and Mateo Valero",
  title =        "Dynamic memory interval test vs.\ interprocedural
                 pointer analysis in multimedia applications",
  journal =      j-TACO,
  volume =       "2",
  number =       "2",
  pages =        "199--219",
  month =        jun,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Thu Jul 7 14:09:53 MDT 2005",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Meng:2005:ELL,
  author          = {Yan Meng and Timothy Sherwood and Ryan Kastner},
  title           = {Exploring the limits of leakage power reduction
                     in caches},
  journal         = j-TACO,
  volume          = {2},
  number          = {3},
  pages           = {221--246},
  month           = sep,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Wed Oct 5 07:42:22 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Garzaran:2005:TBS,
  author          = {Mar{\'\i}a Jes{\'u}s Garzar{\'a}n and
                     Milos Prvulovic and
                     Jos{\'e} Mar{\'\i}a Llaber{\'\i}a and
                     V{\'\i}ctor Vi{\~n}als and
                     Lawrence Rauchwerger and Josep Torrellas},
  title           = {Tradeoffs in buffering speculative memory state
                     for thread-level speculation in multiprocessors},
  journal         = j-TACO,
  volume          = {2},
  number          = {3},
  pages           = {247--279},
  month           = sep,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Wed Oct 5 07:42:22 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Tarjan:2005:MPG,
  author          = {David Tarjan and Kevin Skadron},
  title           = {Merging path and gshare indexing in perceptron
                     branch prediction},
  journal         = j-TACO,
  volume          = {2},
  number          = {3},
  pages           = {280--300},
  month           = sep,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Wed Oct 5 07:42:22 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2005:WET,
  author          = {Xiangyu Zhang and Rajiv Gupta},
  title           = {Whole execution traces and their applications},
  journal         = j-TACO,
  volume          = {2},
  number          = {3},
  pages           = {301--334},
  month           = sep,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Wed Oct 5 07:42:22 MDT 2005},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Zhao:2005:IWA,
  author          = {Wankang Zhao and David Whalley and
                     Christopher Healy and Frank Mueller},
  title           = {Improving {WCET} by applying a {WC}
                     code-positioning optimization},
  journal         = j-TACO,
  volume          = {2},
  number          = {4},
  pages           = {335--365},
  month           = dec,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Feb 16 11:03:13 MST 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
  keywords        = {WCET (worst case execution time); WC (worst case)},
}

@Article{Reis:2005:SCF,
  author          = {George A. Reis and Jonathan Chang and
                     Neil Vachharajani and Ram Rangan and
                     David I. August and Shubhendu S. Mukherjee},
  title           = {Software-controlled fault tolerance},
  journal         = j-TACO,
  volume          = {2},
  number          = {4},
  pages           = {366--396},
  month           = dec,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Feb 16 11:03:13 MST 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Li:2005:PPC,
  author          = {Jian Li and Jos{\'e} F. Mart{\'\i}nez},
  title           = {Power-performance considerations of parallel
                     computing on chip multiprocessors},
  journal         = j-TACO,
  volume          = {2},
  number          = {4},
  pages           = {397--422},
  month           = dec,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Feb 16 11:03:13 MST 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Sharma:2005:SPE,
  author          = {Saurabh Sharma and Jesse G. Beu and
                     Thomas M. Conte},
  title           = {Spectral prefetcher: {An} effective mechanism
                     for {L2} cache prefetching},
  journal         = j-TACO,
  volume          = {2},
  number          = {4},
  pages           = {423--450},
  month           = dec,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu Feb 16 11:03:13 MST 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Calder:2006:I,
  author          = {Brad Calder and Dean Tullsen},
  title           = {Introduction},
  journal         = j-TACO,
  volume          = {3},
  number          = {1},
  pages           = {1--2},
  month           = mar,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Tan:2006:BSS,
  author          = {Lin Tan and Brett Brotherton and Timothy Sherwood},
  title           = {Bit-split string-matching engines for intrusion
                     detection and prevention},
  journal         = j-TACO,
  volume          = {3},
  number          = {1},
  pages           = {3--34},
  month           = mar,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Nagpurkar:2006:ERP,
  author          = {Priya Nagpurkar and Hussam Mousa and
                     Chandra Krintz and Timothy Sherwood},
  title           = {Efficient remote profiling for
                     resource-constrained devices},
  journal         = j-TACO,
  volume          = {3},
  number          = {1},
  pages           = {35--66},
  month           = mar,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Lin:2006:RCG,
  author          = {Jin Lin and Wei-Chung Hsu and Pen-Chung Yew and
                     Roy Dz-Ching Ju and Tin-Fook Ngai},
  title           = {Recovery code generation for general speculative
                     optimizations},
  journal         = j-TACO,
  volume          = {3},
  number          = {1},
  pages           = {67--89},
  month           = mar,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Choi:2006:ORR,
  author          = {Yoonseo Choi and Hwansoo Han},
  title           = {Optimal register reassignment for register stack
                     overflow minimization},
  journal         = j-TACO,
  volume          = {3},
  number          = {1},
  pages           = {90--114},
  month           = mar,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Xue:2006:LOA,
  author          = {Jingling Xue and Qiong Cai},
  title           = {A lifetime optimal algorithm for speculative {PRE}},
  journal         = j-TACO,
  volume          = {3},
  number          = {2},
  pages           = {115--155},
  month           = jun,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Fri Jun 9 06:47:22 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Sharkey:2006:IPT,
  author          = {Joseph J. Sharkey and Dmitry V. Ponomarev and
                     Kanad Ghose and Oguz Ergin},
  title           = {Instruction packing: {Toward} fast and
                     energy-efficient instruction scheduling},
  journal         = j-TACO,
  volume          = {3},
  number          = {2},
  pages           = {156--181},
  month           = jun,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Fri Jun 9 06:47:22 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Ceze:2006:CUC,
  author          = {Luis Ceze and Karin Strauss and James Tuck and
                     Josep Torrellas and Jose Renau},
  title           = {{CAVA}: {Using} checkpoint-assisted value
                     prediction to hide {L2} misses},
  journal         = j-TACO,
  volume          = {3},
  number          = {2},
  pages           = {182--208},
  month           = jun,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Fri Jun 9 06:47:22 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2006:EAR,
  author          = {Lixin Zhang and Mike Parker and John Carter},
  title           = {Efficient address remapping in distributed
                     shared-memory systems},
  journal         = j-TACO,
  volume          = {3},
  number          = {2},
  pages           = {209--229},
  month           = jun,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566},
  bibdate         = {Fri Jun 9 06:47:22 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@Article{Zhao:2006:ATP,
  author          = {Min Zhao and Bruce R. Childers and Mary Lou Soffa},
  title           = {An approach toward profit-driven optimization},
  journal         = j-TACO,
  volume          = {3},
  number          = {3},
  pages           = {231--262},
  month           = sep,
  year            = {2006},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1162690.1162691},
  ISSN            = {1544-3566},
  bibdate         = {Sat Sep 23 07:54:36 MDT 2006},
  bibsource       = {http://portal.acm.org/},
  abstract        = {Although optimizations have been applied for a number
                     of years to improve the performance of software,
                     problems with respect to the application of
                     optimizations have not been adequately addressed. For
                     example, in certain circumstances, optimizations may
                     degrade performance. However, there is no efficient way
                     to know when a degradation will occur. In this
                     research, we investigate the profitability of
                     optimizations, which is useful for determining the
                     benefit of applying optimizations. We develop a
                     framework that enables us to predict profitability
                     using analytic models. The profitability of an
                     optimization depends on code context, the particular
                     optimization, and machine resources. Thus, our
                     framework has analytic models for each of these
                     components. As part of the framework, there is also a
                     profitability engine that uses models to predict the
                     profit. In this paper, we target scalar optimizations
                     and, in particular, describe the models for partial
                     redundancy elimination (PRE), loop invariant code
                     motion (LICM), and value numbering (VN). We implemented
                     the framework for predicting the profitability of these
                     optimizations. Based on the predictions, we can
                     selectively apply profitable optimizations. We compared
                     the profit-driven approach with an approach that uses a
                     heuristic in deciding when optimizations should be
                     applied. Our experiments demonstrate that the
                     profitability of scalar optimizations can be accurately
                     predicted by using models. That is, without actually
                     applying a scalar optimization, we can determine if an
                     optimization is beneficial and should be applied.},
  acknowledgement = ack-nhfb,
}

@article{Hazelwood:2006:MBC,
  author       = {Kim Hazelwood and Michael D. Smith},
  title        = {Managing bounded code caches in dynamic binary
                 optimization systems},
  journal      = j-TACO,
  volume       = {3},
  number       = {3},
  pages        = {263--294},
  month        = sep,
  year         = {2006},
  coden        = {????},
  doi          = {http://doi.acm.org/10.1145/1162690.1162692},
  issn         = {1544-3566},
  bibdate      = {Sat Sep 23 07:54:36 MDT 2006},
  bibsource    = {http://portal.acm.org/},
  abstract     = {Dynamic binary optimizers store altered copies of
                 original program instructions in software-managed code
                 caches in order to maximize reuse of transformed code.
                 Code caches store code blocks that may vary in size,
                 reference other code blocks, and carry a high
                 replacement overhead. These unique constraints reduce
                 the effectiveness of conventional cache management
                 policies. Our work directly addresses these unique
                 constraints and presents several contributions to the
                 code-cache management problem. First, we show that
                 evicting more than the minimum number of code blocks
                 from the code cache results in less run-time overhead
                 than the existing alternatives. Such granular evictions
                 reduce overall execution time, as the fixed costs of
                 invoking the eviction mechanism are amortized across
                 multiple cache insertions. Second, a study of the ideal
                 lifetimes of dynamically generated code blocks
                 illustrates the benefit of a replacement algorithm
                 based on a generational heuristic. We describe and
                 evaluate a generational approach to code cache
                 management that makes it easy to identify long-lived
                 code blocks and simultaneously avoid any fragmentation
                 because of the eviction of short-lived blocks. Finally,
                 we present results from an implementation of our
                 generational approach in the DynamoRIO framework and
                 illustrate that, as dynamic optimization systems become
                 more prevalent, effective code cache-management
                 policies will be essential for reliable, scalable
                 performance of modern applications.},
  acknowledgement = ack-nhfb,
}

@article{Rochecouste:2006:CCE,
  author       = {Olivier Rochecouste and Gilles Pokam and Andr{\'e}
                 Seznec},
  title        = {A case for a complexity-effective, width-partitioned
                 microarchitecture},
  journal      = j-TACO,
  volume       = {3},
  number       = {3},
  pages        = {295--326},
  month        = sep,
  year         = {2006},
  coden        = {????},
  doi          = {http://doi.acm.org/10.1145/1162690.1162693},
  issn         = {1544-3566},
  bibdate      = {Sat Sep 23 07:54:36 MDT 2006},
  bibsource    = {http://portal.acm.org/},
  abstract     = {The analysis of program executions reveals that most
                 integer and multimedia applications make heavy use of
                 narrow-width operations, i.e., instructions exclusively
                 using narrow-width operands and producing a
                 narrow-width result. Moreover, this usage is relatively
                 well distributed over the application. We observed this
                 program property on the MediaBench and SPEC2000
                 benchmarks with about 40\% of the instructions being
                 narrow-width operations. Current superscalar processors
                 use 64-bit datapaths to execute all the instructions of
                 the applications. In this paper, we suggest the use of
                 a width-partitioned microarchitecture (WPM) to master
                 the hardware complexity of a superscalar processor. For
                 a four-way issue machine, we split the processor in two
                 two-way clusters: the main cluster executing 64-bit
                 operations, load/store, and complex operations and a
                 narrow cluster executing the 16-bit operations. We
                 resort to partitioning to decouple the treatment of the
                 narrow-width operations from that of the other program
                 instructions. This provides the benefit of greatly
                 simplifying the design of the critical processor
                 components in each cluster (e.g., the register file and
                 the bypass network). The dynamic interleaving of the
                 two instruction types allows maintaining the workload
                 balanced among clusters. WPM also helps to reduce the
                 complexity of the interconnection fabric and of the
                 issue logic. In fact, since the 16-bit cluster can only
                 communicate narrow-width data, the datapath-width of
                 the interconnect fabric can be significantly reduced,
                 yielding a corresponding saving of the interconnect
                 power and area. We explore different possible
                 configurations of WPM, discussing the various
                 implementation tradeoffs. We also examine a speculative
                 steering heuristic to distribute the narrow-width
                 operations among clusters. A detailed analysis of the
                 complexity factors shows using WPM instead of a
                 classical 64-bit two-cluster microarchitecture can save
                 power and silicon area with a minimal impact on the
                 overall performance.},
  acknowledgement = ack-nhfb,
}

@article{Zmily:2006:BAI,
  author       = {Ahmad Zmily and Christos Kozyrakis},
  title        = {Block-aware instruction set architecture},
  journal      = j-TACO,
  volume       = {3},
  number       = {3},
  pages        = {327--357},
  month        = sep,
  year         = {2006},
  coden        = {????},
  doi          = {http://doi.acm.org/10.1145/1162690.1162694},
  issn         = {1544-3566},
  bibdate      = {Sat Sep 23 07:54:36 MDT 2006},
  bibsource    = {http://portal.acm.org/},
  abstract     = {Instruction delivery is a critical component for
                 wide-issue, high-frequency processors since its
                 bandwidth and accuracy place an upper limit on
                 performance. The processor front-end accuracy and
                 bandwidth are limited by instruction-cache misses,
                 multicycle instruction-cache accesses, and target or
                 direction mispredictions for control-flow operations.
                 This paper presents a block-aware instruction set
                 (BLISS) that allows software to assist with front-end
                 challenges. BLISS defines basic block descriptors that
                 are stored separately from the actual instructions in a
                 program. We show that BLISS allows for a decoupled
                 front-end that tolerates instruction-cache latency,
                 facilitates instruction prefetching, and leads to
                 higher prediction accuracy.},
  acknowledgement = ack-nhfb,
}

@article{Crandall:2006:MAS,
  author       = {Jedidiah R. Crandall and S. Felix Wu and Frederic T.
                 Chong},
  title        = {{Minos}: {Architectural} support for protecting
                 control data},
  journal      = j-TACO,
  volume       = {3},
  number       = {4},
  pages        = {359--389},
  month        = dec,
  year         = {2006},
  coden        = {????},
  issn         = {1544-3566},
  bibdate      = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource    = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@article{Marathe:2006:ACC,
  author       = {Jaydeep Marathe and Frank Mueller and Bronis R. de
                 Supinski},
  title        = {Analysis of cache-coherence bottlenecks with hybrid
                 hardware\slash software techniques},
  journal      = j-TACO,
  volume       = {3},
  number       = {4},
  pages        = {390--423},
  month        = dec,
  year         = {2006},
  coden        = {????},
  issn         = {1544-3566},
  bibdate      = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource    = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@article{Ganusov:2006:FEP,
  author       = {Ilya Ganusov and Martin Burtscher},
  title        = {Future execution: {A} prefetching mechanism that uses
                 multiple cores to speed up single threads},
  journal      = j-TACO,
  volume       = {3},
  number       = {4},
  pages        = {424--449},
  month        = dec,
  year         = {2006},
  coden        = {????},
  issn         = {1544-3566},
  bibdate      = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource    = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@article{Co:2006:ETC,
  author       = {Michele Co and Dee A. B. Weikle and Kevin Skadron},
  title        = {Evaluating trace cache energy efficiency},
  journal      = j-TACO,
  volume       = {3},
  number       = {4},
  pages        = {450--476},
  month        = dec,
  year         = {2006},
  coden        = {????},
  issn         = {1544-3566},
  bibdate      = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource    = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@article{Hu:2006:EMM,
  author       = {Shiwen Hu and Madhavi Valluri and Lizy Kurian John},
  title        = {Effective management of multiple configurable units
                 using dynamic optimization},
  journal      = j-TACO,
  volume       = {3},
  number       = {4},
  pages        = {477--501},
  month        = dec,
  year         = {2006},
  coden        = {????},
  issn         = {1544-3566},
  bibdate      = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource    = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
}

@article{Bentley:2006:IAB,
  author       = {Chris Bentley and Scott A. Watterson and David K.
                 Lowenthal and Barry Rountree},
  title        = {Implicit array bounds checking on 64-bit
                 architectures},
  journal      = j-TACO,
  volume       = {3},
  number       = {4},
  pages        = {502--527},
  month        = dec,
  year         = {2006},
  coden        = {????},
  doi          = {http://doi.acm.org/10.1145/1187976.1187982},
  issn         = {1544-3566},
  bibdate      = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource    = {http://portal.acm.org/},
  abstract     = {Several programming languages guarantee that array
                 subscripts are checked to ensure they are within the
                 bounds of the array. While this guarantee improves the
                 correctness and security of array-based code, it adds
                 overhead to array references. This has been an obstacle
                 to using higher-level languages, such as Java, for
                 high-performance parallel computing, where the language
                 specification requires that all array accesses must be
                 checked to ensure they are within bounds. This is
                 because, in practice, array-bounds checking in
                 scientific applications may increase execution time by
                 more than a factor of 2. Previous research has explored
                 optimizations to statically eliminate bounds checks,
                 but the dynamic nature of many scientific codes makes
                 this difficult or impossible. Our approach is, instead,
                 to create a compiler and operating system
                 infrastructure that does not generate explicit bounds
                 checks. It instead places arrays inside of Index
                 Confinement Regions (ICRs), which are large, isolated,
                 mostly unmapped virtual memory regions. Any array
                 reference outside of its bounds will cause a protection
                 violation; this provides implicit bounds checking. Our
                 results show that when applying this infrastructure to
                 high-performance computing programs written in Java,
                 the overhead of bounds checking relative to a program
                 with no bounds checks is reduced from an average of
                 63\% to an average of 9\%.},
  acknowledgement = ack-nhfb,
}

@Article{Calder:2007:I,
  author =       "Brad Calder and Dean Tullsen",
  title =        "Introduction",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
  articleno =    "1",
}

@Article{Constantinides:2007:ARC,
  author =       "Kypros Constantinides and Stephen Plaza and Jason
                 Blome and Valeria Bertacco and Scott Mahlke and Todd
                 Austin and Bin Zhang and Michael Orshansky",
  title =        "Architecting a reliable {CMP} switch architecture",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
  articleno =    "2",
}

@Article{Hwang:2007:SSA,
  author =       "Yuan-Shin Hwang and Jia-Jhe Li",
  title =        "Snug set-associative caches: {Reducing} leakage power
                 of instruction and data caches with no performance
                 penalties",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
  articleno =    "6",
}

@Article{Luo:2007:CNP,
  author =       "Yan Luo and Jia Yu and Jun Yang and Laxmi N. Bhuyan",
  title =        "Conserving network processor power consumption by
                 exploiting traffic variability",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
  articleno =    "4",
}

@Article{Rong:2007:SDS,
  author =       "Hongbo Rong and Zhizhong Tang and R. Govindarajan and
                 Alban Douillet and Guang R. Gao",
  title =        "Single-dimension software pipelining for
                 multidimensional loops",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "7:1--7:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
  articleno =    "7",
}

@Article{Sasanka:2007:AES,
  author =       "Ruchira Sasanka and Man-Lap Li and Sarita V. Adve and
                 Yen-Kuang Chen and Eric Debes",
  title =        "{ALP}: {Efficient} support for all levels of
                 parallelism for complex media applications",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
  articleno =    "3",
}

@Article{Soteriou:2007:SDP,
  author =       "Vassos Soteriou and Noel Eisley and Li-Shiuan Peh",
  title =        "Software-directed power-aware interconnection
                 networks",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
  articleno =    "5",
}

@article{Bower:2007:ODH,
  author       = {Fred A. Bower and Daniel J. Sorin and Sule Ozev},
  title        = {Online diagnosis of hard faults in microprocessors},
  journal      = j-TACO,
  volume       = {4},
  number       = {2},
  pages        = {8:1--8:??},
  month        = jun,
  year         = {2007},
  coden        = {????},
  doi          = {http://doi.acm.org/10.1145/1250727.1250728},
  issn         = {1544-3566},
  bibdate      = {Mon Jun 16 11:40:54 MDT 2008},
  bibsource    = {http://portal.acm.org/},
  abstract     = {We develop a microprocessor design that tolerates hard
                 faults, including fabrication defects and in-field
                 faults, by leveraging existing microprocessor
                 redundancy. To do this, we must: detect and correct
                 errors, diagnose hard faults at the field
                 deconfigurable unit (FDU) granularity, and deconfigure
                 FDUs with hard faults. In our reliable microprocessor
                 design, we use DIVA dynamic verification to detect and
                 correct errors. Our new scheme for diagnosing hard
                 faults tracks instructions' core structure occupancy
                 from decode until commit. If a DIVA checker detects an
                 error in an instruction, it increments a small
                 saturating error counter for every FDU used by that
                 instruction, including that DIVA checker. A hard fault
                 in an FDU quickly leads to an above-threshold error
                 counter for that FDU and thus diagnoses the fault. For
                 deconfiguration, we use previously developed schemes
                 for functional units and buffers and present a scheme
                 for deconfiguring DIVA checkers. Experimental results
                 show that our reliable microprocessor quickly and
                 accurately diagnoses each hard fault that is injected
                 and continues to function, albeit with somewhat
                 degraded performance.},
  acknowledgement = ack-nhfb,
  articleno    = {8},
  keywords     = {fine-grained diagnosis; hard fault tolerance;
                 processor microarchitecture},
}

@article{Michaud:2007:STM,
  author       = {Pierre Michaud and Andr{\'e} Seznec and Damien Fetis
                 and Yiannakis Sazeides and Theofanis Constantinou},
  title        = {A study of thread migration in temperature-constrained
                 multicores},
  journal      = j-TACO,
  volume       = {4},
  number       = {2},
  pages        = {9:1--9:??},
  month        = jun,
  year         = {2007},
  coden        = {????},
  doi          = {http://doi.acm.org/10.1145/1250727.1250729},
  issn         = {1544-3566},
  bibdate      = {Mon Jun 16 11:40:54 MDT 2008},
  bibsource    = {http://portal.acm.org/},
  abstract     = {Temperature has become an important constraint in
                 high-performance processors, especially multicores.
                 Thread migration will be essential to exploit the full
                 potential of future thermally constrained multicores.
                 We propose and study a thread migration method that
                 maximizes performance under a temperature constraint,
                 while minimizing the number of migrations and ensuring
                 fairness between threads. We show that thread migration
                 brings important performance gains and that it is most
                 effective during the first tens of seconds following a
                 decrease of the number of running threads.},
  acknowledgement = ack-nhfb,
  articleno    = {9},
  keywords     = {multicore processor; power density; temperature;
                 thermal management; thread migration},
}

@article{Chen:2007:CRL,
  author       = {Yu Chen and Fuxin Zhang},
  title        = {Code reordering on limited branch offset},
  journal      = j-TACO,
  volume       = {4},
  number       = {2},
  pages        = {10:1--10:??},
  month        = jun,
  year         = {2007},
  coden        = {????},
  doi          = {http://doi.acm.org/10.1145/1250727.1250730},
  issn         = {1544-3566},
  bibdate      = {Mon Jun 16 11:40:54 MDT 2008},
  bibsource    = {http://portal.acm.org/},
  abstract     = {Since the 1980's code reordering has gained popularity
                 as an important way to improve the spatial locality of
                 programs. While the effect of the processor's
                 microarchitecture and memory hierarchy on this
                 optimization technique has been investigated, little
                 research has focused on the impact of the instruction
                 set. In this paper, we analyze the effect of limited
                 branch offset of the MIPS-like instruction set [Hwu et
                 al. 2004, 2005] on code reordering, explore two simple
                 methods to handle the exceeded branches, and propose
                 the bidirectional code layout (BCL) algorithm to reduce
                 the number of branches exceeding the offset limit. The
                 BCL algorithm sorts the chains according to the
                 position of related chains, avoids cache conflict
                 misses deliberately and lays out the code
                 bidirectionally. It strikes a balance among the
                 distance of related blocks, the instruction cache miss
                 rate, the memory size required, and the control flow
                 transfer. Experimental results show that BCL can
                 effectively reduce exceeded branches by 50.1\%, on
                 average, with up to 100\% for some programs. Except for
                 some programs with little spatial locality, the BCL
                 algorithm can achieve the performance, as the case with
                 no branch offset limitation.},
  acknowledgement = ack-nhfb,
  articleno    = {10},
  keywords     = {code reordering; Godson Processor; link-time
                 optimization},
}

@article{Terechko:2007:ICC,
  author       = {A. S. Terechko and H. Corporaal},
  title        = {Inter-cluster communication in {VLIW} architectures},
  journal      = j-TACO,
  volume       = {4},
  number       = {2},
  pages        = {11:1--11:??},
  month        = jun,
  year         = {2007},
  coden        = {????},
  doi          = {http://doi.acm.org/10.1145/1250727.1250731},
  issn         = {1544-3566},
  bibdate      = {Mon Jun 16 11:40:54 MDT 2008},
  bibsource    = {http://portal.acm.org/},
  abstract     = {The traditional VLIW (very long instruction word)
                 architecture with a single register file does not scale
                 up well to address growing performance demands on
                 embedded media processors. However, splitting a VLIW
                 processor in smaller clusters, which are comprised of
                 function units fully connected to local register files,
                 can significantly improve VLSI implementation
                 characteristics of the processor, such as speed, energy
                 consumption, and area. In our paper we reveal that
                 achieving the best characteristics of a clustered VLIW
                 requires a thorough selection of an Inter-cluster
                 Communication (ICC) model, which is the way clustering
                 is exposed in the Instruction Set Architecture. For our
                 study we, first, define a taxonomy of ICC models
                 including copy operations, dedicated issue slots,
                 extended operands, extended results, and multicast.
                 Evaluation of the execution time of the models requires
                 both the dynamic cycle count and clock period. We
                 developed an advanced instruction scheduler for all the
                 five ICC models in order to quantify the dynamic cycle
                 counts of our multimedia C benchmarks. To assess the
                 clock period of the ICC models we designed and laid out
                 VLIW datapaths using the RTL hardware descriptions
                 derived from a deeply pipelined commercial TriMedia
                 processor. In contrast to prior art, our research shows
                 that fully distributed register file architectures
                 (with eight clusters in our study) often underperform
                 compared to moderately clustered machines with two or
                 four clusters because of explosion of the cycle count
                 overhead in the former. Among the evaluated ICC models,
                 performance of the copy operation model, popular both
                 in academia and industry, is severely limited by the
                 copy operations hampering scheduling of regular
                 operations in high ILP (instruction-level parallelism)
                 code. The dedicated issue slots model combats this
                 limitation by dedicating extra VLIW issue slots purely
                 for ICC, reaching the highest 1.74 execution time
                 speedup relative to the unicluster. Furthermore, our
                 VLSI experiments show that the lowest area and energy
                 consumption of 42 and 57\% relative to the unicluster,
                 respectively, are achieved by the extended operands
                 model, which, nevertheless, provides higher performance
                 than the copy operation model.},
  acknowledgement = ack-nhfb,
  articleno    = {11},
  keywords     = {clock frequency; cluster assignment; instruction-level
                 parallelism; instruction scheduler; intercluster
                 communication; optimizing compiler; pipelining;
                 register allocation; VLIW},
}

@article{Dou:2007:CCM,
  author       = {Jialin Dou and Marcelo Cintra},
  title        = {A compiler cost model for speculative
                 parallelization},
  journal      = j-TACO,
  volume       = {4},
  number       = {2},
  pages        = {12:1--12:??},
  month        = jun,
  year         = {2007},
  coden        = {????},
  doi          = {http://doi.acm.org/10.1145/1250727.1250732},
  issn         = {1544-3566},
  bibdate      = {Mon Jun 16 11:40:54 MDT 2008},
  bibsource    = {http://portal.acm.org/},
  abstract     = {Speculative parallelization is a technique that allows
                 code sections that cannot be fully analyzed by the
                 compiler to be aggressively executed in parallel.
                 However, while speculative parallelization can
                 potentially deliver significant speedups, several
                 overheads associated with this technique can limit
                 these speedups in practice. This paper proposes a novel
                 compiler static cost model of speculative multithreaded
                 execution that can be used to predict the resulting
                 performance. This model attempts to predict the
                 expected speedups, or slowdowns, of the candidate
                 speculative sections based on the estimation of the
                 combined runtime effects of various overheads, and
                 taking into account the scheduling restrictions of most
                 speculative execution environments. The model is based
                 on estimating the likely execution duration of threads
                 and considers all the possible permutations of these
                 threads. This model also produces a quantitative
                 estimate of the speedup, which is different from prior
                 heuristics that only qualitatively estimate the
                 benefits of speculative multithreaded execution. In
                 previous work, a limited version of the framework was
                 evaluated on a number of loops from a collection of
                 SPEC benchmarks that suffer mainly from load imbalance
                 and thread dispatch and commit overheads. In this work,
                 an extended framework is also evaluated on loops that
                 may suffer from data-dependence violations.
                 Experimental results show that prediction accuracy is
                 lower when loops with violations are included.
                 Nevertheless, accuracy is still very high for a static
                 model: the framework can identify, on average, 45\% of
                 the loops that cause slowdowns and, on average, 96\% of
                 the loops that lead to speedups; it predicts the
                 speedups or slowdowns with an error of less than 20\%
                 for an average of 28\% of the loops across the
                 benchmarks and with an error of less than 50\% for an
                 average of 80\% of the loops. Overall, the framework
                 often outperforms, by as much as 25\%, a naive approach
                 that attempts to speculatively parallelize all the
                 loops considered, and is able to curb the large
                 slowdowns caused in many cases by this naive
                 approach.},
  acknowledgement = ack-nhfb,
  articleno    = {12},
  keywords     = {speculative multithreading; speculative
                 parallelization; thread-level speculation},
}

@article{Amme:2007:SBM,
  author       = {Wolfram Amme and Jeffery von Ronne and Michael Franz},
  title        = {{SSA}-based mobile code: {Implementation} and
                 empirical evaluation},
  journal      = j-TACO,
  volume       = {4},
  number       = {2},
  pages        = {13:1--13:??},
  month        = jun,
  year         = {2007},
  coden        = {????},
  doi          = {http://doi.acm.org/10.1145/1250727.1250733},
  issn         = {1544-3566},
  bibdate      = {Mon Jun 16 11:40:54 MDT 2008},
  bibsource    = {http://portal.acm.org/},
  abstract     = {Although one might expect transportation formats based
                 on static single-assignment form (SSA) to yield faster
                 just-in-time compilation times than those based on
                 stack-based virtual machines, this claim has not
                 previously been validated, in practice. We attempt to
                 quantify the effect of using an SSA-based mobile code
                 representation by integrating support for a verifiable
                 SSA-based IR into Jikes RVM. Performance results,
                 measured with various optimizations and on both the
                 IA32 and PowerPC, show improvements in both compilation
                 time and code quality.},
  acknowledgement = ack-nhfb,
  articleno    = {13},
  keywords     = {SafeTSA; static single-assignment form; virtual
                 machines},
}

%%% TACO volume 4, number 3 (September 2007), article 14.
@article{Li:2007:CCE,
  author = {Xiaodong Li and Ritu Gupta and Sarita V. Adve and
    Yuanyuan Zhou},
  title = {Cross-component energy management: {Joint} adaptation
    of processor and memory},
  journal = j-TACO,
  volume = {4},
  number = {3},
  pages = {14:1--14:??},
  month = sep,
  year = {2007},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1275937.1275938},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:41:20 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {Researchers have proposed the use of adaptation to
    reduce the energy consumption of different hardware
    components, such as the processor, memory, disk, and
    display for general-purpose applications. Previous
    algorithms to control these adaptations, however, have
    focused on a single component. This work takes the
    first step toward developing algorithms that can
    jointly control adaptations in multiple interacting
    components for general-purpose applications, with the
    goal of minimizing the total energy consumed within a
    specified performance loss. Specifically, we develop a
    joint-adaptation algorithm for processor and memory
    adaptations. We identify two properties that enable
    per-component algorithms to be easily used in a
    cross-component context---the algorithms' performance
    impact must be guaranteed and composable. We then
    modify a current processor and a memory algorithm to
    obey these properties. This allows the cross-component
    problem to be reduced to determine an appropriate
    (energy-optimal) allocation of the target performance
    loss (slack) between the two components. We develop
    such an optimal slack allocation algorithm that
    exploits the above properties. The result is an
    efficient cross-component adaptation framework that
    minimizes the total energy of the processor and memory
    without exceeding the target performance loss, while
    substantially leveraging current per-component
    algorithms. Our experiments show that joint processor
    and memory adaptation provides significantly more
    energy savings than adapting either component alone;
    intelligent slack distribution is specifically
    effective for highly compute- or memory-intensive
    applications; and the performance slowdown never
    exceeds the specification.},
  acknowledgement = ack-nhfb,
  articleno = {14},
  keywords = {adaptive systems; control algorithms; energy
    management; low-power design; memory; performance
    guarantee; processor},
}

%%% TACO volume 4, number 3 (September 2007), article 15.
@article{Gabor:2007:FES,
  author = {Ron Gabor and Shlomo Weiss and Avi Mendelson},
  title = {Fairness enforcement in switch on event
    multithreading},
  journal = j-TACO,
  volume = {4},
  number = {3},
  pages = {15:1--15:??},
  month = sep,
  year = {2007},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1275937.1275939},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:41:20 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {The need to reduce power and complexity will increase
    the interest in Switch On Event multithreading
    (coarse-grained multithreading). Switch On Event
    multithreading is a low-power and low-complexity
    mechanism to improve processor throughput by switching
    threads on execution stalls. Fairness may, however,
    become a problem in a multithreaded processor. Unless
    fairness is properly handled, some threads may starve
    while others consume all of the processor cycles.
    Heuristics that were devised in order to improve
    fairness in simultaneous multithreading are not
    applicable to Switch On Event multithreading. This
    paper defines the fairness metric using the ratio of
    the individual threads' speedups and shows how it can
    be enforced in Switch On Event multithreading. Fairness
    is controlled by forcing additional thread switch
    points. These switch points are determined dynamically
    by runtime estimation of the single threaded
    performance of each of the individual threads. We
    analyze the impact of the fairness enforcement
    mechanism on aggregate IPC and weighted speedup. We
    present simulation results of the performance of Switch
    On Event multithreading. Switch On Event multithreading
    achieves an average aggregate IPC increase of 26\% over
    single thread and 12\% weighted speedup when no
    fairness is enforced. In this case, a sixth of our runs
    resulted in poor fairness in which one thread ran
    extremely slowly (10 to 100 times slower than its
    single-thread performance), while the other thread's
    performance was hardly affected. By using the proposed
    mechanism, we can guarantee fairness at different
    levels of strictness and, in most cases, even improve
    the weighted speedup.},
  acknowledgement = ack-nhfb,
  articleno = {15},
  keywords = {coarse-grained multithreading; fairness;
    multithreading; performance; SOE; Switch on Event
    multithreading; throughput; weighted speedup},
}

%%% TACO volume 4, number 3 (September 2007), article 16.
@article{Andrade:2007:PAA,
  author = {Diego Andrade and Basilio B. Fraguela and Ram{\'o}n
    Doallo},
  title = {Precise automatable analytical modeling of the cache
    behavior of codes with indirections},
  journal = j-TACO,
  volume = {4},
  number = {3},
  pages = {16:1--16:??},
  month = sep,
  year = {2007},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1275937.1275940},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:41:20 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {The performance of memory hierarchies, in which caches
    play an essential role, is critical in nowadays
    general-purpose and embedded computing systems because
    of the growing memory bottleneck problem.
    Unfortunately, cache behavior is very unstable and
    difficult to predict. This is particularly true in the
    presence of irregular access patterns, which exhibit
    little locality. Such patterns are very common, for
    example, in applications in which pointers or
    compressed sparse matrices give place to indirections.
    Nevertheless, cache behavior in the presence of
    irregular access patterns has not been widely studied.
    In this paper we present an extension of a systematic
    analytical modeling technique based on PMEs
    (probabilistic miss equations), previously developed by
    the authors, that allows the automated analysis of the
    cache behavior for codes with irregular access patterns
    resulting from indirections. The model generates very
    accurate predictions despite the irregularities and has
    very low computing requirements, being the first model
    that gathers these desirable characteristics that can
    automatically analyze this kind of codes. These
    properties enable this model to help drive compiler
    optimizations, as we show with an example.},
  acknowledgement = ack-nhfb,
  articleno = {16},
  keywords = {analytical modeling; irregular access patterns; memory
    hierarchy; performance prediction},
}

%%% NOTE: the compound surname ``De Bosschere'' is brace-protected in the
%%% author field.  In First-von-Last form BibTeX only recognizes a
%%% lowercase word as a name particle, so an unbraced capitalized ``De''
%%% is parsed as a given name and the author is sorted, labelled, and
%%% abbreviated as ``Bosschere, K. D.''.
@Article{Venstermans:2007:JOH,
  author =       "Kris Venstermans and Lieven Eeckhout and
                 Koen {De Bosschere}",
  title =        "{Java} object header elimination for reduced memory
                 consumption in 64-bit virtual machines",
  journal =      j-TACO,
  volume =       "4",
  number =       "3",
  pages =        "17:1--17:??",
  month =        sep,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1275937.1275941",
  ISSN =         "1544-3566",
  bibdate =      "Mon Jun 16 11:41:20 MDT 2008",
  bibsource =    "http://portal.acm.org/",
  abstract =     "Memory performance is an important design issue for
                 contemporary computer systems given the huge
                 processor/memory speed gap. This paper proposes a
                 space-efficient Java object model for reducing the
                 memory consumption of 64-bit Java virtual machines. We
                 completely eliminate the object header through typed
                 virtual addressing (TVA) or implicit typing. TVA
                 encodes the object type in the object's virtual address
                 by allocating all objects of a given type in a
                 contiguous memory segment. This allows for removing the
                 type information as well as the status field from the
                 object header. Whenever type and status information is
                 needed, masking is applied to the object's virtual
                 address for obtaining an offset into type and status
                 information structures. Unlike previous work on
                 implicit typing, we apply TVA to a selected number of
                 frequently allocated object types, hence, the name
                 selective TVA (STVA); this limits the amount of memory
                 fragmentation. In addition to applying STVA, we also
                 compress the type information block (TIB) pointers for
                 all objects that do not fall under TVA. We implement
                 the space-efficient Java object model in the 64-bit
                 version of the Jikes RVM on an AIX IBM platform and
                 compare its performance against the traditionally used
                 Java object model using a multitude of Java benchmarks.
                 We conclude that the space-efficient Java object model
                 reduces memory consumption by on average 15\% (and up
                 to 45\% for some benchmarks). About one-half the
                 reduction comes from TIB pointer compression; the other
                 one-half comes from STVA. In terms of performance, the
                 space-efficient object model generally does not affect
                 performance; however, for some benchmarks we observe
                 statistically significant performance speedups, up to
                 20\%.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  keywords =     "64-bit implementation; implicit typing; Java object
                 model; typed virtual addressing; Virtual machine",
}

%%% TACO volume 4, number 3 (September 2007), article 18.
@article{Xiao:2007:VIS,
  author = {Shu Xiao and Edmund M.-K. Lai},
  title = {{VLIW} instruction scheduling for minimal power
    variation},
  journal = j-TACO,
  volume = {4},
  number = {3},
  pages = {18:1--18:??},
  month = sep,
  year = {2007},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1275937.1275942},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:41:20 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {The focus of this paper is on the minimization of the
    variation in power consumed by a VLIW processor during
    the execution of a target program through instruction
    scheduling. The problem is formulated as a
    mixed-integer program (MIP) and a problem-specific
    branch-and-bound algorithm has been developed to solve
    it more efficiently than generic MIP solvers.
    Simulation results based on the TMS320C6711 VLIW
    digital signal processor using benchmarks from
    Mediabench and Trimaran showed that over 40\% average
    reduction in power variation can be achieved without
    sacrificing execution speed of these benchmarks.
    Computational requirements and convergence rates of our
    algorithm are also analyzed.},
  acknowledgement = ack-nhfb,
  articleno = {18},
  keywords = {instruction scheduling; power variation reduction;
    VLIW processors},
}

%%% NOTE: \times is a math-mode-only macro, so the multiplication factor
%%% in the abstract is written as $5\times$.  A bare {\times} in text
%%% mode raises ``Missing $ inserted'' whenever a style typesets the
%%% abstract field.
@Article{Tallam:2007:UCF,
  author =       "Sriraman Tallam and Rajiv Gupta",
  title =        "Unified control flow and data dependence traces",
  journal =      j-TACO,
  volume =       "4",
  number =       "3",
  pages =        "19:1--19:??",
  month =        sep,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1275937.1275943",
  ISSN =         "1544-3566",
  bibdate =      "Mon Jun 16 11:41:20 MDT 2008",
  bibsource =    "http://portal.acm.org/",
  abstract =     "We describe the design, generation, and compression of
                 the extended whole program path (eWPP), representation
                 that not only captures the control flow history of a
                 program execution but also its data dependence history.
                 This representation is motivated by the observation
                 that, typically, a significant fraction of data
                 dependence history can be recovered from the control
                 flow trace. To capture the remainder of the data
                 dependence history, we introduce disambiguation checks
                 in the program whose control flow signatures capture
                 the results of the checks. The resulting extended
                 control flow trace enables the recovery of otherwise
                 irrecoverable data dependences. The code for the checks
                 is designed to minimize the increase in program
                 execution time and the extended control flow trace size
                 when compared to directly collecting control flow and
                 address traces. Our experiments show that compressed
                 eWPPs are only one-quarter of the size of combined
                 compressed control flow and address traces. However,
                 their collection incurs a $5\times$ increase in runtime
                 overhead relative to the overhead required for directly
                 collecting the control flow and address traces,
                 respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  keywords =     "address trace; control flow trace; dynamic data
                 dependence trace; profiling",
}

%%% TACO volume 4, number 4 (January 2008), article 1.
@article{Ipek:2008:EAD,
  author = {Engin Ipek and Sally A. McKee and Karan Singh and Rich
    Caruana and Bronis R. de Supinski and Martin Schulz},
  title = {Efficient architectural design space exploration via
    predictive modeling},
  journal = j-TACO,
  volume = {4},
  number = {4},
  pages = {1:1--1:??},
  month = jan,
  year = {2008},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1328195.1328196},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:41:35 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {Efficiently exploring exponential-size architectural
    design spaces with many interacting parameters remains
    an open problem: the sheer number of experiments
    required renders detailed simulation intractable. We
    attack this via an automated approach that builds
    accurate predictive models. We simulate sampled points,
    using results to teach our models the function
    describing relationships among design parameters. The
    models can be queried and are very fast, enabling
    efficient design tradeoff discovery. We validate our
    approach via two uniprocessor sensitivity studies,
    predicting IPC with only 1--2\% error. In an
    experimental study using the approach, training on 1\%
    of a 250-K-point CMP design space allows our models to
    predict performance with only 4--5\% error. Our
    predictive modeling combines well with techniques that
    reduce the time taken by each simulation experiment,
    achieving net time savings of three-four orders of
    magnitude.},
  acknowledgement = ack-nhfb,
  articleno = {1},
  keywords = {artificial neural networks; design space exploration;
    performance prediction; sensitivity studies},
}

%%% TACO volume 4, number 4 (January 2008), article 2.
@article{Shi:2008:VMS,
  author = {Yunhe Shi and Kevin Casey and M. Anton Ertl and David
    Gregg},
  title = {Virtual machine showdown: {Stack} versus registers},
  journal = j-TACO,
  volume = {4},
  number = {4},
  pages = {2:1--2:??},
  month = jan,
  year = {2008},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1328195.1328197},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:41:35 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {Virtual machines (VMs) enable the distribution of
    programs in an architecture-neutral format, which can
    easily be interpreted or compiled. A long-running
    question in the design of VMs is whether a stack
    architecture or register architecture can be
    implemented more efficiently with an interpreter. We
    extend existing work on comparing virtual stack and
    virtual register architectures in three ways. First,
    our translation from stack to register code and
    optimization are much more sophisticated. The result is
    that we eliminate an average of more than 46\% of
    executed VM instructions, with the bytecode size of the
    register machine being only 26\% larger than that of
    the corresponding stack one. Second, we present a fully
    functional virtual-register implementation of the Java
    virtual machine (JVM), which supports Intel, AMD64,
    PowerPC and Alpha processors. This register VM supports
    inline-threaded, direct-threaded, token-threaded, and
    switch dispatch. Third, we present experimental results
    on a range of additional optimizations such as register
    allocation and elimination of redundant heap loads. On
    the AMD64 architecture the register machine using
    switch dispatch achieves an average speedup of 1.48
    over the corresponding stack machine. Even using the
    more efficient inline-threaded dispatch, the register
    VM achieves a speedup of 1.15 over the equivalent
    stack-based VM.},
  acknowledgement = ack-nhfb,
  articleno = {2},
  keywords = {interpreter; register architecture; stack
    architecture; virtual machine},
}

%%% TACO volume 4, number 4 (January 2008), article 3.
@article{Yan:2008:EVR,
  author = {Jun Yan and Wei Zhang},
  title = {Exploiting virtual registers to reduce pressure on
    real registers},
  journal = j-TACO,
  volume = {4},
  number = {4},
  pages = {3:1--3:??},
  month = jan,
  year = {2008},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1328195.1328198},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:41:35 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {It is well known that a large fraction of variables
    are short-lived. This paper proposes a novel approach
    to exploiting this fact to reduce the register pressure
    for pipelined processors with data-forwarding network.
    The idea is that the compiler can allocate virtual
    registers (i.e., place holders to identify dependences
    among instructions) to short-lived variables, which do
    not need to be stored to physical storage locations. As
    a result, real registers (i.e., physically existed
    registers) can be reserved for long-lived variables for
    mitigating the register pressure and decreasing the
    register spills, leading to performance improvement. In
    this paper, we develop the architectural and compiler
    support for exploiting virtual registers for statically
    scheduled processors. Our experimental results show
    that virtual registers are very effective at reducing
    the register spills, which, in many cases, can achieve
    the performance close to the processor with twice
    number of real registers. Our results also indicate
    that, for some applications, using 24 virtual, in
    addition to 8 real registers, can attain even higher
    performance than that of 16 real without any virtual
    registers.},
  acknowledgement = ack-nhfb,
  articleno = {3},
  keywords = {data forwarding; register allocation; register file;
    short-lived variables; virtual register},
}

%%% TACO volume 4, number 4 (January 2008), article 4.
@article{Yu:2008:OCL,
  author = {Zoe C. H. Yu and Francis C. M. Lau and Cho-Li Wang},
  title = {Object co-location and memory reuse for {Java}
    programs},
  journal = j-TACO,
  volume = {4},
  number = {4},
  pages = {4:1--4:??},
  month = jan,
  year = {2008},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1328195.1328199},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:41:35 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {We introduce a new memory management system, STEMA,
    which can improve the execution time of Java programs.
    STEMA detects prolific types on-the-fly and co-locates
    their objects in a special memory space which supports
    reuse of memory. We argue and show that memory reuse
    and co-location of prolific objects can result in
    improved cache locality, reduced memory fragmentation,
    reduced GC time, and faster object allocation. We
    evaluate STEMA using 16 benchmarks. Experimental
    results show that STEMA performs 2.7\%, 4.0\%, and
    8.2\% on average better than MarkSweep, CopyMS, and
    SemiSpace.},
  acknowledgement = ack-nhfb,
  articleno = {4},
  keywords = {garbage collector; Java; memory allocator; memory
    reuse; mutator; object co-location},
}

%%% TACO volume 4, number 4 (January 2008), article 5.
@article{Zhang:2008:RCM,
  author = {Chuanjun Zhang},
  title = {Reducing cache misses through programmable decoders},
  journal = j-TACO,
  volume = {4},
  number = {4},
  pages = {5:1--5:??},
  month = jan,
  year = {2008},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1328195.1328200},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:41:35 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {Level-one caches normally reside on a processor's
    critical path, which determines clock frequency.
    Therefore, fast access to level-one cache is important.
    Direct-mapped caches exhibit faster access time, but
    poor hit rates, compared with same sized
    set-associative caches because of nonuniform accesses
    to the cache sets. The nonuniform accesses generate
    more cache misses in some sets, while other sets are
    underutilized. We propose to increase the decoder
    length and, hence, reduce the accesses to heavily used
    sets without dynamically detecting the cache set usage
    information. We increase the access to the
    underutilized cache sets by incorporating a replacement
    policy into the cache design using programmable
    decoders. On average, the proposed techniques achieve
    as low a miss rate as a traditional 4-way cache on all
    26 SPEC2K benchmarks for the instruction and data
    caches, respectively. This translates into an average
    IPC improvement of 21.5 and 42.4\% for SPEC2K integer
    and floating-point benchmarks, respectively. The
    B-Cache consumes 10.5\% more power per access, but
    exhibits a 12\% total memory access-related energy
    savings as a result of the miss rate reductions, and,
    hence, the reduction to applications' execution time.
    Compared with previous techniques that aim at reducing
    the miss rate of direct-mapped caches, our technique
    requires only one cycle to access all cache hits and
    has the same access time of a direct-mapped cache.},
  acknowledgement = ack-nhfb,
  articleno = {5},
  keywords = {cache; dynamic optimization; low power},
}

%%% TACO volume 4, number 4 (January 2008), article 6.
%%% The abstract holds two paragraphs separated by an explicit \par.
@article{Golander:2008:HMP,
  author = {Amit Golander and Shlomo Weiss},
  title = {Hiding the misprediction penalty of a
    resource-efficient high-performance processor},
  journal = j-TACO,
  volume = {4},
  number = {4},
  pages = {6:1--6:??},
  month = jan,
  year = {2008},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1328195.1328201},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:41:35 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {Misprediction is a major obstacle for increasing
    speculative out-of-order processors performance.
    Performance degradation depends on both the number of
    misprediction events and the recovery time associated
    with each one of them. In recent years a few checkpoint
    based microarchitectures have been proposed. In
    comparison with ROB-based processors, checkpoint
    processors are scalable and highly resource efficient.
    Unfortunately, in these proposals the misprediction
    recovery time is proportional to the instruction queue
    size.\par

    In this paper we analyze methods to reduce the
    misprediction recovery time. We propose a new register
    file management scheme and techniques to selectively
    flush the instruction queue and the load store queue,
    and to isolate deeply pipelined execution units. The
    result is a novel checkpoint processor with Constant
    misprediction RollBack time (CRB). We further present a
    streamlined, cost-efficient solution, which saves
    complexity at the price of slightly lower
    performance.},
  acknowledgement = ack-nhfb,
  articleno = {6},
  keywords = {checkpoints; misprediction; out-of-order execution;
    rollback; scalable architecture},
}

%%% TACO volume 5, number 1 (May 2008), article 1 (editorial; the
%%% entry carries no abstract or keywords).
@article{Calder:2008:E,
  author = {Brad Calder and Dean Tullsen},
  title = {Editorial},
  journal = j-TACO,
  volume = {5},
  number = {1},
  pages = {1:1--1:??},
  month = may,
  year = {2008},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1369396.1369397},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource = {http://portal.acm.org/},
  acknowledgement = ack-nhfb,
  articleno = {1},
}

%%% TACO volume 5, number 1 (May 2008), article 2.
@article{Mysore:2008:FIP,
  author = {Shashidhar Mysore and Banit Agrawal and Rodolfo Neuber
    and Timothy Sherwood and Nisheeth Shrivastava and
    Subhash Suri},
  title = {Formulating and implementing profiling over adaptive
    ranges},
  journal = j-TACO,
  volume = {5},
  number = {1},
  pages = {2:1--2:??},
  month = may,
  year = {2008},
  coden = {????},
  doi = {http://doi.acm.org/10.1145/1369396.1369398},
  issn = {1544-3566},
  bibdate = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource = {http://portal.acm.org/},
  abstract = {Modern computer systems are called on to deal with
    billions of events every second, whether they are
    executed instructions, accessed memory locations, or
    forwarded packets. This presents a serious challenge to
    those who seek to quantify, analyze, or optimize such
    systems, because important trends and behaviors may
    easily be lost in a sea of data. We present
    range-adaptive profiling (RAP) as a new and
    general-purpose profiling method capable of
    hierarchically efficiently classifying streams of data
    in hardware. Through the use of RAP, events in an input
    stream are dynamically classified into increasingly
    precise categories, based on the frequency with which
    they occur. The more important a class, or range of
    events, the more precisely it is quantified. Despite
    the dynamic nature of our technique, we build upon
    tight theoretic bounds covering both worst-case error,
    as well as the required memory. In the limit, it is
    known that error and the memory bounds can be
    independent of the stream size and grow only linearly
    with the level of precision desired. Significantly, we
    expose the critical constants in these algorithms and
    through careful engineering, algorithm redesign, and
    use of heuristics, we show how a high-performance
    profile system can be implemented for range-adaptive
    profiling. RAP can be used on various profiles, such as
    PCs, load values, and memory addresses, and has a broad
    range of uses, from hot-region profiling to quantifying
    cache miss value locality. We propose two methods of
    implementation of RAP, one in software and the other
    with specialized hardware, for which we also describe
    our prototype FPGA implementation. We show that with
    just 8KB of memory, range profiles can be gathered with
    an average accuracy of 98\%.},
  acknowledgement = ack-nhfb,
  articleno = {2},
  keywords = {profiling hardware; range adaptive; value locality},
}

%%% TACO volume 5, number 1 (May 2008), article 3.
@article{Zhai:2008:CHS,
  author          = {Antonia Zhai and J. Gregory Steffan and Christopher B.
                     Colohan and Todd C. Mowry},
  title           = {Compiler and hardware support for reducing the
                     synchronization of speculative threads},
  journal         = j-TACO,
  year            = {2008},
  month           = may,
  volume          = {5},
  number          = {1},
  pages           = {3:1--3:??},
  articleno       = {3},
  doi             = {http://doi.acm.org/10.1145/1369396.1369399},
  issn            = {1544-3566},
  coden           = {????},
  bibdate         = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource       = {http://portal.acm.org/},
  keywords        = {automatic parallelization; chip-multiprocessing;
                     instruction scheduling; thread-level speculation},
  abstract        = {Thread-level speculation (TLS) allows us to
                     automatically parallelize general-purpose programs by
                     supporting parallel execution of threads that might not
                     actually be independent. In this article, we focus on
                     one important limitation of program performance under
                     TLS, which stalls as a result of synchronizing and
                     forwarding scalar values between speculative threads
                     that would otherwise cause frequent data dependences
                     and, hence, failed speculation. Using SPECint
                     benchmarks that have been automatically transformed by
                     our compiler to exploit TLS, we present, evaluate in
                     detail, and compare both compiler and hardware
                     techniques for improving the communication of scalar
                     values. We find that through our dataflow algorithms
                     for three increasingly aggressive instruction
                     scheduling techniques, the compiler can drastically
                     reduce the critical forwarding path introduced by the
                     synchronization and forwarding of scalar values. We
                     also show that hardware techniques for reducing
                     synchronization can be complementary to compiler
                     scheduling, but that the additional performance
                     benefits are minimal and are generally not worth the
                     cost.},
  acknowledgement = ack-nhfb,
}

%%% TACO volume 5, number 1 (May 2008), article 4.
@article{Winter:2008:ATN,
  author          = {Jonathan A. Winter and David H. Albonesi},
  title           = {Addressing thermal nonuniformity in {SMT} workloads},
  journal         = j-TACO,
  year            = {2008},
  month           = may,
  volume          = {5},
  number          = {1},
  pages           = {4:1--4:??},
  articleno       = {4},
  doi             = {http://doi.acm.org/10.1145/1369396.1369400},
  issn            = {1544-3566},
  coden           = {????},
  bibdate         = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource       = {http://portal.acm.org/},
  keywords        = {adaptive microarchitectures; clustered
                     microarchitectures; dynamic thermal management; dynamic
                     voltage scaling; simultaneous multithreading},
  abstract        = {We explore DTM techniques within the context of
                     uniform and nonuniform SMT workloads. While DVS is
                     suitable for addressing workloads with uniformly high
                     temperatures, for nonuniform workloads, performance
                     loss occurs because of the slowdown of the cooler
                     thread. To address this, we propose and evaluate DTM
                     mechanisms that exploit the steering-based thread
                     management mechanisms inherent in a clustered SMT
                     architecture. We show that in contrast to DVS, which
                     operates globally, our techniques are more effective at
                     controlling temperature for nonuniform workloads.
                     Furthermore, we devise a DTM technique that combines
                     steering and DVS to achieve consistently good
                     performance across all workloads.},
  acknowledgement = ack-nhfb,
}

%%% TACO volume 5, number 1 (May 2008), article 5.
@article{Shahbahrami:2008:VES,
  author          = {Asadollah Shahbahrami and Ben Juurlink and Stamatis
                     Vassiliadis},
  title           = {Versatility of extended subwords and the matrix
                     register file},
  journal         = j-TACO,
  year            = {2008},
  month           = may,
  volume          = {5},
  number          = {1},
  pages           = {5:1--5:??},
  articleno       = {5},
  doi             = {http://doi.acm.org/10.1145/1369396.1369401},
  issn            = {1544-3566},
  coden           = {????},
  bibdate         = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource       = {http://portal.acm.org/},
  keywords        = {multimedia standards; SIMD architectures; SIMD
                     programming},
  abstract        = {Extended subwords and the matrix register file (MRF)
                     are two micro architectural techniques that address
                     some of the limitations of existing SIMD architectures.
                     Extended subwords are wider than the data stored in
                     memory. Specifically, for every byte of data stored in
                     memory, there are four extra bits in the media register
                     file. This avoids the need for data-type conversion
                     instructions. The MRF is a register file organization
                     that provides both conventional row-wise, as well as
                     column-wise, access to the register file. In other
                     words, it allows to view the register file as a matrix
                     in which corresponding subwords in different registers
                     corresponds to a column of the matrix. It was
                     introduced to accelerate matrix transposition which is
                     a very common operation in multimedia applications. In
                     this paper, we show that the MRF is very versatile,
                     since it can also be used for other permutations than
                     matrix transposition. Specifically, it is shown how it
                     can be used to provide efficient access to strided
                     data, as is needed in, e.g., color space conversion.
                     Furthermore, it is shown that special-purpose
                     instructions (SPIs), such as the sum-of-absolute
                     differences (SAD) instruction, have limited usefulness
                     when extended subwords and a few general SIMD
                     instructions that we propose are supported, for the
                     following reasons. First, when extended subwords are
                     supported, the SAD instruction provides only a
                     relatively small performance improvement. Second, the
                     SAD instruction processes 8-bit subwords only, which is
                     not sufficient for quarter-pixel resolution nor for
                     cost functions used in image and video retrieval.
                     Results obtained by extending the SimpleScalar toolset
                     show that the proposed techniques provide a speedup of
                     up to 3.00 over the MMX architecture. The results also
                     show that using, at most, 13 extra media registers
                     yields an additional performance improvement ranging
                     from 1.38 to 1.57.},
  acknowledgement = ack-nhfb,
}

%%% TACO volume 5, number 1 (May 2008), article 6.
@article{Guo:2008:EHC,
  author          = {Zhi Guo and Walid Najjar and Betul Buyukkurt},
  title           = {Efficient hardware code generation for {FPGAs}},
  journal         = j-TACO,
  year            = {2008},
  month           = may,
  volume          = {5},
  number          = {1},
  pages           = {6:1--6:??},
  articleno       = {6},
  doi             = {http://doi.acm.org/10.1145/1369396.1369402},
  issn            = {1544-3566},
  coden           = {????},
  bibdate         = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource       = {http://portal.acm.org/},
  keywords        = {data reuse; FPGA; high-level synthesis; reconfigurable
                     computing; VHDL},
  abstract        = {The wider acceptance of FPGAs as a computing device
                     requires a higher level of programming abstraction.
                     ROCCC is an optimizing C to HDL compiler. We describe
                     the code generation approach in ROCCC. The smart buffer
                     is a component that reuses input data between adjacent
                     iterations. It significantly improves the performance
                     of the circuit and simplifies loop control. The
                     ROCCC-generated datapath can execute one loop iteration
                     per clock cycle when there is no loop dependency or
                     there is only scalar recurrence variable dependency.
                     ROCCC's approach to supporting while-loops operating on
                     scalars makes the compiler able to move scalar
                     iterative computation into hardware.},
  acknowledgement = ack-nhfb,
}

%%% TACO volume 5, number 1 (May 2008), article 7.
@article{Kotzmann:2008:DJH,
  author          = {Thomas Kotzmann and Christian Wimmer and Hanspeter
                     M{\"o}ssenb{\"o}ck and Thomas Rodriguez and Kenneth
                     Russell and David Cox},
  title           = {Design of the {Java HotSpot\TM} client compiler for
                     {Java 6}},
  journal         = j-TACO,
  year            = {2008},
  month           = may,
  volume          = {5},
  number          = {1},
  pages           = {7:1--7:??},
  articleno       = {7},
  doi             = {http://doi.acm.org/10.1145/1369396.1370017},
  issn            = {1544-3566},
  coden           = {????},
  bibdate         = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource       = {http://portal.acm.org/},
  keywords        = {compiler; deoptimization; intermediate representation;
                     Java; just-in-time compilation; optimization; register
                     allocation},
  abstract        = {Version 6 of Sun Microsystems' Java HotSpot{\TM} VM
                     ships with a redesigned version of the client
                     just-in-time compiler that includes several research
                     results of the last years. The client compiler is at
                     the heart of the VM configuration used by default for
                     interactive desktop applications. For such
                     applications, low startup and pause times are more
                     important than peak performance. This paper outlines
                     the new architecture of the client compiler and shows
                     how it interacts with the VM. It presents the
                     intermediate representation that now uses static
                     single-assignment (SSA) form and the linear scan
                     algorithm for global register allocation. Efficient
                     support for exception handling and deoptimization
                     fulfills the demands that are imposed by the dynamic
                     features of the Java programming language. The
                     evaluation shows that the new client compiler generates
                     better code in less time. The popular SPECjvm98
                     benchmark suite is executed 45\% faster, while the
                     compilation speed is also up to 40\% better. This
                     indicates that a carefully selected set of global
                     optimizations can also be integrated in just-in-time
                     compilers that focus on compilation speed and not on
                     peak performance. In addition, the paper presents the
                     impact of several optimizations on execution and
                     compilation speed. As the source code is freely
                     available, the Java HotSpot{\TM} VM and the client
                     compiler are the ideal basis for experiments with new
                     feedback-directed optimizations in a production-level
                     Java just-in-time compiler. The paper outlines research
                     projects that add fast algorithms for escape analysis,
                     automatic object inlining, and array bounds check
                     elimination.},
  acknowledgement = ack-nhfb,
}

@Article{Rangan:2008:PSD,
  author =       "Ram Rangan and Neil Vachharajani and Guilherme Ottoni
                 and David I. August",
  title =        "Performance scalability of decoupled software
                 pipelining",
  journal =      j-TACO,
  volume =       "5",
  number =       "2",
  pages =        "8:1--8:??",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1400112.1400113",
  ISSN =         "1544-3566",
  bibdate =      "Thu Aug 28 13:25:00 MDT 2008",
  bibsource =    "http://portal.acm.org/",
  abstract =     "Any successful solution to using multicore processors
                 to scale general-purpose program performance will have
                 to contend with rising intercore communication costs
                 while exposing coarse-grained parallelism. Recently
                 proposed pipelined multithreading (PMT) techniques have
                 been demonstrated to have general-purpose applicability
                 and are also able to effectively tolerate inter-core
                 latencies through pipelined interthread communication.
                 T